Merge pull request #4232 from cpcloud/read-html-warnings

cpcloud · cpcloud · commit 382e85779173 · 2013-07-13T15:00:55.000-07:00
CLN/TST: clean up and raise on bs4 version and no tables
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -106,6 +106,8 @@ pandas 0.12
   - Added ``layout`` keyword to DataFrame.hist() for more customizable layout (:issue:`4050`)
   - Timestamp.min and Timestamp.max now represent valid Timestamp instances instead
     of the default datetime.min and datetime.max (respectively), thanks @SleepingPills
+  - ``read_html`` now raises when no tables are found, and also when
+    BeautifulSoup==4.2.0 is detected (:issue:`4214`)
 
 **API Changes**
 
diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
@@ -344,6 +344,9 @@ Other Enhancements
   - Timestamp.min and Timestamp.max now represent valid Timestamp instances instead
     of the default datetime.min and datetime.max (respectively), thanks @SleepingPills
 
+  - ``read_html`` now raises when no tables are found, and also when
+    BeautifulSoup==4.2.0 is detected (:issue:`4214`)
+
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -8,39 +8,34 @@
 import numbers
 import urllib2
 import urlparse
-import contextlib
 import collections
 
-
-try:
-    from importlib import import_module
-except ImportError:
-    import_module = __import__
+from distutils.version import LooseVersion
 
 import numpy as np
 
 from pandas import DataFrame, MultiIndex, isnull
-from pandas.io.common import _is_url
+from pandas.io.common import _is_url, urlopen
 
 
 try:
-    import_module('bs4')
+    import bs4
 except ImportError:
     _HAS_BS4 = False
 else:
     _HAS_BS4 = True
 
 
 try:
-    import_module('lxml')
+    import lxml
 except ImportError:
     _HAS_LXML = False
 else:
     _HAS_LXML = True
 
 
 try:
-    import_module('html5lib')
+    import html5lib
 except ImportError:
     _HAS_HTML5LIB = False
 else:
@@ -119,7 +114,7 @@ def _read(io):
     """
     if _is_url(io):
         try:
-            with contextlib.closing(urllib2.urlopen(io)) as url:
+            with urlopen(io) as url:
                 raw_text = url.read()
         except urllib2.URLError:
             raise ValueError('Invalid URL: "{0}"'.format(io))
@@ -131,7 +126,8 @@ def _read(io):
     elif isinstance(io, basestring):
         raw_text = io
     else:
-        raise ValueError("Cannot read object of type '{0}'".format(type(io)))
+        raise TypeError("Cannot read object of type "
+                        "'{0.__class__.__name__!r}'".format(io))
     return raw_text
 
 
@@ -414,6 +410,7 @@ def _parse_tables(self, doc, match, attrs):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
         if not tables:
+            # known sporadically working release
             raise AssertionError('No tables found')
 
         mts = [table.find(text=match) for table in tables]
@@ -429,7 +426,8 @@ def _parse_tables(self, doc, match, attrs):
     def _setup_build_doc(self):
         raw_text = _read(self.io)
         if not raw_text:
-            raise AssertionError('No text parsed from document')
+            raise AssertionError('No text parsed from document: '
+                                 '{0}'.format(self.io))
         return raw_text
 
     def _build_doc(self):
@@ -721,6 +719,14 @@ def _parser_dispatch(flavor):
             raise ImportError("html5lib not found please install it")
         if not _HAS_BS4:
             raise ImportError("bs4 not found please install it")
+        if bs4.__version__ == LooseVersion('4.2.0'):
+            raise AssertionError("You're using a version"
+                                 " of BeautifulSoup4 (4.2.0) that has been"
+                                 " known to cause problems on certain"
+                                 " operating systems such as Debian. "
+                                 "Please install a version of"
+                                 " BeautifulSoup4 != 4.2.0, both earlier"
+                                 " and later releases will work.")
     else:
         if not _HAS_LXML:
             raise ImportError("lxml not found please install it")
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
@@ -2,18 +2,24 @@
 import re
 from cStringIO import StringIO
 from unittest import TestCase
-from urllib2 import urlopen
-from contextlib import closing
 import warnings
+from distutils.version import LooseVersion
 
 import nose
+from nose.tools import assert_raises
 
 import numpy as np
 from numpy.random import rand
 from numpy.testing.decorators import slow
 
-from pandas.io.html import read_html, import_module
-from pandas.io.html import _remove_whitespace
+try:
+    from importlib import import_module
+except ImportError:
+    import_module = __import__
+
+from pandas.io.html import read_html
+from pandas.io.common import urlopen
+
 from pandas import DataFrame, MultiIndex, read_csv, Timestamp
 from pandas.util.testing import (assert_frame_equal, network,
                                  get_data_path)
@@ -60,14 +66,26 @@ def assert_framelist_equal(list1, list2, *args, **kwargs):
         assert not frame_i.empty, 'frames are both empty'
 
 
+def test_bs4_version_fails():
+    _skip_if_no('bs4')
+    import bs4
+    if bs4.__version__ == LooseVersion('4.2.0'):
+        assert_raises(AssertionError, read_html, os.path.join(DATA_PATH,
+                                                              "spam.html"),
+                      flavor='bs4')
+
+
 class TestReadHtmlBase(TestCase):
     def run_read_html(self, *args, **kwargs):
-        self.try_skip()
         kwargs['flavor'] = kwargs.get('flavor', self.flavor)
         return read_html(*args, **kwargs)
 
     def try_skip(self):
         _skip_if_none_of(('bs4', 'html5lib'))
+        import bs4
+        if (bs4.__version__ == LooseVersion('4.2.0') and
+            self.flavor != ['lxml']):
+            raise nose.SkipTest
 
     def setup_data(self):
         self.spam_data = os.path.join(DATA_PATH, 'spam.html')
@@ -77,6 +95,7 @@ def setup_flavor(self):
         self.flavor = 'bs4'
 
     def setUp(self):
+        self.try_skip()
         self.setup_data()
         self.setup_flavor()
 
@@ -347,6 +366,7 @@ def test_pythonxy_plugins_table(self):
 
     @slow
     def test_banklist_header(self):
+        from pandas.io.html import _remove_whitespace
         def try_remove_ws(x):
             try:
                 return _remove_whitespace(x)
@@ -438,10 +458,9 @@ def test_invalid_flavor():
 def get_elements_from_url(url, element='table', base_url="file://"):
     _skip_if_none_of(('bs4', 'html5lib'))
     url = "".join([base_url, url])
-    from bs4 import BeautifulSoup, SoupStrainer
-    strainer = SoupStrainer(element)
-    with closing(urlopen(url)) as f:
-        soup = BeautifulSoup(f, features='html5lib', parse_only=strainer)
+    from bs4 import BeautifulSoup
+    with urlopen(url) as f:
+        soup = BeautifulSoup(f, features='html5lib')
     return soup.find_all(element)