diff --git a/doc/source/release.rst b/doc/source/release.rst index 45335fa49aa23..f5d29c11e0589 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -106,6 +106,8 @@ pandas 0.12 - Added ``layout`` keyword to DataFrame.hist() for more customizable layout (:issue:`4050`) - Timestamp.min and Timestamp.max now represent valid Timestamp instances instead of the default datetime.min and datetime.max (respectively), thanks @SleepingPills + - ``read_html`` now raises ``AssertionError`` when the ``bs4`` flavor is + requested and BeautifulSoup4 4.2.0 is installed (:issue:`4214`) **API Changes** diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt index e1484c82ff165..f735803d9af33 100644 --- a/doc/source/v0.12.0.txt +++ b/doc/source/v0.12.0.txt @@ -344,6 +344,9 @@ Other Enhancements - Timestamp.min and Timestamp.max now represent valid Timestamp instances instead of the default datetime.min and datetime.max (respectively), thanks @SleepingPills + - ``read_html`` now raises ``AssertionError`` when the ``bs4`` flavor is + requested and BeautifulSoup4 4.2.0 is installed (:issue:`4214`) + Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/io/html.py b/pandas/io/html.py index 08a9403cd18a7..64fba1cadc6c2 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,23 +8,18 @@ import numbers import urllib2 import urlparse -import contextlib import collections - -try: - from importlib import import_module -except ImportError: - import_module = __import__ +from distutils.version import LooseVersion import numpy as np from pandas import DataFrame, MultiIndex, isnull -from pandas.io.common import _is_url +from pandas.io.common import _is_url, urlopen try: - import_module('bs4') + import bs4 except ImportError: _HAS_BS4 = False else: @@ -32,7 +27,7 @@ try: - import_module('lxml') + import lxml except ImportError: _HAS_LXML = False else: @@ -40,7 +35,7 @@ try: - import_module('html5lib') + import html5lib except ImportError: _HAS_HTML5LIB = False else: @@ -119,7 +114,7 @@ def _read(io): """ if _is_url(io): try: - 
with contextlib.closing(urllib2.urlopen(io)) as url: + with urlopen(io) as url: raw_text = url.read() except urllib2.URLError: raise ValueError('Invalid URL: "{0}"'.format(io)) @@ -131,7 +126,8 @@ def _read(io): elif isinstance(io, basestring): raw_text = io else: - raise ValueError("Cannot read object of type '{0}'".format(type(io))) + raise TypeError("Cannot read object of type " + "'{0.__class__.__name__!r}'".format(io)) return raw_text @@ -414,6 +410,7 @@ def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) if not tables: + # known sporadically working release raise AssertionError('No tables found') mts = [table.find(text=match) for table in tables] @@ -429,7 +426,8 @@ def _parse_tables(self, doc, match, attrs): def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: - raise AssertionError('No text parsed from document') + raise AssertionError('No text parsed from document: ' + '{0}'.format(self.io)) return raw_text def _build_doc(self): @@ -721,6 +719,14 @@ def _parser_dispatch(flavor): raise ImportError("html5lib not found please install it") if not _HAS_BS4: raise ImportError("bs4 not found please install it") + if bs4.__version__ == LooseVersion('4.2.0'): + raise AssertionError("You're using a version" + " of BeautifulSoup4 (4.2.0) that has been" + " known to cause problems on certain" + " operating systems such as Debian. 
" + "Please install a version of" + " BeautifulSoup4 != 4.2.0, both earlier" + " and later releases will work.") else: if not _HAS_LXML: raise ImportError("lxml not found please install it") diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index eaf06730a84c3..d7c46ea898b33 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -2,18 +2,24 @@ import re from cStringIO import StringIO from unittest import TestCase -from urllib2 import urlopen -from contextlib import closing import warnings +from distutils.version import LooseVersion import nose +from nose.tools import assert_raises import numpy as np from numpy.random import rand from numpy.testing.decorators import slow -from pandas.io.html import read_html, import_module -from pandas.io.html import _remove_whitespace +try: + from importlib import import_module +except ImportError: + import_module = __import__ + +from pandas.io.html import read_html +from pandas.io.common import urlopen + from pandas import DataFrame, MultiIndex, read_csv, Timestamp from pandas.util.testing import (assert_frame_equal, network, get_data_path) @@ -60,14 +66,26 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): assert not frame_i.empty, 'frames are both empty' +def test_bs4_version_fails(): + _skip_if_no('bs4') + import bs4 + if bs4.__version__ == LooseVersion('4.2.0'): + assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, + "spam.html"), + flavor='bs4') + + class TestReadHtmlBase(TestCase): def run_read_html(self, *args, **kwargs): - self.try_skip() kwargs['flavor'] = kwargs.get('flavor', self.flavor) return read_html(*args, **kwargs) def try_skip(self): _skip_if_none_of(('bs4', 'html5lib')) + import bs4 + if (bs4.__version__ == LooseVersion('4.2.0') and + self.flavor != ['lxml']): + raise nose.SkipTest def setup_data(self): self.spam_data = os.path.join(DATA_PATH, 'spam.html') @@ -77,6 +95,7 @@ def setup_flavor(self): self.flavor = 'bs4' def setUp(self): + 
self.try_skip() self.setup_data() self.setup_flavor() @@ -347,6 +366,7 @@ def test_pythonxy_plugins_table(self): @slow def test_banklist_header(self): + from pandas.io.html import _remove_whitespace def try_remove_ws(x): try: return _remove_whitespace(x) @@ -438,10 +458,9 @@ def test_invalid_flavor(): def get_elements_from_url(url, element='table', base_url="file://"): _skip_if_none_of(('bs4', 'html5lib')) url = "".join([base_url, url]) - from bs4 import BeautifulSoup, SoupStrainer - strainer = SoupStrainer(element) - with closing(urlopen(url)) as f: - soup = BeautifulSoup(f, features='html5lib', parse_only=strainer) + from bs4 import BeautifulSoup + with urlopen(url) as f: + soup = BeautifulSoup(f, features='html5lib') return soup.find_all(element)