Skip to content

Commit 382e857

Browse files
committed
Merge pull request #4232 from cpcloud/read-html-warnings
CLN/TST: clean up and raise on bs4 version and no tables
2 parents 70da8c3 + 357cde3 commit 382e857

File tree

4 files changed

+52
-22
lines changed

4 files changed

+52
-22
lines changed

doc/source/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ pandas 0.12
106106
- Added ``layout`` keyword to DataFrame.hist() for more customizable layout (:issue:`4050`)
107107
- Timestamp.min and Timestamp.max now represent valid Timestamp instances instead
108108
of the default datetime.min and datetime.max (respectively), thanks @SleepingPills
109+
- ``read_html`` now raises when no tables are found or when BeautifulSoup==4.2.0
110+
is detected (:issue:`4214`)
109111

110112
**API Changes**
111113

doc/source/v0.12.0.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,9 @@ Other Enhancements
344344
- Timestamp.min and Timestamp.max now represent valid Timestamp instances instead
345345
of the default datetime.min and datetime.max (respectively), thanks @SleepingPills
346346

347+
- ``read_html`` now raises when no tables are found or when BeautifulSoup==4.2.0
348+
is detected (:issue:`4214`)
349+
347350
Experimental Features
348351
~~~~~~~~~~~~~~~~~~~~~
349352

pandas/io/html.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,39 +8,34 @@
88
import numbers
99
import urllib2
1010
import urlparse
11-
import contextlib
1211
import collections
1312

14-
15-
try:
16-
from importlib import import_module
17-
except ImportError:
18-
import_module = __import__
13+
from distutils.version import LooseVersion
1914

2015
import numpy as np
2116

2217
from pandas import DataFrame, MultiIndex, isnull
23-
from pandas.io.common import _is_url
18+
from pandas.io.common import _is_url, urlopen
2419

2520

2621
try:
27-
import_module('bs4')
22+
import bs4
2823
except ImportError:
2924
_HAS_BS4 = False
3025
else:
3126
_HAS_BS4 = True
3227

3328

3429
try:
35-
import_module('lxml')
30+
import lxml
3631
except ImportError:
3732
_HAS_LXML = False
3833
else:
3934
_HAS_LXML = True
4035

4136

4237
try:
43-
import_module('html5lib')
38+
import html5lib
4439
except ImportError:
4540
_HAS_HTML5LIB = False
4641
else:
@@ -119,7 +114,7 @@ def _read(io):
119114
"""
120115
if _is_url(io):
121116
try:
122-
with contextlib.closing(urllib2.urlopen(io)) as url:
117+
with urlopen(io) as url:
123118
raw_text = url.read()
124119
except urllib2.URLError:
125120
raise ValueError('Invalid URL: "{0}"'.format(io))
@@ -131,7 +126,8 @@ def _read(io):
131126
elif isinstance(io, basestring):
132127
raw_text = io
133128
else:
134-
raise ValueError("Cannot read object of type '{0}'".format(type(io)))
129+
raise TypeError("Cannot read object of type "
130+
"'{0.__class__.__name__!r}'".format(io))
135131
return raw_text
136132

137133

@@ -414,6 +410,7 @@ def _parse_tables(self, doc, match, attrs):
414410
element_name = self._strainer.name
415411
tables = doc.find_all(element_name, attrs=attrs)
416412
if not tables:
413+
# known sporadically working release
417414
raise AssertionError('No tables found')
418415

419416
mts = [table.find(text=match) for table in tables]
@@ -429,7 +426,8 @@ def _parse_tables(self, doc, match, attrs):
429426
def _setup_build_doc(self):
430427
raw_text = _read(self.io)
431428
if not raw_text:
432-
raise AssertionError('No text parsed from document')
429+
raise AssertionError('No text parsed from document: '
430+
'{0}'.format(self.io))
433431
return raw_text
434432

435433
def _build_doc(self):
@@ -721,6 +719,14 @@ def _parser_dispatch(flavor):
721719
raise ImportError("html5lib not found please install it")
722720
if not _HAS_BS4:
723721
raise ImportError("bs4 not found please install it")
722+
if bs4.__version__ == LooseVersion('4.2.0'):
723+
raise AssertionError("You're using a version"
724+
" of BeautifulSoup4 (4.2.0) that has been"
725+
" known to cause problems on certain"
726+
" operating systems such as Debian. "
727+
"Please install a version of"
728+
" BeautifulSoup4 != 4.2.0, both earlier"
729+
" and later releases will work.")
724730
else:
725731
if not _HAS_LXML:
726732
raise ImportError("lxml not found please install it")

pandas/io/tests/test_html.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,24 @@
22
import re
33
from cStringIO import StringIO
44
from unittest import TestCase
5-
from urllib2 import urlopen
6-
from contextlib import closing
75
import warnings
6+
from distutils.version import LooseVersion
87

98
import nose
9+
from nose.tools import assert_raises
1010

1111
import numpy as np
1212
from numpy.random import rand
1313
from numpy.testing.decorators import slow
1414

15-
from pandas.io.html import read_html, import_module
16-
from pandas.io.html import _remove_whitespace
15+
try:
16+
from importlib import import_module
17+
except ImportError:
18+
import_module = __import__
19+
20+
from pandas.io.html import read_html
21+
from pandas.io.common import urlopen
22+
1723
from pandas import DataFrame, MultiIndex, read_csv, Timestamp
1824
from pandas.util.testing import (assert_frame_equal, network,
1925
get_data_path)
@@ -60,14 +66,26 @@ def assert_framelist_equal(list1, list2, *args, **kwargs):
6066
assert not frame_i.empty, 'frames are both empty'
6167

6268

69+
def test_bs4_version_fails():
70+
_skip_if_no('bs4')
71+
import bs4
72+
if bs4.__version__ == LooseVersion('4.2.0'):
73+
assert_raises(AssertionError, read_html, os.path.join(DATA_PATH,
74+
"spam.html"),
75+
flavor='bs4')
76+
77+
6378
class TestReadHtmlBase(TestCase):
6479
def run_read_html(self, *args, **kwargs):
65-
self.try_skip()
6680
kwargs['flavor'] = kwargs.get('flavor', self.flavor)
6781
return read_html(*args, **kwargs)
6882

6983
def try_skip(self):
7084
_skip_if_none_of(('bs4', 'html5lib'))
85+
import bs4
86+
if (bs4.__version__ == LooseVersion('4.2.0') and
87+
self.flavor != ['lxml']):
88+
raise nose.SkipTest
7189

7290
def setup_data(self):
7391
self.spam_data = os.path.join(DATA_PATH, 'spam.html')
@@ -77,6 +95,7 @@ def setup_flavor(self):
7795
self.flavor = 'bs4'
7896

7997
def setUp(self):
98+
self.try_skip()
8099
self.setup_data()
81100
self.setup_flavor()
82101

@@ -347,6 +366,7 @@ def test_pythonxy_plugins_table(self):
347366

348367
@slow
349368
def test_banklist_header(self):
369+
from pandas.io.html import _remove_whitespace
350370
def try_remove_ws(x):
351371
try:
352372
return _remove_whitespace(x)
@@ -438,10 +458,9 @@ def test_invalid_flavor():
438458
def get_elements_from_url(url, element='table', base_url="file://"):
439459
_skip_if_none_of(('bs4', 'html5lib'))
440460
url = "".join([base_url, url])
441-
from bs4 import BeautifulSoup, SoupStrainer
442-
strainer = SoupStrainer(element)
443-
with closing(urlopen(url)) as f:
444-
soup = BeautifulSoup(f, features='html5lib', parse_only=strainer)
461+
from bs4 import BeautifulSoup
462+
with urlopen(url) as f:
463+
soup = BeautifulSoup(f, features='html5lib')
445464
return soup.find_all(element)
446465

447466

0 commit comments

Comments
 (0)