Skip to content

ENH: support decimal option in PythonParser #12933 #13189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions asv_bench/benchmarks/parser_vb.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def setup(self):
def time_read_csv_default_converter(self):
read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)


class read_csv_precise_converter(object):
goal_time = 0.2

Expand Down Expand Up @@ -109,4 +108,20 @@ def setup(self):
self.data = (self.data * 200)

def time_read_table_multiple_date_baseline(self):
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])


class read_csv_python_engine(object):
goal_time = 0.2

def setup(self):
self.data_decimal = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n '
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do these benchmarks exist for the c-engine as well (ideally we would use the same exact data so we can compare)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback, added one benchmark with the same data for the c-engine. Here are the results:

▶ asv continuous master 12933  -b parser_vb.read_csv_default 
· Creating environments
· Discovering benchmarks
·· Uninstalling from conda-py2.7-Cython-matplotlib-numexpr-numpy-openpyxl-pytables-scipy-sqlalchemy-xlrd-xlsxwriter-xlwt.
·· Installing into conda-py2.7-Cython-matplotlib-numexpr-numpy-openpyxl-pytables-scipy-sqlalchemy-xlrd-xlsxwriter-xlwt.
· Running 8 total benchmarks (2 commits * 1 environments * 4 benchmarks)
[  0.00%] · For pandas commit hash 465272e1:
[  0.00%] ·· Building for conda-py2.7-Cython-matplotlib-numexpr-numpy-openpyxl-pytables-scipy-sqlalchemy-xlrd-xlsxwriter-xlwt...............................................
[  0.00%] ·· Benchmarking conda-py2.7-Cython-matplotlib-numexpr-numpy-openpyxl-pytables-scipy-sqlalchemy-xlrd-xlsxwriter-xlwt
[ 12.50%] ··· Running parser_vb.read_csv_default_converter.time_read_csv_default_converter                                                                                                     1.99ms
[ 25.00%] ··· Running parser_vb.read_csv_default_converter_python_engine.time_read_csv_default_converter                                                                                       2.87ms
[ 37.50%] ··· Running parser_vb.read_csv_default_converter_with_decimal.time_read_csv_default_converter_with_decimal                                                                           2.00ms
[ 50.00%] ··· Running parser_vb.read_csv_default_converter_with_decimal_python_engine.time_read_csv_default_converter_with_decimal                                                             9.24ms
[ 50.00%] · For pandas commit hash 86f68e6a:
[ 50.00%] ·· Building for conda-py2.7-Cython-matplotlib-numexpr-numpy-openpyxl-pytables-scipy-sqlalchemy-xlrd-xlsxwriter-xlwt...
[ 50.00%] ·· Benchmarking conda-py2.7-Cython-matplotlib-numexpr-numpy-openpyxl-pytables-scipy-sqlalchemy-xlrd-xlsxwriter-xlwt
[ 62.50%] ··· Running parser_vb.read_csv_default_converter.time_read_csv_default_converter                                                                                                     1.97ms
[ 75.00%] ··· Running parser_vb.read_csv_default_converter_python_engine.time_read_csv_default_converter                                                                                       2.87ms
[ 87.50%] ··· Running parser_vb.read_csv_default_converter_with_decimal.time_read_csv_default_converter_with_decimal                                                                           1.99ms
[100.00%] ··· Running parser_vb.read_csv_default_converter_with_decimal_python_engine.time_read_csv_default_converter_with_decimal                                                             failed
SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.

self.data_decimal = (self.data_decimal * 200)
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
self.data = (self.data * 200)

def time_read_csv_default_converter_with_decimal(self):
    # Python-engine benchmark: ';'-separated fields with ',' as the
    # decimal marker, read from the in-memory ``self.data_decimal`` text.
    buf = StringIO(self.data_decimal)
    read_csv(
        buf,
        sep=';',
        header=None,
        float_precision=None,
        decimal=',',
        engine='python'
    )

def time_read_csv_default_converter(self):
    # Python-engine benchmark on the ','-separated ``self.data`` text,
    # leaving the decimal marker at its default.
    buf = StringIO(self.data)
    read_csv(
        buf,
        sep=',',
        header=None,
        float_precision=None,
        engine='python'
    )
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Other enhancements
idx = pd.Index(["a1a2", "b1", "c1"])
idx.str.extractall("[ab](?P<digit>\d)")

- ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

double backticks around decimal and engine='python'


.. _whatsnew_0182.api:

API changes
Expand Down
37 changes: 30 additions & 7 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
'keep_default_na': True,
'thousands': None,
'comment': None,
'decimal': b'.',

# 'engine': 'c',
'parse_dates': False,
Expand Down Expand Up @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'decimal': b'.',
'float_precision': None
}

Expand All @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines',
'warn_bad_lines',
'dtype',
'decimal',
'float_precision',
])

Expand Down Expand Up @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
self.converters = kwds['converters']

self.thousands = kwds['thousands']
self.decimal = kwds['decimal']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check and, if needed, update io.rst and the docstring. IIRC we note in an option's description when a particular engine does not support it (so that note can now be removed for ``decimal``).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, might not be the case, but pls check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I can't find any reference in io.rst to a particular engine when decimal option is used

self.comment = kwds['comment']
self._comment_lines = []

Expand Down Expand Up @@ -1639,6 +1639,15 @@ def __init__(self, f, **kwds):
else:
self._no_thousands_columns = None

if len(self.decimal) != 1:
raise ValueError('Only length-1 decimal markers supported')

if self.thousands is None:
self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
else:
self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
self.decimal))

def _set_no_thousands_columns(self):
# Create a set of column ids that are not to be stripped of thousands
# operators.
Expand Down Expand Up @@ -2050,22 +2059,35 @@ def _check_empty(self, lines):
def _check_thousands(self, lines):
if self.thousands is None:
return lines
nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)

return self._search_replace_num_columns(lines=lines,
search=self.thousands,
replace='')

def _search_replace_num_columns(self, lines, search, replace):
    """
    Replace ``search`` with ``replace`` in fields that look numeric.

    A field is copied through unchanged when any of the following holds:

    * it is not a string (``compat.string_types``),
    * it does not contain ``search``,
    * its column index is in ``self._no_thousands_columns``,
    * ``self.nonnum`` matches somewhere in the stripped field.

    Parameters
    ----------
    lines : list of lists
        Rows of parsed fields.
    search : str
        Substring to look for in each candidate field.
    replace : str
        Text substituted for every occurrence of ``search``.

    Returns
    -------
    list of lists
        New rows with the same layout as ``lines``; the input rows are not
        modified.
    """
    ret = []
    for row in lines:
        new_row = []
        for col, field in enumerate(row):
            # Use the ``search``/``replace`` arguments and the shared
            # ``self.nonnum`` pattern rather than a hard-coded
            # ``self.thousands`` and a local regex, so the same routine
            # serves every caller.
            if (not isinstance(field, compat.string_types) or
                    search not in field or
                    (self._no_thousands_columns and
                     col in self._no_thousands_columns) or
                    self.nonnum.search(field.strip())):
                new_row.append(field)
            else:
                new_row.append(field.replace(search, replace))
        ret.append(new_row)
    return ret

def _check_decimal(self, lines):
    """
    Rewrite a custom decimal marker as '.' in the parsed rows.

    Rows are returned as-is when ``self.decimal`` equals the ``'decimal'``
    entry of ``_parser_defaults``; otherwise the substitution is delegated
    to ``self._search_replace_num_columns``.
    """
    marker = self.decimal
    if marker != _parser_defaults['decimal']:
        return self._search_replace_num_columns(lines=lines,
                                                search=marker,
                                                replace='.')
    return lines

def _clear_buffer(self):
self.buf = []

Expand Down Expand Up @@ -2249,7 +2271,8 @@ def _get_lines(self, rows=None):
lines = self._check_comments(lines)
if self.skip_blank_lines:
lines = self._check_empty(lines)
return self._check_thousands(lines)
lines = self._check_thousands(lines)
return self._check_decimal(lines)


def _make_date_converter(date_parser=None, dayfirst=False,
Expand Down
45 changes: 0 additions & 45 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
self.assertEqual(result['B'][2], '')

def test_euro_decimal_format(self):
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
self.assertEqual(df2['Number1'].dtype, float)
self.assertEqual(df2['Number2'].dtype, float)
self.assertEqual(df2['Number3'].dtype, float)

def test_custom_lineterminator(self):
data = 'a,b,c~1,2,3~4,5,6'

Expand Down Expand Up @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
data = "\n\n\n"
self.assertRaises(ValueError, self.read_csv, StringIO(data))

def test_1000_sep_with_decimal(self):
data = """A|B|C
1|2,334.01|5
10|13|10.
"""
expected = DataFrame({
'A': [1, 10],
'B': [2334.01, 13],
'C': [5, 10.]
})

tm.assert_equal(expected.A.dtype, 'int64')
tm.assert_equal(expected.B.dtype, 'float')
tm.assert_equal(expected.C.dtype, 'float')

df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
tm.assert_frame_equal(df, expected)

df = self.read_table(StringIO(data), sep='|',
thousands=',', decimal='.')
tm.assert_frame_equal(df, expected)

data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
df = self.read_csv(StringIO(data_with_odd_sep),
sep='|', thousands='.', decimal=',')
tm.assert_frame_equal(df, expected)

df = self.read_table(StringIO(data_with_odd_sep),
sep='|', thousands='.', decimal=',')
tm.assert_frame_equal(df, expected)

def test_grow_boundary_at_cap(self):
# See gh-12494
#
Expand Down
53 changes: 49 additions & 4 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ def test_empty_decimal_marker(self):
1|2,334|5
10|13|10.
"""
# C parser: supports only length-1 decimals
# Python parser: 'decimal' not supported yet
self.assertRaises(ValueError, self.read_csv,
StringIO(data), decimal='')
# Parsers support only length-1 decimals
msg = 'Only length-1 decimal markers supported'
with tm.assertRaisesRegexp(ValueError, msg):
self.read_csv(StringIO(data), decimal='')

def test_read_csv(self):
if not compat.PY3:
Expand Down Expand Up @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
result = self.read_table(f, squeeze=True, header=None)
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
tm.assert_series_equal(result, expected)

def test_1000_sep_with_decimal(self):
    # ',' as thousands separator and '.' as decimal marker
    data = """A|B|C
1|2,334.01|5
10|13|10.
"""
    # the same values with the two markers swapped
    data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # check the reference frame's dtypes before comparing against it
    for column, dtype in (('A', 'int64'), ('B', 'float'), ('C', 'float')):
        tm.assert_equal(expected[column].dtype, dtype)

    cases = (
        (data, ',', '.'),
        (data_with_odd_sep, '.', ','),
    )
    for text, thousands, decimal in cases:
        for reader in (self.read_csv, self.read_table):
            df = reader(StringIO(text), sep='|',
                        thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(df, expected)

def test_euro_decimal_format(self):
    # ';'-separated sample using ',' as the decimal marker
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    parsed = self.read_csv(StringIO(data), sep=';', decimal=',')

    # every numeric column must come back as float
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(parsed[column].dtype, float)