Skip to content

Commit a292c13

Browse files
chris-b1 authored and jreback committed
ENH: parse categoricals in read_csv
Closes #10153

Author: Chris <[email protected]>

Closes #13406 from chris-b1/categorical-parse and squashes the following commits:

c78f39f [Chris] rebase fixup
75ed6ba [Chris] doc fixups
1f6093a [Chris] rebase
0f0dba6 [Chris] wip
da5c5b5 [Chris] flake8 fix
1254768 [Chris] doc fixups; addl tests
2490949 [Chris] fix hash table ordering, null categories
4e0722d [Chris] undo type inference add docs and asv
849a112 [Chris] fix some dtype checking
cfa0ce4 [Chris] clean up dtype checking, add function specialization
286d907 [Chris] ENH: parse categoricals in read_csv
1 parent cffe6f2 commit a292c13

File tree

6 files changed

+367
-77
lines changed

6 files changed

+367
-77
lines changed

asv_bench/benchmarks/parser_vb.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,27 @@ def teardown(self):
114114
os.remove('test.csv')
115115

116116

117+
class read_csv_categorical(object):
118+
goal_time = 0.2
119+
120+
def setup(self):
121+
N = 100000
122+
group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
123+
df = DataFrame({'a': np.random.choice(group1, N).astype('object'),
124+
'b': np.random.choice(group1, N).astype('object'),
125+
'c': np.random.choice(group1, N).astype('object')})
126+
df.to_csv('strings.csv', index=False)
127+
128+
def time_read_csv_categorical_post(self):
129+
read_csv('strings.csv').apply(pd.Categorical)
130+
131+
def time_read_csv_categorical_direct(self):
132+
read_csv('strings.csv', dtype='category')
133+
134+
def teardown(self):
135+
os.remove('strings.csv')
136+
137+
117138
class read_table_multiple_date(object):
118139
goal_time = 0.2
119140

doc/source/io.rst

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,43 @@ worth trying.
500500
data that was read in. It is important to note that the overall column will be
501501
marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes.
502502

503+
.. _io.categorical:
504+
505+
Specifying Categorical dtype
506+
''''''''''''''''''''''''''''
507+
508+
.. versionadded:: 0.19.0
509+
510+
``Categorical`` columns can be parsed directly by specifying ``dtype='category'``:
511+
512+
.. ipython:: python
513+
514+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
515+
516+
pd.read_csv(StringIO(data))
517+
pd.read_csv(StringIO(data)).dtypes
518+
pd.read_csv(StringIO(data), dtype='category').dtypes
519+
520+
Individual columns can be parsed as a ``Categorical`` using a dict specification:
521+
522+
.. ipython:: python
523+
524+
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
525+
526+
.. note::
527+
528+
The resulting categories will always be parsed as strings (object dtype).
529+
If the categories are numeric they can be converted using the
530+
:func:`to_numeric` function, or as appropriate, another converter
531+
such as :func:`to_datetime`.
532+
533+
.. ipython:: python
534+
535+
df = pd.read_csv(StringIO(data), dtype='category')
536+
df.dtypes
537+
df['col3']
538+
df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
539+
df['col3']
503540
504541
505542
Naming and Using Columns

doc/source/whatsnew/v0.19.0.txt

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Highlights include:
1212
- :func:`merge_asof` for asof-style time-series joining, see :ref:`here <whatsnew_0190.enhancements.asof_merge>`
1313
- ``.rolling()`` is now time-series aware, see :ref:`here <whatsnew_0190.enhancements.rolling_ts>`
1414
- pandas development api, see :ref:`here <whatsnew_0190.dev_api>`
15+
- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here <whatsnew_0190.enhancements.read_csv_categorical>`
1516

1617
.. contents:: What's new in v0.19.0
1718
:local:
@@ -195,6 +196,14 @@ default of the index) in a DataFrame.
195196
``read_csv`` has improved support for duplicate column names
196197
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
197198

199+
.. ipython:: python
200+
:suppress:
201+
202+
from pandas.compat import StringIO
203+
204+
.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support:
205+
206+
198207
:ref:`Duplicate column names <io.dupe_names>` are now supported in :func:`read_csv` whether
199208
they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`)
200209

@@ -222,6 +231,46 @@ New behaviour:
222231

223232
In [2]: pd.read_csv(StringIO(data), names=names)
224233

234+
235+
.. _whatsnew_0190.enhancements.read_csv_categorical:
236+
237+
:func:`read_csv` supports parsing ``Categorical`` directly
238+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
239+
240+
The :func:`read_csv` function now supports parsing a ``Categorical`` column when
241+
specified as a dtype (:issue:`10153`). Depending on the structure of the data,
242+
this can result in a faster parse time and lower memory usage compared to
243+
converting to ``Categorical`` after parsing. See the io :ref:`docs here <io.categorical>`.
244+
245+
.. ipython:: python
246+
247+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
248+
249+
pd.read_csv(StringIO(data))
250+
pd.read_csv(StringIO(data)).dtypes
251+
pd.read_csv(StringIO(data), dtype='category').dtypes
252+
253+
Individual columns can be parsed as a ``Categorical`` using a dict specification:
254+
255+
.. ipython:: python
256+
257+
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
258+
259+
.. note::
260+
261+
The resulting categories will always be parsed as strings (object dtype).
262+
If the categories are numeric they can be converted using the
263+
:func:`to_numeric` function, or as appropriate, another converter
264+
such as :func:`to_datetime`.
265+
266+
.. ipython:: python
267+
268+
df = pd.read_csv(StringIO(data), dtype='category')
269+
df.dtypes
270+
df['col3']
271+
df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
272+
df['col3']
273+
225274
.. _whatsnew_0190.enhancements.semi_month_offsets:
226275

227276
Semi-Month Offsets

pandas/io/tests/parser/c_parser_only.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@
1212

1313
import pandas as pd
1414
import pandas.util.testing as tm
15-
from pandas import DataFrame, Series, Index, MultiIndex
15+
from pandas import DataFrame, Series, Index, MultiIndex, Categorical
1616
from pandas import compat
1717
from pandas.compat import StringIO, range, lrange
18+
from pandas.types.dtypes import CategoricalDtype
1819

1920

2021
class CParserTests(object):
@@ -135,6 +136,11 @@ def test_passing_dtype(self):
135136
dtype={'A': 'timedelta64', 'B': 'float64'},
136137
index_col=0)
137138

139+
# valid but unsupported - fixed width unicode string
140+
self.assertRaises(TypeError, self.read_csv, path,
141+
dtype={'A': 'U8'},
142+
index_col=0)
143+
138144
# see gh-12048: empty frame
139145
actual = self.read_csv(StringIO('A,B'), dtype=str)
140146
expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
@@ -184,6 +190,92 @@ def test_pass_dtype(self):
184190
self.assertEqual(result['one'].dtype, 'u1')
185191
self.assertEqual(result['two'].dtype, 'object')
186192

193+
def test_categorical_dtype(self):
194+
# GH 10153
195+
data = """a,b,c
196+
1,a,3.4
197+
1,a,3.4
198+
2,b,4.5"""
199+
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
200+
'b': Categorical(['a', 'a', 'b']),
201+
'c': Categorical(['3.4', '3.4', '4.5'])})
202+
actual = self.read_csv(StringIO(data), dtype='category')
203+
tm.assert_frame_equal(actual, expected)
204+
205+
actual = self.read_csv(StringIO(data), dtype=CategoricalDtype())
206+
tm.assert_frame_equal(actual, expected)
207+
208+
actual = self.read_csv(StringIO(data), dtype={'a': 'category',
209+
'b': 'category',
210+
'c': CategoricalDtype()})
211+
tm.assert_frame_equal(actual, expected)
212+
213+
actual = self.read_csv(StringIO(data), dtype={'b': 'category'})
214+
expected = pd.DataFrame({'a': [1, 1, 2],
215+
'b': Categorical(['a', 'a', 'b']),
216+
'c': [3.4, 3.4, 4.5]})
217+
tm.assert_frame_equal(actual, expected)
218+
219+
actual = self.read_csv(StringIO(data), dtype={1: 'category'})
220+
tm.assert_frame_equal(actual, expected)
221+
222+
# unsorted
223+
data = """a,b,c
224+
1,b,3.4
225+
1,b,3.4
226+
2,a,4.5"""
227+
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
228+
'b': Categorical(['b', 'b', 'a']),
229+
'c': Categorical(['3.4', '3.4', '4.5'])})
230+
actual = self.read_csv(StringIO(data), dtype='category')
231+
tm.assert_frame_equal(actual, expected)
232+
233+
# missing
234+
data = """a,b,c
235+
1,b,3.4
236+
1,nan,3.4
237+
2,a,4.5"""
238+
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
239+
'b': Categorical(['b', np.nan, 'a']),
240+
'c': Categorical(['3.4', '3.4', '4.5'])})
241+
actual = self.read_csv(StringIO(data), dtype='category')
242+
tm.assert_frame_equal(actual, expected)
243+
244+
def test_categorical_dtype_encoding(self):
245+
# GH 10153
246+
pth = tm.get_data_path('unicode_series.csv')
247+
encoding = 'latin-1'
248+
expected = self.read_csv(pth, header=None, encoding=encoding)
249+
expected[1] = Categorical(expected[1])
250+
actual = self.read_csv(pth, header=None, encoding=encoding,
251+
dtype={1: 'category'})
252+
tm.assert_frame_equal(actual, expected)
253+
254+
pth = tm.get_data_path('utf16_ex.txt')
255+
encoding = 'utf-16'
256+
expected = self.read_table(pth, encoding=encoding)
257+
expected = expected.apply(Categorical)
258+
actual = self.read_table(pth, encoding=encoding, dtype='category')
259+
tm.assert_frame_equal(actual, expected)
260+
261+
def test_categorical_dtype_chunksize(self):
262+
# GH 10153
263+
data = """a,b
264+
1,a
265+
1,b
266+
1,b
267+
2,c"""
268+
expecteds = [pd.DataFrame({'a': [1, 1],
269+
'b': Categorical(['a', 'b'])}),
270+
pd.DataFrame({'a': [1, 2],
271+
'b': Categorical(['b', 'c'])},
272+
index=[2, 3])]
273+
actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
274+
chunksize=2)
275+
276+
for actual, expected in zip(actuals, expecteds):
277+
tm.assert_frame_equal(actual, expected)
278+
187279
def test_pass_dtype_as_recarray(self):
188280
if compat.is_platform_windows() and self.low_memory:
189281
raise nose.SkipTest(

0 commit comments

Comments
 (0)