From 24d1b6a39168eb2051ac828a3ed16fdc32c870d0 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 19 Dec 2018 07:04:12 +0000 Subject: [PATCH 1/5] ENH: Allow fixed-length strings in df.to_records() Adds parameter to allow string-like columns to be cast as fixed-length string-like dtypes for more efficient storage. Closes gh-18146. Originally authored by @qinghao1 but cleaned up by @gfyoung to fix merge conflicts. --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/frame.py | 73 +++++++++++++++++++++++++-- pandas/tests/frame/test_convert_to.py | 38 ++++++++++++++ 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b4331aab3085f..b3172eb2ca966 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -411,6 +411,7 @@ Other Enhancements - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`) - The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`) +- :meth:`DataFrame.to_records` now accepts a ``stringlike_as_fixed_length`` parameter to efficiently store string-likes as fixed-length string-like dtypes (e.g. ``S1``) instead of object dtype (``O``) (:issue:`18146`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method ` section in the documentation. 
(:issue:`8953`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 99ae551d3c55b..83cb2240c7f86 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,7 +35,7 @@ OrderedDict, PY36, raise_with_traceback, string_and_binary_types) from pandas.compat.numpy import function as nv - +from pandas.api.types import infer_dtype from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -1540,7 +1540,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls(mgr) - def to_records(self, index=True, convert_datetime64=None): + def to_records(self, index=True, convert_datetime64=None, + stringlike_as_fixed_length=False): """ Convert DataFrame to a NumPy record array. @@ -1557,6 +1558,11 @@ def to_records(self, index=True, convert_datetime64=None): Whether to convert the index to datetime.datetime if it is a DatetimeIndex. + stringlike_as_fixed_length : bool, default False + .. versionadded:: 0.24.0 + + Store string-likes as fixed-length string-like dtypes + (e.g. ``S1`` dtype) instead of Python objects (``O`` dtype). Returns ------- @@ -1598,6 +1604,27 @@ def to_records(self, index=True, convert_datetime64=None): >>> df.to_records(index=False) rec.array([(1, 0.5 ), (2, 0.75)], dtype=[('A', '>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']}, + ... index=['a', 'b']) + >>> df.to_records() + rec.array([('a', 1, 'abc'), ('b', 2, 'defg')], + dtype=[('index', 'O'), ('A', '>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']}, + ... index=['a', 'b']) + >>> df.to_records(stringlike_as_fixed_length=True) + rec.array([('a', 1, 'abc'), ('b', 2, 'defg')], + dtype=[('index', ' just take the dtype. + ([1, 2], lambda fixed, isPY2: " cast to object. + ([1, "1"], lambda fixed, isPY2: "O"), + + # String --> cast to string is PY2 else unicode in PY3. + (["1", "2"], lambda fixed, isPY2: ( + ("S" if isPY2 else "U") + "1") if fixed else "O"), + + # String + max-length of longest string. 
+ (["12", "2"], lambda fixed, isPY2: ( + ("S" if isPY2 else "U") + "2") if fixed else "O"), + + # Unicode --> cast to unicode for both PY2 and PY3. + ([u"\u2120b", u"456"], lambda fixed, isPY2: "U3" if fixed else "O"), + + # Bytes --> cast to string for both PY2 and PY3. + ([b"2", b"5"], lambda fixed, isPY2: "S1" if fixed else "O"), + ], ids=["int", "mixed", "str", "max-len", "unicode", "bytes"]) + def test_to_records_with_strings_as_fixed_length(self, fixed_length, + values, dtype_getter): + + # see gh-18146 + df = DataFrame({"values": values}, index=["a", "b"]) + result = df.to_records(stringlike_as_fixed_length=fixed_length) + + ind_dtype = ((("S" if compat.PY2 else "U") + "1") + if fixed_length else "O") + val_dtype = dtype_getter(fixed_length, compat.PY2) + + expected = np.rec.array([("a", values[0]), ("b", values[1])], + dtype=[("index", ind_dtype), + ("values", val_dtype)]) + tm.assert_almost_equal(result, expected) + @pytest.mark.parametrize('mapping', [ dict, collections.defaultdict(list), From fa5e1ea67fe09f632b29d97092e00ea6ae47eb3f Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 26 Dec 2018 11:07:52 +0000 Subject: [PATCH 2/5] Add dtype parameters instead of fix-string-like The original parameter was causing a lot of acrobatics with regards to string dtypes between 2.x and 3.x. The new parameters simplify the internal logic and pass the responsibility and motivation of memory efficiency back to the users. 
--- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/frame.py | 115 ++++++++++--------- pandas/tests/frame/test_convert_to.py | 155 ++++++++++++++++++++------ 3 files changed, 178 insertions(+), 94 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b3172eb2ca966..cca15f26cbf99 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -411,7 +411,7 @@ Other Enhancements - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`) - The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`) -- :meth:`DataFrame.to_records` now accepts a ``stringlike_as_fixed_length`` parameter to efficiently store string-likes as fixed-length string-like dtypes (e.g. ``S1``) instead of object dtype (``O``) (:issue:`18146`) +- :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to allow different data types in stored column and index records (:issue:`18146`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method ` section in the documentation. 
(:issue:`8953`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 83cb2240c7f86..fe8a1e3fed3aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,7 +35,6 @@ OrderedDict, PY36, raise_with_traceback, string_and_binary_types) from pandas.compat.numpy import function as nv -from pandas.api.types import infer_dtype from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -1541,7 +1540,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls(mgr) def to_records(self, index=True, convert_datetime64=None, - stringlike_as_fixed_length=False): + column_dtypes=None, index_dtypes=None): """ Convert DataFrame to a NumPy record array. @@ -1558,11 +1557,20 @@ def to_records(self, index=True, convert_datetime64=None, Whether to convert the index to datetime.datetime if it is a DatetimeIndex. - stringlike_as_fixed_length : bool, default False - .. versionadded:: 0.24.0 + column_dtypes : str, type, dict, default None + .. versionadded:: 0.24.0 + + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. + index_dtypes : str, type, dict, default None + .. versionadded:: 0.24.0 - Store string-likes as fixed-length string-like dtypes - (e.g. ``S1`` dtype) instead of Python objects (``O`` dtype). + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + + This mapping is applied only if `index=True`. Returns ------- @@ -1605,26 +1613,22 @@ def to_records(self, index=True, convert_datetime64=None, rec.array([(1, 0.5 ), (2, 0.75)], dtype=[('A', '>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']}, - ... 
index=['a', 'b']) - >>> df.to_records() - rec.array([('a', 1, 'abc'), ('b', 2, 'defg')], - dtype=[('index', 'O'), ('A', '>> df.to_records(column_dtypes={"A": "int32"}) + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('I', 'O'), ('A', '>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']}, - ... index=['a', 'b']) - >>> df.to_records(stringlike_as_fixed_length=True) - rec.array([('a', 1, 'abc'), ('b', 2, 'defg')], - dtype=[('index', '>> df.to_records(index_dtypes=">> index_dtypes = ">> df.to_records(index_dtypes=index_dtypes) + rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], + dtype=[('I', 'S1'), ('A', ' just take the dtype. - ([1, 2], lambda fixed, isPY2: " cast to object. - ([1, "1"], lambda fixed, isPY2: "O"), - - # String --> cast to string is PY2 else unicode in PY3. - (["1", "2"], lambda fixed, isPY2: ( - ("S" if isPY2 else "U") + "1") if fixed else "O"), - - # String + max-length of longest string. - (["12", "2"], lambda fixed, isPY2: ( - ("S" if isPY2 else "U") + "2") if fixed else "O"), - - # Unicode --> cast to unicode for both PY2 and PY3. - ([u"\u2120b", u"456"], lambda fixed, isPY2: "U3" if fixed else "O"), - - # Bytes --> cast to string for both PY2 and PY3. - ([b"2", b"5"], lambda fixed, isPY2: "S1" if fixed else "O"), - ], ids=["int", "mixed", "str", "max-len", "unicode", "bytes"]) - def test_to_records_with_strings_as_fixed_length(self, fixed_length, - values, dtype_getter): - + @pytest.mark.parametrize("kwargs,expected", [ + # No dtypes --> default to array dtypes. + (dict(), + np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[("index", " Date: Sat, 29 Dec 2018 02:00:11 +0000 Subject: [PATCH 3/5] MAINT: Use is_dict_like in to_records More generic than checking whether our mappings are instances of dict. Expands is_dict_like check to include whether it has a __contains__ method. 
--- pandas/core/dtypes/inference.py | 5 ++++- pandas/core/frame.py | 3 ++- pandas/tests/frame/test_convert_to.py | 28 +++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 241a1b471f677..b11542622451c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -398,8 +398,11 @@ def is_dict_like(obj): >>> is_dict_like([1, 2, 3]) False """ + for attr in ("__getitem__", "keys", "__contains__"): + if not hasattr(obj, attr): + return False - return hasattr(obj, '__getitem__') and hasattr(obj, 'keys') + return True def is_named_tuple(obj): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fe8a1e3fed3aa..dfa539159a25a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -48,6 +48,7 @@ maybe_upcast_putmask, find_common_type) from pandas.core.dtypes.common import ( + is_dict_like, is_object_dtype, is_extension_type, is_extension_array_dtype, @@ -1681,7 +1682,7 @@ def to_records(self, index=True, convert_datetime64=None, dtype_mapping = column_dtypes name = self.columns[index] - if isinstance(dtype_mapping, dict): + if is_dict_like(dtype_mapping): if name in dtype_mapping: dtype_mapping = dtype_mapping[name] elif index in dtype_mapping: diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 3f517c9d51c8e..b875559169205 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -314,6 +314,34 @@ def test_to_records_dtype_mi(self, df, kwargs, expected): result = df.to_records(**kwargs) tm.assert_almost_equal(result, expected) + def test_to_records_dict_like(self): + # see gh-18146 + class DictLike(object): + def __init__(self, **kwargs): + self.d = kwargs.copy() + + def __getitem__(self, key): + return self.d.__getitem__(key) + + def __contains__(self, key): + return key in self.d + + def keys(self): + return self.d.keys() + + df = 
DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) + + dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8, + "B": np.float32}), + index_dtypes=" Date: Sun, 30 Dec 2018 01:16:14 +0000 Subject: [PATCH 4/5] TST: Add test for is_dict_like expanded def --- pandas/tests/dtypes/test_inference.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0c22b595bc74d..9c5944ed585f3 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -178,6 +178,33 @@ def test_is_dict_like_fails(ll): assert not inference.is_dict_like(ll) +@pytest.mark.parametrize("has_keys", [True, False]) +@pytest.mark.parametrize("has_getitem", [True, False]) +@pytest.mark.parametrize("has_contains", [True, False]) +def test_is_dict_like_duct_type(has_keys, has_getitem, has_contains): + class DictLike(object): + def __init__(self, d): + self.d = d + + if has_keys: + def keys(self): + return self.d.keys() + + if has_getitem: + def __getitem__(self, key): + return self.d.__getitem__(key) + + if has_contains: + def __contains__(self, key): + return self.d.__contains__(key) + + d = DictLike({1: 2}) + result = inference.is_dict_like(d) + expected = has_keys and has_getitem and has_contains + + assert result is expected + + def test_is_file_like(mock): class MockFile(object): pass From ec69fe0b4a98b7e21463ec1ceb6a3eaeed9cc96f Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 30 Dec 2018 20:58:06 +0000 Subject: [PATCH 5/5] MAINT: Address final comments --- pandas/core/frame.py | 19 +++++++++++++++++++ pandas/tests/dtypes/test_inference.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dfa539159a25a..99653248216f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1674,6 +1674,15 @@ def to_records(self, index=True, convert_datetime64=None, for i, v in 
enumerate(arrays): index = i + # When the names and arrays are collected, we + # first collect those in the DataFrame's index, + # followed by those in its columns. + # + # Thus, the total length of the array is: + # len(index_names) + len(DataFrame.columns). + # + # This check allows us to see whether we are + # handling a name / array in the index or column. if index < index_len: dtype_mapping = index_dtypes name = index_names[index] @@ -1682,6 +1691,11 @@ def to_records(self, index=True, convert_datetime64=None, dtype_mapping = column_dtypes name = self.columns[index] + # We have a dictionary, so we get the data type + # associated with the index or column (which can + # be denoted by its name in the DataFrame or its + # position in DataFrame's array of indices or + # columns, whichever is applicable). if is_dict_like(dtype_mapping): if name in dtype_mapping: dtype_mapping = dtype_mapping[name] @@ -1690,6 +1704,11 @@ def to_records(self, index=True, convert_datetime64=None, else: dtype_mapping = None + # If no mapping can be found, use the array's + # dtype attribute for formatting. + # + # A valid dtype must be either a type or a + # string naming a type. if dtype_mapping is None: formats.append(v.dtype) elif isinstance(dtype_mapping, (type, compat.string_types)): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 9c5944ed585f3..d9b1b0db90562 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -181,7 +181,7 @@ def test_is_dict_like_fails(ll): @pytest.mark.parametrize("has_keys", [True, False]) @pytest.mark.parametrize("has_getitem", [True, False]) @pytest.mark.parametrize("has_contains", [True, False]) -def test_is_dict_like_duct_type(has_keys, has_getitem, has_contains): +def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains): class DictLike(object): def __init__(self, d): self.d = d