From 9fc0d8c50537f192c7af5f241727f3ac8ce9b462 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 14 Sep 2019 12:47:04 -0500 Subject: [PATCH 1/5] REF: Refactor sparse HDF5 read / write In preparation for the removal of SparseSeries and SparseDataFrame, we read into a Series[sparse] / DataFrame[sparse]. --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/io/pytables.py | 113 +++++----------------- pandas/tests/io/pytables/test_pytables.py | 32 ++---- 3 files changed, 34 insertions(+), 113 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c78e27f098f13..19f09e49f9026 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -69,7 +69,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) -- +- :func:`read_hdf` now reads sparse values into a :class:`Series` or :class:`DataFrame` with sparse values rather than a ``SparseDataFrame`` or ``SparseSeries`` (:issue:``) - .. _whatsnew_1000.deprecations: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4f3f639de5cb1..f6da988e110c0 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -28,6 +28,7 @@ is_datetime64tz_dtype, is_extension_type, is_list_like, + is_sparse, is_timedelta64_dtype, ) from pandas.core.dtypes.missing import array_equivalent @@ -40,8 +41,7 @@ MultiIndex, PeriodIndex, Series, - SparseDataFrame, - SparseSeries, + SparseArray, TimedeltaIndex, concat, isna, @@ -173,12 +173,7 @@ class DuplicateWarning(Warning): """ # map object types -_TYPE_MAP = { - Series: "series", - SparseSeries: "sparse_series", - DataFrame: "frame", - SparseDataFrame: "sparse_frame", -} +_TYPE_MAP = {Series: "series", DataFrame: "frame"} # storer class map _STORER_MAP = { @@ -186,9 +181,9 @@ class DuplicateWarning(Warning): "DataFrame": "LegacyFrameFixed", "DataMatrix": "LegacyFrameFixed", "series": "SeriesFixed", - "sparse_series": "SparseSeriesFixed", + "sparse_series": "SeriesFixed", "frame": "FrameFixed", - "sparse_frame": "SparseFrameFixed", + "sparse_frame": "FrameFixed", } # table class map @@ -2754,6 +2749,16 @@ def read_array(self, key, start=None, stop=None): elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") + if dtype == "Sparse": + if start or stop: + raise NotImplementedError( + "start and/or stop are not supported in fixed Sparse reading" + ) + sp_index = self.read_index("{}_sp_index".format(key)) + ret = SparseArray( + ret, sparse_index=sp_index, fill_value=self.attrs.fill_value + ) + if transposed: return ret.T else: @@ -3004,7 +3009,7 @@ def write_array(self, key, value, items=None): vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) vlarr.append(value) else: - if empty_array: + if empty_array and not is_sparse(value): self.write_array_empty(key, value) else: if is_datetime64_dtype(value.dtype): @@ -3021,6 +3026,15 @@ def write_array(self, key, value, items=None): elif is_timedelta64_dtype(value.dtype): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif is_sparse(value): + # TODO: think about EA API for this. + # value._write_hdf5(self) + self.write_index("{}_sp_index".format(key), value.sp_index) + self._handle.create_array(self.group, key, value.sp_values) + getattr(self.group, key)._v_attrs.value_type = "Sparse" + self.attrs.fill_value = value.fill_value + self.attrs.kind = value.kind + self.attributes.extend(["fill_value", "kind"]) else: self._handle.create_array(self.group, key, value) @@ -3078,83 +3092,6 @@ def write(self, obj, **kwargs): self.attrs.name = obj.name -class SparseFixed(GenericFixed): - def validate_read(self, kwargs): - """ - we don't support start, stop kwds in Sparse - """ - kwargs = super().validate_read(kwargs) - if "start" in kwargs or "stop" in kwargs: - raise NotImplementedError( - "start and/or stop are not supported in fixed Sparse reading" - ) - return kwargs - - -class SparseSeriesFixed(SparseFixed): - pandas_kind = "sparse_series" - attributes = ["name", "fill_value", "kind"] - - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index("index") - sp_values = self.read_array("sp_values") - sp_index = self.read_index("sp_index") - return SparseSeries( - sp_values, - index=index, - sparse_index=sp_index, - kind=self.kind or "block", - fill_value=self.fill_value, - name=self.name, - ) - - def write(self, obj, **kwargs): - super().write(obj, **kwargs) - self.write_index("index", obj.index) - self.write_index("sp_index", obj.sp_index) - self.write_array("sp_values", obj.sp_values) - self.attrs.name = obj.name - self.attrs.fill_value = obj.fill_value - self.attrs.kind = obj.kind - - -class SparseFrameFixed(SparseFixed): - pandas_kind = "sparse_frame" - attributes = ["default_kind", "default_fill_value"] - - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - columns = self.read_index("columns") - sdict = {} - for c in columns: - key = "sparse_series_{columns}".format(columns=c) - s = SparseSeriesFixed(self.parent, getattr(self.group, key)) - s.infer_axes() - sdict[c] = s.read() - return SparseDataFrame( - sdict, - columns=columns, - default_kind=self.default_kind, - default_fill_value=self.default_fill_value, - ) - - def write(self, obj, **kwargs): - """ write it as a collection of individual sparse series """ - super().write(obj, **kwargs) - for name, ss in obj.items(): - key = "sparse_series_{name}".format(name=name) - if key not in self.group._v_children: - node = self._handle.create_group(self.group, key) - else: - node = getattr(self.group, key) - s = SparseSeriesFixed(self.parent, node) - s.write(ss) - self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind - self.write_index("columns", obj.columns) - - class BlockManagerFixed(GenericFixed): attributes = ["ndim", "nblocks"] is_shape_reversed = False diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 77cac00882771..541ecbaa5b811 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -71,14 +71,6 @@ ignore_natural_naming_warning = pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" ) -ignore_sparse = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -ignore_dataframe_tosparse = pytest.mark.filterwarnings( - "ignore:DataFrame.to_sparse:FutureWarning" -) -ignore_series_tosparse = pytest.mark.filterwarnings( - "ignore:Series.to_sparse:FutureWarning" -) - # contextmanager to ensure the file cleanup @@ -2353,36 +2345,32 @@ def test_series(self): ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) - @ignore_sparse - @ignore_series_tosparse def test_sparse_series(self): s = tm.makeStringSeries() s.iloc[3:5] = np.nan - ss = s.to_sparse() + ss = s.astype("Sparse") self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - ss2 = s.to_sparse(kind="integer") + ss2 = pd.Series(pd.SparseArray(s, kind="integer")) self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True) - ss3 = s.to_sparse(fill_value=0) + ss3 = pd.Series(pd.SparseArray(s, fill_value=0)) self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) - @ignore_sparse - @ignore_dataframe_tosparse def test_sparse_frame(self): s = tm.makeDataFrame() s.iloc[3:5, 1:3] = np.nan s.iloc[8:10, -2] = np.nan - ss = s.to_sparse() + ss = s.astype("Sparse") self._check_double_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) - ss2 = s.to_sparse(kind="integer") + ss2 = s.apply(lambda x: pd.SparseArray(x, kind="integer")) self._check_double_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) - ss3 = s.to_sparse(fill_value=0) + ss3 = s.apply(lambda x: pd.SparseArray(x, fill_value=0)) self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) def test_float_index(self): @@ -2709,15 +2697,13 @@ def test_overwrite_node(self): tm.assert_series_equal(store["a"], ts) - @ignore_sparse - @ignore_dataframe_tosparse def test_sparse_with_compression(self): # GH 2931 # make sparse dataframe arr = np.random.binomial(n=1, p=0.01, size=(1000, 10)) - df = DataFrame(arr).to_sparse(fill_value=0) + df = DataFrame(arr).apply(lambda x: pd.SparseArray(x, fill_value=0)) # case 1: store uncompressed self._check_double_roundtrip( @@ -3890,8 +3876,6 @@ def test_start_stop_multiple(self): expected = df.loc[[0], ["foo", "bar"]] tm.assert_frame_equal(result, expected) - @ignore_sparse - @ignore_dataframe_tosparse def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: @@ -3931,7 +3915,7 @@ def test_start_stop_fixed(self): df = tm.makeDataFrame() df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan - dfs = df.to_sparse() + dfs = df.apply(pd.SparseArray) store.put("dfs", dfs) with pytest.raises(NotImplementedError): store.select("dfs", start=0, stop=5) From 1f885f259e3e6673fab79c564ae244f00ce7dfaf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2019 09:35:07 -0500 Subject: [PATCH 2/5] issue no --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 19f09e49f9026..53c1d8c6c5777 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -69,7 +69,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) -- :func:`read_hdf` now reads sparse values into a :class:`Series` or :class:`DataFrame` with sparse values rather than a ``SparseDataFrame`` or ``SparseSeries`` (:issue:``) +- :func:`read_hdf` now reads sparse values into a :class:`Series` or :class:`DataFrame` with sparse values rather than a ``SparseDataFrame`` or ``SparseSeries`` (:issue:`28456`) - .. _whatsnew_1000.deprecations: From 2d5c299ceb32b189a2f25a4d803e96794a594741 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2019 14:34:00 -0500 Subject: [PATCH 3/5] wip --- pandas/io/pytables.py | 39 ++++++++++++++++++++--- pandas/tests/io/pytables/test_pytables.py | 23 +++++++++++++ 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f6da988e110c0..1d789651feb2a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -183,7 +183,7 @@ class DuplicateWarning(Warning): "series": "SeriesFixed", "sparse_series": "SeriesFixed", "frame": "FrameFixed", - "sparse_frame": "FrameFixed", + "sparse_frame": "SparseFrameFixed", } # table class map @@ -2722,6 +2722,18 @@ def write(self, obj, **kwargs): def read_array(self, key, start=None, stop=None): """ read an array for the specified node (off of group """ + if ( + self.pandas_type == "sparse_series" or "sp_index_length" in self.attrs + ) and key not in self.group: + # Compatibility for files written with pandas 0.25.1 and earlier. + if "sp_values" in self.group: + key = "sp_values" + dtype = "Sparse" + sp_index = self.read_index("sp_index".format(key)) + else: + dtype = None + sp_index = None + import tables node = getattr(self.group, key) @@ -2732,7 +2744,7 @@ def read_array(self, key, start=None, stop=None): if isinstance(node, tables.VLArray): ret = node[0][start:stop] else: - dtype = getattr(attrs, "value_type", None) + dtype = getattr(attrs, "value_type", dtype) shape = getattr(attrs, "shape", None) if shape is not None: @@ -2754,7 +2766,8 @@ def read_array(self, key, start=None, stop=None): raise NotImplementedError( "start and/or stop are not supported in fixed Sparse reading" ) - sp_index = self.read_index("{}_sp_index".format(key)) + if sp_index is None: + sp_index = self.read_index("{}_sp_index".format(key)) ret = SparseArray( ret, sparse_index=sp_index, fill_value=self.attrs.fill_value ) @@ -3079,10 +3092,10 @@ def shape(self): except (TypeError, AttributeError): return None - def read(self, **kwargs): + def read(self, key="values", **kwargs): kwargs = self.validate_read(kwargs) index = self.read_index("index", **kwargs) - values = self.read_array("values", **kwargs) + values = self.read_array(key, **kwargs) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): @@ -3184,6 +3197,22 @@ class FrameFixed(BlockManagerFixed): obj_type = DataFrame +class SparseFrameFixed(GenericFixed): + pandas_kind = "sparse_frame" + attributes = ["default_kind", "default_fill_value"] + + def read(self, **kwargs): + kwargs = self.validate_read(kwargs) + columns = self.read_index("columns") + sdict = {} + for c in columns: + key = "sparse_series_{columns}".format(columns=c) + s = SeriesFixed(self.parent, getattr(self.group, key)) + s.infer_axes() + sdict[c] = s.read(key=key) + return DataFrame(sdict) + + class Table(Fixed): """ represent a table: facilitate read/write of various types of tables diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 541ecbaa5b811..e4347bd6357bf 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -4883,6 +4883,29 @@ def test_read_py2_hdf_file_in_py3(self, datapath): result = store["p"] assert_frame_equal(result, expected) + def test_read_legacy_sparse(self, datapath): + """ + Generated with pandas 0.25.1 and + + >>> s = pd.Series([1, None, 2, 3]).to_sparse() + >>> df = pd.DataFrame({"A": [1, None, 2, 3], "B": [1, 0, 0, 0]}).to_sparse() + >>> s.to_hdf("pandas/tests/io/data/legacy_hdf/legacy_sparse.h5", "series") + >>> df.to_hdf("pandas/tests/io/data/legacy_hdf/legacy_sparse.h5", "frame") + """ + result = pd.read_hdf( + datapath("io", "data", "legacy_hdf", "legacy_sparse.h5"), "series" + ) + expected = pd.Series(pd.SparseArray([1, None, 2, 3])) + tm.assert_series_equal(result, expected) + + result = pd.read_hdf( + datapath("io", "data", "legacy_hdf", "legacy_sparse.h5"), "frame" + ) + expected = pd.DataFrame( + {"A": pd.SparseArray([1, None, 2, 3]), "B": pd.SparseArray([1, 0, 0, 0])} + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) def test_select_empty_where(self, where): # GH26610 From 227f47f16bef9bc99259a2ecdfd9d99325055a94 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2019 14:43:25 -0500 Subject: [PATCH 4/5] Revert "wip" This reverts commit 2d5c299ceb32b189a2f25a4d803e96794a594741. --- pandas/io/pytables.py | 39 +++-------------------- pandas/tests/io/pytables/test_pytables.py | 23 ------------- 2 files changed, 5 insertions(+), 57 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1d789651feb2a..f6da988e110c0 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -183,7 +183,7 @@ class DuplicateWarning(Warning): "series": "SeriesFixed", "sparse_series": "SeriesFixed", "frame": "FrameFixed", - "sparse_frame": "SparseFrameFixed", + "sparse_frame": "FrameFixed", } # table class map @@ -2722,18 +2722,6 @@ def write(self, obj, **kwargs): def read_array(self, key, start=None, stop=None): """ read an array for the specified node (off of group """ - if ( - self.pandas_type == "sparse_series" or "sp_index_length" in self.attrs - ) and key not in self.group: - # Compatibility for files written with pandas 0.25.1 and earlier. - if "sp_values" in self.group: - key = "sp_values" - dtype = "Sparse" - sp_index = self.read_index("sp_index".format(key)) - else: - dtype = None - sp_index = None - import tables node = getattr(self.group, key) @@ -2744,7 +2732,7 @@ def read_array(self, key, start=None, stop=None): if isinstance(node, tables.VLArray): ret = node[0][start:stop] else: - dtype = getattr(attrs, "value_type", dtype) + dtype = getattr(attrs, "value_type", None) shape = getattr(attrs, "shape", None) if shape is not None: @@ -2766,8 +2754,7 @@ def read_array(self, key, start=None, stop=None): raise NotImplementedError( "start and/or stop are not supported in fixed Sparse reading" ) - if sp_index is None: - sp_index = self.read_index("{}_sp_index".format(key)) + sp_index = self.read_index("{}_sp_index".format(key)) ret = SparseArray( ret, sparse_index=sp_index, fill_value=self.attrs.fill_value ) @@ -3092,10 +3079,10 @@ def shape(self): except (TypeError, AttributeError): return None - def read(self, key="values", **kwargs): + def read(self, **kwargs): kwargs = self.validate_read(kwargs) index = self.read_index("index", **kwargs) - values = self.read_array(key, **kwargs) + values = self.read_array("values", **kwargs) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): @@ -3197,22 +3184,6 @@ class FrameFixed(BlockManagerFixed): obj_type = DataFrame -class SparseFrameFixed(GenericFixed): - pandas_kind = "sparse_frame" - attributes = ["default_kind", "default_fill_value"] - - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - columns = self.read_index("columns") - sdict = {} - for c in columns: - key = "sparse_series_{columns}".format(columns=c) - s = SeriesFixed(self.parent, getattr(self.group, key)) - s.infer_axes() - sdict[c] = s.read(key=key) - return DataFrame(sdict) - - class Table(Fixed): """ represent a table: facilitate read/write of various types of tables diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index e4347bd6357bf..541ecbaa5b811 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -4883,29 +4883,6 @@ def test_read_py2_hdf_file_in_py3(self, datapath): result = store["p"] assert_frame_equal(result, expected) - def test_read_legacy_sparse(self, datapath): - """ - Generated with pandas 0.25.1 and - - >>> s = pd.Series([1, None, 2, 3]).to_sparse() - >>> df = pd.DataFrame({"A": [1, None, 2, 3], "B": [1, 0, 0, 0]}).to_sparse() - >>> s.to_hdf("pandas/tests/io/data/legacy_hdf/legacy_sparse.h5", "series") - >>> df.to_hdf("pandas/tests/io/data/legacy_hdf/legacy_sparse.h5", "frame") - """ - result = pd.read_hdf( - datapath("io", "data", "legacy_hdf", "legacy_sparse.h5"), "series" - ) - expected = pd.Series(pd.SparseArray([1, None, 2, 3])) - tm.assert_series_equal(result, expected) - - result = pd.read_hdf( - datapath("io", "data", "legacy_hdf", "legacy_sparse.h5"), "frame" - ) - expected = pd.DataFrame( - {"A": pd.SparseArray([1, None, 2, 3]), "B": pd.SparseArray([1, 0, 0, 0])} - ) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) def test_select_empty_where(self, where): # GH26610 From 62cd058f41f90eae01563a5caae46750ceea4027 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Sep 2019 14:51:32 -0500 Subject: [PATCH 5/5] tests --- pandas/io/pytables.py | 13 +++++++++---- pandas/tests/io/pytables/test_pytables.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f6da988e110c0..211af08cb750e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2756,7 +2756,10 @@ def read_array(self, key, start=None, stop=None): ) sp_index = self.read_index("{}_sp_index".format(key)) ret = SparseArray( - ret, sparse_index=sp_index, fill_value=self.attrs.fill_value + ret, + sparse_index=sp_index, + fill_value=self.attrs["{}_fill_value".format(key)], + kind=self.attrs["{}_kind".format(key)], ) if transposed: @@ -3032,9 +3035,11 @@ def write_array(self, key, value, items=None): self.write_index("{}_sp_index".format(key), value.sp_index) self._handle.create_array(self.group, key, value.sp_values) getattr(self.group, key)._v_attrs.value_type = "Sparse" - self.attrs.fill_value = value.fill_value - self.attrs.kind = value.kind - self.attributes.extend(["fill_value", "kind"]) + setattr(self.attrs, "{}_fill_value".format(key), value.fill_value) + setattr(self.attrs, "{}_kind".format(key), value.kind) + self.attributes.extend( + ["{}_fill_value".format(key), "{}_kind".format(key)] + ) else: self._handle.create_array(self.group, key, value) diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 541ecbaa5b811..bb5a0c9a8a4b7 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -2373,6 +2373,17 @@ def test_sparse_frame(self): ss3 = s.apply(lambda x: pd.SparseArray(x, fill_value=0)) self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) + def test_mixed_sparse_dense_frame(self): + df = pd.DataFrame( + { + "A": [0, 1, 2, 3], + "B": pd.SparseArray([0, 1, 2, 3], kind="block"), + "C": [0.0, 1.0, 2.0, 3.0], + "D": pd.SparseArray([0.0, 1.0, 2.0, 3.0], kind="integer"), + } + ) + self._check_roundtrip(df, tm.assert_frame_equal) + def test_float_index(self): # GH #454