diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index c78e27f098f13..53c1d8c6c5777 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -69,7 +69,7 @@ Other API changes
 ^^^^^^^^^^^^^^^^^
 
 - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`)
--
+- :func:`read_hdf` now reads sparse values into a :class:`Series` or :class:`DataFrame` with sparse values rather than a ``SparseDataFrame`` or ``SparseSeries`` (:issue:`28456`)
 -
 
 .. _whatsnew_1000.deprecations:
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 4f3f639de5cb1..211af08cb750e 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -28,6 +28,7 @@
     is_datetime64tz_dtype,
     is_extension_type,
     is_list_like,
+    is_sparse,
     is_timedelta64_dtype,
 )
 from pandas.core.dtypes.missing import array_equivalent
@@ -40,8 +41,7 @@
     MultiIndex,
     PeriodIndex,
     Series,
-    SparseDataFrame,
-    SparseSeries,
+    SparseArray,
     TimedeltaIndex,
     concat,
     isna,
@@ -173,12 +173,7 @@ class DuplicateWarning(Warning):
 """
 
 # map object types
-_TYPE_MAP = {
-    Series: "series",
-    SparseSeries: "sparse_series",
-    DataFrame: "frame",
-    SparseDataFrame: "sparse_frame",
-}
+_TYPE_MAP = {Series: "series", DataFrame: "frame"}
 
 # storer class map
 _STORER_MAP = {
@@ -186,9 +181,9 @@ class DuplicateWarning(Warning):
     "DataFrame": "LegacyFrameFixed",
     "DataMatrix": "LegacyFrameFixed",
     "series": "SeriesFixed",
-    "sparse_series": "SparseSeriesFixed",
+    "sparse_series": "SeriesFixed",
     "frame": "FrameFixed",
-    "sparse_frame": "SparseFrameFixed",
+    "sparse_frame": "FrameFixed",
 }
 
 # table class map
@@ -2754,6 +2749,19 @@ def read_array(self, key, start=None, stop=None):
         elif dtype == "timedelta64":
             ret = np.asarray(ret, dtype="m8[ns]")
 
+        if dtype == "Sparse":
+            if start or stop:
+                raise NotImplementedError(
+                    "start and/or stop are not supported in fixed Sparse reading"
+                )
+            sp_index = self.read_index("{}_sp_index".format(key))
+            ret = SparseArray(
+                ret,
+                sparse_index=sp_index,
+                fill_value=self.attrs["{}_fill_value".format(key)],
+                kind=self.attrs["{}_kind".format(key)],
+            )
+
         if transposed:
             return ret.T
         else:
@@ -3004,7 +3012,7 @@ def write_array(self, key, value, items=None):
             vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
             vlarr.append(value)
         else:
-            if empty_array:
+            if empty_array and not is_sparse(value):
                 self.write_array_empty(key, value)
             else:
                 if is_datetime64_dtype(value.dtype):
@@ -3021,6 +3029,17 @@ def write_array(self, key, value, items=None):
                 elif is_timedelta64_dtype(value.dtype):
                     self._handle.create_array(self.group, key, value.view("i8"))
                     getattr(self.group, key)._v_attrs.value_type = "timedelta64"
+                elif is_sparse(value):
+                    # TODO: think about EA API for this.
+                    # value._write_hdf5(self)
+                    self.write_index("{}_sp_index".format(key), value.sp_index)
+                    self._handle.create_array(self.group, key, value.sp_values)
+                    getattr(self.group, key)._v_attrs.value_type = "Sparse"
+                    setattr(self.attrs, "{}_fill_value".format(key), value.fill_value)
+                    setattr(self.attrs, "{}_kind".format(key), value.kind)
+                    self.attributes.extend(
+                        ["{}_fill_value".format(key), "{}_kind".format(key)]
+                    )
                 else:
                     self._handle.create_array(self.group, key, value)
 
@@ -3078,83 +3097,6 @@ def write(self, obj, **kwargs):
         self.attrs.name = obj.name
 
 
-class SparseFixed(GenericFixed):
-    def validate_read(self, kwargs):
-        """
-        we don't support start, stop kwds in Sparse
-        """
-        kwargs = super().validate_read(kwargs)
-        if "start" in kwargs or "stop" in kwargs:
-            raise NotImplementedError(
-                "start and/or stop are not supported in fixed Sparse reading"
-            )
-        return kwargs
-
-
-class SparseSeriesFixed(SparseFixed):
-    pandas_kind = "sparse_series"
-    attributes = ["name", "fill_value", "kind"]
-
-    def read(self, **kwargs):
-        kwargs = self.validate_read(kwargs)
-        index = self.read_index("index")
-        sp_values = self.read_array("sp_values")
-        sp_index = self.read_index("sp_index")
-        return SparseSeries(
-            sp_values,
-            index=index,
-            sparse_index=sp_index,
-            kind=self.kind or "block",
-            fill_value=self.fill_value,
-            name=self.name,
-        )
-
-    def write(self, obj, **kwargs):
-        super().write(obj, **kwargs)
-        self.write_index("index", obj.index)
-        self.write_index("sp_index", obj.sp_index)
-        self.write_array("sp_values", obj.sp_values)
-        self.attrs.name = obj.name
-        self.attrs.fill_value = obj.fill_value
-        self.attrs.kind = obj.kind
-
-
-class SparseFrameFixed(SparseFixed):
-    pandas_kind = "sparse_frame"
-    attributes = ["default_kind", "default_fill_value"]
-
-    def read(self, **kwargs):
-        kwargs = self.validate_read(kwargs)
-        columns = self.read_index("columns")
-        sdict = {}
-        for c in columns:
-            key = "sparse_series_{columns}".format(columns=c)
-            s = SparseSeriesFixed(self.parent, getattr(self.group, key))
-            s.infer_axes()
-            sdict[c] = s.read()
-        return SparseDataFrame(
-            sdict,
-            columns=columns,
-            default_kind=self.default_kind,
-            default_fill_value=self.default_fill_value,
-        )
-
-    def write(self, obj, **kwargs):
-        """ write it as a collection of individual sparse series """
-        super().write(obj, **kwargs)
-        for name, ss in obj.items():
-            key = "sparse_series_{name}".format(name=name)
-            if key not in self.group._v_children:
-                node = self._handle.create_group(self.group, key)
-            else:
-                node = getattr(self.group, key)
-            s = SparseSeriesFixed(self.parent, node)
-            s.write(ss)
-        self.attrs.default_fill_value = obj.default_fill_value
-        self.attrs.default_kind = obj.default_kind
-        self.write_index("columns", obj.columns)
-
-
 class BlockManagerFixed(GenericFixed):
     attributes = ["ndim", "nblocks"]
     is_shape_reversed = False
diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py
index 77cac00882771..bb5a0c9a8a4b7 100644
--- a/pandas/tests/io/pytables/test_pytables.py
+++ b/pandas/tests/io/pytables/test_pytables.py
@@ -71,14 +71,6 @@
 ignore_natural_naming_warning = pytest.mark.filterwarnings(
     "ignore:object name:tables.exceptions.NaturalNameWarning"
 )
-ignore_sparse = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
-ignore_dataframe_tosparse = pytest.mark.filterwarnings(
-    "ignore:DataFrame.to_sparse:FutureWarning"
-)
-ignore_series_tosparse = pytest.mark.filterwarnings(
-    "ignore:Series.to_sparse:FutureWarning"
-)
-
 
 # contextmanager to ensure the file cleanup
 
@@ -2353,38 +2345,45 @@ def test_series(self):
         ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
         self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False)
 
-    @ignore_sparse
-    @ignore_series_tosparse
     def test_sparse_series(self):
 
         s = tm.makeStringSeries()
         s.iloc[3:5] = np.nan
-        ss = s.to_sparse()
+        ss = s.astype("Sparse")
         self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True)
 
-        ss2 = s.to_sparse(kind="integer")
+        ss2 = pd.Series(pd.SparseArray(s, kind="integer"))
         self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True)
 
-        ss3 = s.to_sparse(fill_value=0)
+        ss3 = pd.Series(pd.SparseArray(s, fill_value=0))
         self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True)
 
-    @ignore_sparse
-    @ignore_dataframe_tosparse
     def test_sparse_frame(self):
 
         s = tm.makeDataFrame()
         s.iloc[3:5, 1:3] = np.nan
         s.iloc[8:10, -2] = np.nan
-        ss = s.to_sparse()
+        ss = s.astype("Sparse")
 
         self._check_double_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True)
 
-        ss2 = s.to_sparse(kind="integer")
+        ss2 = s.apply(lambda x: pd.SparseArray(x, kind="integer"))
         self._check_double_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True)
 
-        ss3 = s.to_sparse(fill_value=0)
+        ss3 = s.apply(lambda x: pd.SparseArray(x, fill_value=0))
         self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True)
 
+    def test_mixed_sparse_dense_frame(self):
+        df = pd.DataFrame(
+            {
+                "A": [0, 1, 2, 3],
+                "B": pd.SparseArray([0, 1, 2, 3], kind="block"),
+                "C": [0.0, 1.0, 2.0, 3.0],
+                "D": pd.SparseArray([0.0, 1.0, 2.0, 3.0], kind="integer"),
+            }
+        )
+        self._check_roundtrip(df, tm.assert_frame_equal)
+
     def test_float_index(self):
 
         # GH #454
@@ -2709,15 +2708,13 @@ def test_overwrite_node(self):
 
         tm.assert_series_equal(store["a"], ts)
 
-    @ignore_sparse
-    @ignore_dataframe_tosparse
     def test_sparse_with_compression(self):
 
         # GH 2931
 
         # make sparse dataframe
         arr = np.random.binomial(n=1, p=0.01, size=(1000, 10))
-        df = DataFrame(arr).to_sparse(fill_value=0)
+        df = DataFrame(arr).apply(lambda x: pd.SparseArray(x, fill_value=0))
 
         # case 1: store uncompressed
         self._check_double_roundtrip(
@@ -3890,8 +3887,6 @@ def test_start_stop_multiple(self):
         expected = df.loc[[0], ["foo", "bar"]]
         tm.assert_frame_equal(result, expected)
 
-    @ignore_sparse
-    @ignore_dataframe_tosparse
     def test_start_stop_fixed(self):
 
         with ensure_clean_store(self.path) as store:
@@ -3931,7 +3926,7 @@ def test_start_stop_fixed(self):
             df = tm.makeDataFrame()
             df.iloc[3:5, 1:3] = np.nan
             df.iloc[8:10, -2] = np.nan
-            dfs = df.to_sparse()
+            dfs = df.apply(pd.SparseArray)
             store.put("dfs", dfs)
             with pytest.raises(NotImplementedError):
                 store.select("dfs", start=0, stop=5)
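
The snippet below is not part of the patch; it is a minimal round-trip sketch of the behavior this change targets, assuming a pandas build that includes this branch and has PyTables installed. The file name "sparse_roundtrip.h5" and the column names are illustrative only.

# Minimal fixed-format round-trip sketch (assumes this branch + PyTables).
import numpy as np
import pandas as pd

# One dense column and one column backed by a SparseArray (block kind, NaN fill).
df = pd.DataFrame(
    {
        "dense": np.arange(4, dtype="float64"),
        "sparse": pd.SparseArray([0.0, np.nan, np.nan, 3.0], kind="block"),
    }
)

# Fixed format goes through the patched GenericFixed.write_array/read_array paths;
# start/stop selection on the sparse values raises NotImplementedError by design.
df.to_hdf("sparse_roundtrip.h5", key="df", format="fixed")
result = pd.read_hdf("sparse_roundtrip.h5", key="df")

# The sparse column comes back as a regular Series holding a SparseArray
# (dtype "Sparse[float64, nan]"), not a SparseDataFrame/SparseSeries.
print(result.dtypes)
print(pd.api.types.is_sparse(result["sparse"]))  # True under this branch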