Skip to content

Commit 02cb410

Browse files
committed
Squashed commit of the following:
commit 8b136bf Merge: 3005aed 01d3dc2 Author: Tom Augspurger <[email protected]> Date: Fri Mar 15 16:03:23 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 3005aed Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:26:32 2019 -0500 isort? commit 318c06f Merge: 0922296 79205ea Author: Tom Augspurger <[email protected]> Date: Thu Mar 14 06:25:45 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 0922296 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 21:35:51 2019 -0500 updates commit f433be8 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:54:07 2019 -0500 lint commit 6696f28 Merge: 534a379 1017382 Author: Tom Augspurger <[email protected]> Date: Wed Mar 13 20:53:13 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 534a379 Merge: 94a7baf 5c341dc Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:37:27 2019 -0500 Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor commit 94a7baf Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 14:22:48 2019 -0500 fixups commit 6f619b5 Author: Tom Augspurger <[email protected]> Date: Tue Mar 12 13:38:48 2019 -0500 32-bit compat commit 24f48c3 Author: Tom Augspurger <[email protected]> Date: Mon Mar 11 22:05:46 2019 -0500 API: DataFrame.sparse accessor Closes pandas-dev#25681
1 parent 01d3dc2 commit 02cb410

File tree

7 files changed

+391
-82
lines changed

7 files changed

+391
-82
lines changed

doc/source/reference/frame.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
312312
DataFrame.boxplot
313313
DataFrame.hist
314314

315+
316+
.. _api.frame.sparse:
317+
318+
Sparse Accessor
319+
~~~~~~~~~~~~~~~
320+
321+
Sparse-dtype specific methods and attributes are provided under the
322+
``DataFrame.sparse`` accessor.
323+
324+
.. autosummary::
325+
:toctree: api/
326+
:template: autosummary/accessor_attribute.rst
327+
328+
DataFrame.sparse.density
329+
330+
.. autosummary::
331+
:toctree: api/
332+
333+
DataFrame.sparse.from_spmatrix
334+
DataFrame.sparse.to_coo
335+
DataFrame.sparse.to_dense
336+
337+
315338
Serialization / IO / Conversion
316339
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
317340
.. autosummary::

doc/source/whatsnew/v0.25.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Other Enhancements
2626
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
2727
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
2828
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
29+
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
2930
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3031

3132
.. _whatsnew_0250.api_breaking:

pandas/core/arrays/sparse.py

Lines changed: 250 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,55 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
678678
new._dtype = dtype
679679
return new
680680

681+
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If ``data`` does not have exactly one column.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # For a single-column CSC matrix, ``indices`` holds exactly one row
    # position per entry of ``data``, including explicitly stored zeros.
    # ``data.nonzero()`` drops stored zeros, so pairing it with
    # ``data.data`` can misalign values and positions.
    data = data.tocsc()
    arr = data.data
    idx = data.indices

    # our sparse index classes require that the positions be strictly
    # increasing. ``take`` returns new arrays, so the caller's matrix is
    # left untouched.
    order = np.argsort(idx)
    arr = arr.take(order)
    idx = idx.take(order)

    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
729+
681730
def __array__(self, dtype=None, copy=True):
682731
fill_value = self.fill_value
683732

@@ -1891,27 +1940,32 @@ def _make_index(length, indices, kind):
18911940
# ----------------------------------------------------------------------------
18921941
# Accessor
18931942

1943+
1944+
class BaseAccessor(object):
    """
    Shared base for the ``.sparse`` accessors.

    Stores the object the accessor was created from on ``_parent`` and
    runs ``_validate`` on it at construction time. Subclasses must
    override ``_validate``.
    """

    # Error text available to subclasses' ``_validate`` implementations.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # The parent is stored before validation runs.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that ``data`` may be used with this accessor.

        The base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError
1953+
1954+
18941955
@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
18951956
'sp_values'],
18961957
typ='property')
1897-
class SparseAccessor(PandasDelegate):
1958+
class SparseAccessor(BaseAccessor, PandasDelegate):
18981959
"""
18991960
Accessor for SparseSparse from other sparse matrix data types.
19001961
"""
19011962

1902-
def __init__(self, data=None):
1903-
self._validate(data)
1904-
# Store the Series since we need that for to_coo
1905-
self._parent = data
1906-
1907-
@staticmethod
1908-
def _validate(data):
1963+
def _validate(self, data):
19091964
if not isinstance(data.dtype, SparseDtype):
1910-
msg = "Can only use the '.sparse' accessor with Sparse data."
1911-
raise AttributeError(msg)
1965+
raise AttributeError(self._validation_msg)
19121966

19131967
def _delegate_property_get(self, name, *args, **kwargs):
1914-
return getattr(self._parent.values, name)
1968+
return getattr(self._parent.array, name)
19151969

19161970
def _delegate_method(self, name, *args, **kwargs):
19171971
if name == 'from_coo':
@@ -2025,3 +2079,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
20252079
column_levels,
20262080
sort_labels=sort_labels)
20272081
return A, rows, columns
2082+
2083+
def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, index and name, stored as a
        dense array.

    Examples
    --------
    >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    # Imported here rather than at module level.
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)
2113+
2114+
2115+
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        """
        Require every column of ``data`` to have a ``SparseDtype``.

        Raises
        ------
        AttributeError
            If any column dtype is not a ``SparseDtype``.
        """
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the arrays by position and assign the labels afterwards.
        # Keying by label would silently collapse duplicate column labels
        # and would tie column order to dict ordering.
        result = DataFrame(dict(enumerate(sparrays)), index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Positional keys keep every column even when labels repeat.
        data = {pos: column.array.to_dense()
                for pos, (_, column) in enumerate(self._parent.items())}
        result = DataFrame(data, index=self._parent.index)
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If SciPy cannot be imported.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Only the explicitly stored values of each column are written to
        the result; positions holding a column's fill value are left as
        implicit entries of the matrix.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        dtype = find_common_type(list(self._parent.dtypes))
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate by position so duplicate column labels still map to a
        # single column each.
        for col, (_, s) in enumerate(self._parent.items()):
            row = s.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.

        Computed as the mean of the per-column densities.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        Parameters
        ----------
        data : object with a two-element ``shape``
        index, columns : sequence or None
            ``None`` is replaced by a default index of matching length.

        Returns
        -------
        tuple
            ``(index, columns)``

        Raises
        ------
        ValueError
            If ``index`` or ``columns`` has the wrong length.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns

pandas/core/frame.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
PY36, raise_with_traceback, Iterator,
3737
string_and_binary_types)
3838
from pandas.compat.numpy import function as nv
39+
from pandas.core.arrays.sparse import SparseFrameAccessor
3940
from pandas.core.dtypes.cast import (
4041
maybe_upcast,
4142
cast_scalar_to_array,
@@ -8009,6 +8010,7 @@ def isin(self, values):
80098010
plot = CachedAccessor("plot", gfx.FramePlotMethods)
80108011
hist = gfx.hist_frame
80118012
boxplot = gfx.boxplot_frame
8013+
sparse = CachedAccessor("sparse", SparseFrameAccessor)
80128014

80138015

80148016
DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,

0 commit comments

Comments
 (0)