@@ -678,6 +678,55 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
678
678
new ._dtype = dtype
679
679
return new
680
680
681
@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` does not have exactly one column.

    Examples
    --------
    >>> import scipy.sparse
    >>> mat = scipy.sparse.coo_matrix((4, 1))
    >>> pd.SparseArray.from_spmatrix(mat)
    [0.0, 0.0, 0.0, 0.0]
    Fill: 0.0
    IntIndex
    Indices: array([], dtype=int32)
    """
    length, ncol = data.shape

    if ncol != 1:
        raise ValueError(
            "'data' must have a single column, not '{}'".format(ncol)
        )

    # Our sparse index classes require that the positions be strictly
    # increasing. Converting to CSC sums duplicate entries, and
    # sort_indices() orders the row positions together with their values,
    # so ``data.data`` and ``data.indices`` stay aligned one-to-one.
    # Pairing ``data.data`` with ``data.nonzero()`` is not safe:
    # ``nonzero()`` drops explicitly stored zeros while ``data.data``
    # keeps them, which misaligns values and positions.
    # NOTE: if ``data`` is already CSC, tocsc() may hand back the same
    # object, in which case its index storage is sorted in place.
    data = data.tocsc()
    data.sort_indices()
    arr = data.data
    idx = data.indices

    # The fill value is the zero of the matrix dtype, as a Python scalar.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)
729
+
681
730
def __array__ (self , dtype = None , copy = True ):
682
731
fill_value = self .fill_value
683
732
@@ -1891,27 +1940,32 @@ def _make_index(length, indices, kind):
1891
1940
# ----------------------------------------------------------------------------
1892
1941
# Accessor
1893
1942
1943
+
1944
class BaseAccessor(object):
    """
    Shared base for the ``.sparse`` accessors.

    Stores the object the accessor was created from on ``_parent`` and
    runs the ``_validate`` hook against it at construction time.
    """

    # Error text available to subclasses' ``_validate`` implementations.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        # Keep a reference to the wrapped object, then validate it.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validate ``data``; this base implementation always raises."""
        raise NotImplementedError()
1953
+
1954
+
1894
1955
@delegate_names (SparseArray , ['npoints' , 'density' , 'fill_value' ,
1895
1956
'sp_values' ],
1896
1957
typ = 'property' )
1897
- class SparseAccessor (PandasDelegate ):
1958
+ class SparseAccessor (BaseAccessor , PandasDelegate ):
1898
1959
"""
1899
1960
    Accessor for sparse data, including conversion from other sparse
    matrix data types.
1900
1961
"""
1901
1962
1902
- def __init__ (self , data = None ):
1903
- self ._validate (data )
1904
- # Store the Series since we need that for to_coo
1905
- self ._parent = data
1906
-
1907
- @staticmethod
1908
- def _validate (data ):
1963
def _validate(self, data):
    """
    Ensure ``data`` holds sparse values.

    Raises
    ------
    AttributeError
        If ``data.dtype`` is not a :class:`SparseDtype`.
    """
    if isinstance(data.dtype, SparseDtype):
        return
    raise AttributeError(self._validation_msg)
1912
1966
1913
1967
def _delegate_property_get(self, name, *args, **kwargs):
    """Look up the delegated property ``name`` on the parent's array."""
    backing_array = self._parent.array
    return getattr(backing_array, name)
1915
1969
1916
1970
def _delegate_method (self , name , * args , ** kwargs ):
1917
1971
if name == 'from_coo' :
@@ -2025,3 +2079,188 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
2025
2079
column_levels ,
2026
2080
sort_labels = sort_labels )
2027
2081
return A , rows , columns
2082
+
2083
def to_dense(self):
    """
    Convert a Series from sparse values to dense.

    .. versionadded:: 0.25.0

    Returns
    -------
    Series
        A Series with the same values, stored as a dense array.

    Examples
    --------
    >>> series = pd.Series(pd.SparseArray([0, 1, 0]))
    >>> series
    0    0
    1    1
    2    0
    dtype: Sparse[int64, 0]

    >>> series.sparse.to_dense()
    0    0
    1    1
    2    0
    dtype: int64
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    # Carry the original labels and name over to the dense result.
    return Series(dense_values, index=parent.index, name=parent.name)
2113
+
2114
+
2115
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # The accessor is only usable when every column is sparse.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`SparseArray`.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the intermediate dict by position, not by label: labels may
        # repeat, and a label-keyed dict would silently drop columns.
        result = DataFrame(dict(enumerate(sparrays)), index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        parent = self._parent
        # Positional keys keep columns with duplicate labels distinct; the
        # real labels are restored once the frame is built.
        data = {pos: column.array.to_dense()
                for pos, (_, column) in enumerate(parent.items())}
        result = DataFrame(data, index=parent.index)
        result.columns = parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If SciPy is not installed.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        # Pass a plain list of dtypes rather than the dtypes Series.
        dtype = find_common_type(list(self._parent.dtypes))
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Walk the columns positionally so duplicate labels each contribute
        # their own column instead of being looked up by (ambiguous) name.
        for col, (_, s) in enumerate(self._parent.items()):
            row = s.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.
        """
        # Computed as the mean of the per-column densities.
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        Returns the ``(index, columns)`` pair to use for ``data``; raises
        ValueError when a supplied label set does not match ``data.shape``.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
0 commit comments