-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Support EAs in Series.unstack #23284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ced299f
3b63fcb
756dde9
90f84ef
942db1b
36a4450
ee330d6
2fcaf4d
4f46364
e9498a1
72b5a0d
f6b2050
4d679cb
ff7aba7
91587cb
49bdb50
cf8ed73
5902b5b
17d3002
a75806a
2397e89
8ed7c73
b23234c
29a6bb1
19b7cfa
254fe52
2d78d42
a9e6263
ca286f7
2f28638
967c674
f6aa4b9
32bc3de
56e5f2f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
# -*- coding: utf-8 -*- | ||
import functools | ||
import warnings | ||
import inspect | ||
import re | ||
|
@@ -34,6 +35,7 @@ | |
is_numeric_v_string_like, is_extension_type, | ||
is_extension_array_dtype, | ||
is_list_like, | ||
is_sparse, | ||
is_re, | ||
is_re_compilable, | ||
pandas_dtype) | ||
|
@@ -632,7 +634,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, | |
return self | ||
|
||
if klass is None: | ||
if dtype == np.object_: | ||
if is_sparse(self.values): | ||
# special case sparse, Series[Sparse].astype(object) is sparse | ||
klass = ExtensionBlock | ||
elif is_object_dtype(dtype): | ||
klass = ObjectBlock | ||
elif is_extension_array_dtype(dtype): | ||
klass = ExtensionBlock | ||
|
@@ -1429,7 +1434,7 @@ def equals(self, other): | |
return False | ||
return array_equivalent(self.values, other.values) | ||
|
||
def _unstack(self, unstacker_func, new_columns): | ||
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): | ||
"""Return a list of unstacked blocks of self | ||
|
||
Parameters | ||
|
@@ -1438,6 +1443,10 @@ def _unstack(self, unstacker_func, new_columns): | |
Partially applied unstacker. | ||
new_columns : Index | ||
All columns of the unstacked BlockManager. | ||
n_rows : int | ||
Only used in ExtensionBlock.unstack | ||
fill_value : int | ||
Only used in ExtensionBlock.unstack | ||
|
||
Returns | ||
------- | ||
|
@@ -1731,7 +1740,7 @@ def _slice(self, slicer): | |
def _try_cast_result(self, result, dtype=None): | ||
return result | ||
|
||
def _unstack(self, unstacker_func, new_columns): | ||
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): | ||
"""Return a list of unstacked blocks of self | ||
|
||
Parameters | ||
|
@@ -1740,6 +1749,10 @@ def _unstack(self, unstacker_func, new_columns): | |
Partially applied unstacker. | ||
new_columns : Index | ||
All columns of the unstacked BlockManager. | ||
n_rows : int | ||
Only used in ExtensionBlock.unstack | ||
fill_value : int | ||
Only used in ExtensionBlock.unstack | ||
|
||
Returns | ||
------- | ||
|
@@ -1751,18 +1764,50 @@ def _unstack(self, unstacker_func, new_columns): | |
# NonConsolidatable blocks can have a single item only, so we return | ||
# one block per item | ||
unstacker = unstacker_func(self.values.T) | ||
new_items = unstacker.get_new_columns() | ||
new_placement = new_columns.get_indexer(new_items) | ||
new_values, mask = unstacker.get_new_values() | ||
|
||
mask = mask.any(0) | ||
new_placement, new_values, mask = self._get_unstack_items( | ||
unstacker, new_columns | ||
) | ||
|
||
new_values = new_values.T[mask] | ||
new_placement = new_placement[mask] | ||
|
||
blocks = [self.make_block_same_class(vals, [place]) | ||
for vals, place in zip(new_values, new_placement)] | ||
return blocks, mask | ||
|
||
def _get_unstack_items(self, unstacker, new_columns): | ||
""" | ||
Get the placement, values, and mask for a Block unstack. | ||
|
||
This is shared between ObjectBlock and ExtensionBlock. They | ||
differ in that ObjectBlock passes the values, while ExtensionBlock | ||
passes the dummy ndarray of positions to be used by a take | ||
later. | ||
|
||
Parameters | ||
---------- | ||
unstacker : pandas.core.reshape.reshape._Unstacker | ||
new_columns : Index | ||
All columns of the unstacked BlockManager. | ||
|
||
Returns | ||
------- | ||
new_placement : ndarray[int] | ||
The placement of the new columns in `new_columns`. | ||
new_values : Union[ndarray, ExtensionArray] | ||
The first return value from _Unstacker.get_new_values. | ||
mask : ndarray[bool] | ||
The second return value from _Unstacker.get_new_values. | ||
""" | ||
# shared with ExtensionBlock | ||
new_items = unstacker.get_new_columns() | ||
new_placement = new_columns.get_indexer(new_items) | ||
new_values, mask = unstacker.get_new_values() | ||
|
||
mask = mask.any(0) | ||
return new_placement, new_values, mask | ||
|
||
|
||
class ExtensionBlock(NonConsolidatableMixIn, Block): | ||
"""Block for holding extension types. | ||
|
@@ -1950,6 +1995,30 @@ def shift(self, periods, axis=0): | |
def _ftype(self): | ||
return getattr(self.values, '_pandas_ftype', Block._ftype) | ||
|
||
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): | ||
# ExtensionArray-safe unstack. | ||
# We override ObjectBlock._unstack, which unstacks directly on the | ||
# values of the array. For EA-backed blocks, this would require | ||
# converting to a 2-D ndarray of objects. | ||
# Instead, we unstack an ndarray of integer positions, followed by | ||
# a `take` on the actual values. | ||
dummy_arr = np.arange(n_rows) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a docstring (or does it share one with the parent method)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The inherited one is OK. I'm going to leave a comment explaining why we override. |
||
dummy_unstacker = functools.partial(unstacker_func, fill_value=-1) | ||
unstacker = dummy_unstacker(dummy_arr) | ||
|
||
new_placement, new_values, mask = self._get_unstack_items( | ||
unstacker, new_columns | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wouldn't this work generically for all unstacking (e.g., what if you made this the super method)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is slower in general. It's necessarily slower for NumPy types, since you have to do the reshaping / unstack on the ndarray of positions anyway. The hope is that the cost of the additional `take` is small. Working on benchmarks now. |
||
) | ||
|
||
blocks = [ | ||
self.make_block_same_class( | ||
self.values.take(indices, allow_fill=True, | ||
fill_value=fill_value), | ||
[place]) | ||
for indices, place in zip(new_values.T, new_placement) | ||
] | ||
return blocks, mask | ||
|
||
|
||
class NumericBlock(Block): | ||
__slots__ = () | ||
|
Uh oh!
There was an error while loading. Please reload this page.