-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
[ArrayManager] Test DataFrame reductions + implement ignore_failures #39719
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0644d90
77395d8
ac6117a
748d767
8dc339c
0ecd08d
c5106f0
9239046
0dcef95
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,10 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._libs import lib | ||
from pandas._libs import ( | ||
NaT, | ||
lib, | ||
) | ||
from pandas._typing import ( | ||
ArrayLike, | ||
DtypeObj, | ||
|
@@ -33,6 +36,8 @@ | |
is_dtype_equal, | ||
is_extension_array_dtype, | ||
is_numeric_dtype, | ||
is_object_dtype, | ||
is_timedelta64_ns_dtype, | ||
) | ||
from pandas.core.dtypes.dtypes import ( | ||
ExtensionDtype, | ||
|
@@ -50,7 +55,11 @@ | |
import pandas.core.algorithms as algos | ||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.arrays.sparse import SparseDtype | ||
from pandas.core.construction import extract_array | ||
from pandas.core.construction import ( | ||
ensure_wrapped_if_datetimelike, | ||
extract_array, | ||
sanitize_array, | ||
) | ||
from pandas.core.indexers import maybe_convert_indices | ||
from pandas.core.indexes.api import ( | ||
Index, | ||
|
@@ -201,18 +210,48 @@ def _verify_integrity(self) -> None: | |
def reduce( | ||
self: T, func: Callable, ignore_failures: bool = False | ||
) -> Tuple[T, np.ndarray]: | ||
# TODO this still fails because `func` assumes to work on 2D arrays | ||
# TODO implement ignore_failures | ||
assert self.ndim == 2 | ||
""" | ||
Apply reduction function column-wise, returning a single-row ArrayManager. | ||
|
||
res_arrays = [] | ||
for arr in self.arrays: | ||
res = func(arr, axis=0) | ||
res_arrays.append(np.array([res])) | ||
Parameters | ||
---------- | ||
func : reduction function | ||
ignore_failures : bool, default False | ||
Whether to drop columns where func raises TypeError. | ||
|
||
index = Index([None]) # placeholder | ||
new_mgr = type(self)(res_arrays, [index, self.items]) | ||
indexer = np.arange(self.shape[0]) | ||
Returns | ||
------- | ||
ArrayManager | ||
np.ndarray | ||
Indexer of column indices that are retained. | ||
""" | ||
result_arrays: List[np.ndarray] = [] | ||
result_indices: List[int] = [] | ||
for i, arr in enumerate(self.arrays): | ||
try: | ||
res = func(arr, axis=0) | ||
except TypeError: | ||
if not ignore_failures: | ||
raise | ||
else: | ||
# TODO NaT doesn't preserve dtype, so we need to make sure we create | ||
# a timedelta result array if the original was timedelta | ||
# what if datetime results in timedelta? (eg std) | ||
if res is NaT and is_timedelta64_ns_dtype(arr.dtype): | ||
result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]")) | ||
else: | ||
result_arrays.append(sanitize_array([res], None)) | ||
Comment on lines
+237
to
+243
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an ugly special case... But as far as I can see, it is the consequence of storing Datetime/TimedeltaArray as the 1D array for those dtypes in ArrayManager instead of the numpy ndarray version. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, yes. That would require the Manager to receive the actual op name, and not the function object. Of course, passing the op name in addition is not difficult. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I think this will be OK, but what about if [inline code lost in extraction; judging by the reply that follows, presumably the case where a datetime input reduces to a timedelta result, e.g. std]? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I know, that case is not covered (see my comment "what if datetime results in timedelta? (eg std)"). I can make the TODO more explicit. (Note that this is a bug that already exists on master for ArrayManager as well; it is not caused by the changes in this PR, it is just not yet addressed by this PR.) |
||
result_indices.append(i) | ||
|
||
index = Index._simple_new(np.array([None], dtype=object)) # placeholder | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if ignore_failures: | ||
indexer = np.array(result_indices) | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
columns = self.items[result_indices] | ||
else: | ||
indexer = np.arange(self.shape[0]) | ||
columns = self.items | ||
|
||
new_mgr = type(self)(result_arrays, [index, columns]) | ||
return new_mgr, indexer | ||
|
||
def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager: | ||
|
@@ -489,14 +528,17 @@ def _get_data_subset(self, predicate: Callable) -> ArrayManager: | |
|
||
def get_bool_data(self, copy: bool = False) -> ArrayManager: | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
Select columns that are bool-dtype. | ||
Select columns that are bool-dtype and object-dtype columns that are all-bool. | ||
|
||
Parameters | ||
---------- | ||
copy : bool, default False | ||
Whether to copy the blocks | ||
""" | ||
return self._get_data_subset(lambda arr: is_bool_dtype(arr.dtype)) | ||
return self._get_data_subset( | ||
lambda arr: is_bool_dtype(arr.dtype) | ||
or (is_object_dtype(arr.dtype) and lib.is_bool_array(arr)) | ||
) | ||
|
||
def get_numeric_data(self, copy: bool = False) -> ArrayManager: | ||
""" | ||
|
@@ -693,6 +735,10 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): | |
assert value.shape[1] == 1 | ||
value = value[0, :] | ||
|
||
# TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item | ||
# but we should avoid that and pass the proper array directly | ||
value = ensure_wrapped_if_datetimelike(value) | ||
|
||
assert isinstance(value, (np.ndarray, ExtensionArray)) | ||
assert value.ndim == 1 | ||
assert len(value) == len(self._axes[0]) | ||
|
Uh oh!
There was an error while loading. Please reload this page.