-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
[ArrayManager] REF: Implement concat with reindexing #39612
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6cd1c4d
73d9de2
272d674
555d7ac
ebad8a4
ee495e5
19c7f75
42e1b05
724be3e
db3f0ed
a2aa388
6bdd175
910e1fe
cab90f6
c22a010
eec0161
6c69869
04ead63
427b6f4
8c10a53
ec5bd11
f0061f7
9ba8854
ad61f2f
d960619
a3c2662
0fafb1a
f67e9e2
f655e33
d21bd3a
9435c39
22ea7d2
77b05f4
81d0954
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,11 +30,13 @@ | |
) | ||
|
||
|
||
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: | ||
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yah this probably belongs in dtypes.cast (independent of this PR) |
||
""" | ||
Helper function for `arr.astype(common_dtype)` but handling all special | ||
cases. | ||
""" | ||
if is_dtype_equal(arr.dtype, dtype): | ||
return arr | ||
if ( | ||
is_categorical_dtype(arr.dtype) | ||
and isinstance(dtype, np.dtype) | ||
|
@@ -121,7 +123,7 @@ def is_nonempty(x) -> bool: | |
# for axis=0 | ||
if not single_dtype: | ||
target_dtype = find_common_type([x.dtype for x in to_concat]) | ||
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] | ||
to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat] | ||
|
||
if isinstance(to_concat[0], ExtensionArray): | ||
cls = type(to_concat[0]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,12 +18,14 @@ | |
) | ||
from pandas._typing import ( | ||
ArrayLike, | ||
DtypeObj, | ||
Hashable, | ||
) | ||
from pandas.util._validators import validate_bool_kwarg | ||
|
||
from pandas.core.dtypes.cast import ( | ||
astype_array_safe, | ||
ensure_dtype_can_hold_na, | ||
infer_dtype_from_scalar, | ||
soft_convert_objects, | ||
) | ||
|
@@ -49,6 +51,7 @@ | |
from pandas.core.dtypes.missing import ( | ||
array_equals, | ||
isna, | ||
na_value_for_dtype, | ||
) | ||
|
||
import pandas.core.algorithms as algos | ||
|
@@ -952,10 +955,18 @@ def reindex_indexer( | |
# ignored keywords | ||
consolidate: bool = True, | ||
only_slice: bool = False, | ||
# ArrayManager specific keywords | ||
use_na_proxy: bool = False, | ||
) -> T: | ||
axis = self._normalize_axis(axis) | ||
return self._reindex_indexer( | ||
new_axis, indexer, axis, fill_value, allow_dups, copy | ||
new_axis, | ||
indexer, | ||
axis, | ||
fill_value, | ||
allow_dups, | ||
copy, | ||
use_na_proxy, | ||
) | ||
|
||
def _reindex_indexer( | ||
|
@@ -966,6 +977,7 @@ def _reindex_indexer( | |
fill_value=None, | ||
allow_dups: bool = False, | ||
copy: bool = True, | ||
use_na_proxy: bool = False, | ||
) -> T: | ||
""" | ||
Parameters | ||
|
@@ -1000,7 +1012,9 @@ def _reindex_indexer( | |
new_arrays = [] | ||
for i in indexer: | ||
if i == -1: | ||
arr = self._make_na_array(fill_value=fill_value) | ||
arr = self._make_na_array( | ||
fill_value=fill_value, use_na_proxy=use_na_proxy | ||
) | ||
else: | ||
arr = self.arrays[i] | ||
new_arrays.append(arr) | ||
|
@@ -1051,7 +1065,11 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: | |
new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True | ||
) | ||
|
||
def _make_na_array(self, fill_value=None): | ||
def _make_na_array(self, fill_value=None, use_na_proxy=False): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we need ArrayLike to include NullArrayProxy? |
||
if use_na_proxy: | ||
assert fill_value is None | ||
return NullArrayProxy(self.shape_proper[0]) | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if fill_value is None: | ||
fill_value = np.nan | ||
|
||
|
@@ -1271,3 +1289,50 @@ def set_values(self, values: ArrayLike): | |
valid for the current SingleArrayManager (length, dtype, etc). | ||
""" | ||
self.arrays[0] = values | ||
|
||
|
||
class NullArrayProxy: | ||
""" | ||
Proxy object for an all-NA array. | ||
|
||
Only stores the length of the array, and not the dtype. The dtype | ||
will only be known when actually concatenating (after determining the | ||
common dtype, for which this proxy is ignored). | ||
Using this object avoids that the internals/concat.py needs to determine | ||
the proper dtype and array type. | ||
""" | ||
|
||
ndim = 1 | ||
|
||
def __init__(self, n: int): | ||
self.n = n | ||
|
||
@property | ||
def shape(self): | ||
return (self.n,) | ||
|
||
def to_array(self, dtype: DtypeObj) -> ArrayLike: | ||
""" | ||
Helper function to create the actual all-NA array from the NullArrayProxy | ||
object. | ||
|
||
Parameters | ||
---------- | ||
arr : NullArrayProxy | ||
dtype : the dtype for the resulting array | ||
|
||
Returns | ||
------- | ||
np.ndarray or ExtensionArray | ||
""" | ||
if isinstance(dtype, ExtensionDtype): | ||
empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) | ||
indexer = -np.ones(self.n, dtype=np.intp) | ||
return empty.take(indexer, allow_fill=True) | ||
else: | ||
# when introducing missing values, int becomes float, bool becomes object | ||
dtype = ensure_dtype_can_hold_na(dtype) | ||
fill_value = na_value_for_dtype(dtype) | ||
arr = np.empty(self.n, dtype=dtype) | ||
arr.fill(fill_value) | ||
return ensure_wrapped_if_datetimelike(arr) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,10 @@ | |
is_extension_array_dtype, | ||
is_sparse, | ||
) | ||
from pandas.core.dtypes.concat import concat_compat | ||
from pandas.core.dtypes.concat import ( | ||
cast_to_common_type, | ||
concat_compat, | ||
) | ||
from pandas.core.dtypes.dtypes import ExtensionDtype | ||
from pandas.core.dtypes.missing import ( | ||
is_valid_na_for_dtype, | ||
|
@@ -42,7 +45,10 @@ | |
ExtensionArray, | ||
) | ||
from pandas.core.construction import ensure_wrapped_if_datetimelike | ||
from pandas.core.internals.array_manager import ArrayManager | ||
from pandas.core.internals.array_manager import ( | ||
ArrayManager, | ||
NullArrayProxy, | ||
) | ||
from pandas.core.internals.blocks import ( | ||
ensure_block_shape, | ||
new_block, | ||
|
@@ -74,14 +80,16 @@ def _concatenate_array_managers( | |
mgrs = [] | ||
for mgr, indexers in mgrs_indexers: | ||
for ax, indexer in indexers.items(): | ||
mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True) | ||
mgr = mgr.reindex_indexer( | ||
axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True | ||
) | ||
mgrs.append(mgr) | ||
|
||
if concat_axis == 1: | ||
# concatting along the rows -> concat the reindexed arrays | ||
# TODO(ArrayManager) doesn't yet preserve the correct dtype | ||
arrays = [ | ||
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))]) | ||
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you now remove concat_compat? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and reading your comment below, why is this not in array manger if its only used there? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Because this is the code for concatting managers, which for BlockManager also resides in internals/concat.py? |
||
for j in range(len(mgrs[0].arrays)) | ||
] | ||
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) | ||
|
@@ -92,6 +100,68 @@ def _concatenate_array_managers( | |
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) | ||
|
||
|
||
def concat_arrays(to_concat: list) -> ArrayLike: | ||
""" | ||
Alternative for concat_compat but specialized for use in the ArrayManager. | ||
|
||
Differences: only deals with 1D arrays (no axis keyword), assumes | ||
ensure_wrapped_if_datetimelike and does not skip empty arrays to determine | ||
the dtype. | ||
In addition ensures that all NullArrayProxies get replaced with actual | ||
arrays. | ||
|
||
Parameters | ||
---------- | ||
to_concat : list of arrays | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the annotation is List[Any]. can that be made more specific? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In principle the more specific annotation would be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think the practice in this case is to not annotate rather than use Any, cc @simonjayhawkins There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. im hoping @simonjayhawkins will weigh in here, but my understanding of the rule of thumb is that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I thought you meant to have no annoation at all (like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that's the general consensus at the moment until we add |
||
|
||
Returns | ||
------- | ||
np.ndarray or ExtensionArray | ||
""" | ||
# ignore the all-NA proxies to determine the resulting dtype | ||
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] | ||
|
||
single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 | ||
|
||
if not single_dtype: | ||
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) | ||
else: | ||
target_dtype = to_concat_no_proxy[0].dtype | ||
|
||
if target_dtype.kind in ["m", "M"]: | ||
# for datetimelike use DatetimeArray/TimedeltaArray concatenation | ||
# don't use arr.astype(target_dtype, copy=False), because that doesn't | ||
# work for DatetimeArray/TimedeltaArray (returns ndarray) | ||
to_concat = [ | ||
arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr | ||
for arr in to_concat | ||
] | ||
return type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=0) | ||
|
||
to_concat = [ | ||
arr.to_array(target_dtype) | ||
if isinstance(arr, NullArrayProxy) | ||
else cast_to_common_type(arr, target_dtype) | ||
for arr in to_concat | ||
] | ||
|
||
if isinstance(to_concat[0], ExtensionArray): | ||
cls = type(to_concat[0]) | ||
return cls._concat_same_type(to_concat) | ||
|
||
result = np.concatenate(to_concat) | ||
|
||
# TODO decide on exact behaviour (we shouldn't do this only for empty result) | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# see https://github.com/pandas-dev/pandas/issues/39817 | ||
if len(result) == 0: | ||
# all empties -> check for bool to not coerce to float | ||
kinds = {obj.dtype.kind for obj in to_concat_no_proxy} | ||
if len(kinds) != 1: | ||
if "b" in kinds: | ||
result = result.astype(object) | ||
return result | ||
|
||
|
||
def concatenate_managers( | ||
mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool | ||
) -> Manager: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,6 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
import pandas.util._test_decorators as td | ||
|
||
import pandas as pd | ||
from pandas import ( | ||
DataFrame, | ||
|
@@ -13,9 +11,6 @@ | |
) | ||
import pandas._testing as tm | ||
|
||
# TODO td.skip_array_manager_not_yet_implemented | ||
# appending with reindexing not yet working | ||
|
||
|
||
class TestDataFrameAppend: | ||
def test_append_multiindex(self, multiindex_dataframe_random_data, frame_or_series): | ||
|
@@ -43,7 +38,6 @@ def test_append_empty_list(self): | |
tm.assert_frame_equal(result, expected) | ||
assert result is not df # .append() should return a new object | ||
|
||
@td.skip_array_manager_not_yet_implemented | ||
def test_append_series_dict(self): | ||
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) | ||
|
||
|
@@ -84,7 +78,6 @@ def test_append_series_dict(self): | |
expected = df.append(df[-1:], ignore_index=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@td.skip_array_manager_not_yet_implemented | ||
def test_append_list_of_series_dicts(self): | ||
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) | ||
|
||
|
@@ -103,7 +96,6 @@ def test_append_list_of_series_dicts(self): | |
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@td.skip_array_manager_not_yet_implemented | ||
def test_append_missing_cols(self): | ||
# GH22252 | ||
# exercise the conditional branch in append method where the data | ||
|
@@ -148,8 +140,7 @@ def test_append_empty_dataframe(self): | |
expected = df1.copy() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@td.skip_array_manager_not_yet_implemented | ||
def test_append_dtypes(self): | ||
def test_append_dtypes(self, using_array_manager): | ||
|
||
# GH 5754 | ||
# row appends of different dtypes (so need to do by-item) | ||
|
@@ -173,6 +164,10 @@ def test_append_dtypes(self): | |
expected = DataFrame( | ||
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} | ||
) | ||
if using_array_manager: | ||
# TODO(ArrayManager) decide on exact casting rules in concat | ||
# With ArrayManager, all-NaN float is not ignored | ||
expected = expected.astype(object) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) | ||
|
@@ -181,6 +176,9 @@ def test_append_dtypes(self): | |
expected = DataFrame( | ||
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} | ||
) | ||
if using_array_manager: | ||
# With ArrayManager, all-NaN float is not ignored | ||
expected = expected.astype(object) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
df1 = DataFrame({"bar": np.nan}, index=range(1)) | ||
|
@@ -189,6 +187,9 @@ def test_append_dtypes(self): | |
expected = DataFrame( | ||
{"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} | ||
) | ||
if using_array_manager: | ||
# With ArrayManager, all-NaN float is not ignored | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can the new JoinUnit.is_valid_na_for be used for these? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something similar could be used, but for now I didn't include such logic on purpose (I would like to keep the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think weve landed on the idea being to keep the behaviors matching wherever feasible There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In general where it is easy to do so, yes. But for now I kept the new implementation strictly dtype-dependent without any value-dependent behaviour. I don't think we need to mimic every special case or inconsistency of the BM in the new code, but of course in this case it depends on what behaviour we want long term. I opened #40893 for this. |
||
expected = expected.astype(object) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) | ||
|
@@ -208,7 +209,6 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): | |
expected = Series(Timestamp(timestamp, tz=tz), name=0) | ||
tm.assert_series_equal(result, expected) | ||
|
||
@td.skip_array_manager_not_yet_implemented | ||
@pytest.mark.parametrize( | ||
"data, dtype", | ||
[ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +0,0 @@ | ||
import pandas.util._test_decorators as td | ||
|
||
# TODO(ArrayManager) concat axis=0 | ||
pytestmark = td.skip_array_manager_not_yet_implemented | ||
Uh oh!
There was an error while loading. Please reload this page.