Skip to content

Commit df24c10

Browse files
BUG: fix IntegerArray astype with copy=True/False (#34931)
* BUG: fix IntegerArray astype with copy=True/False
* fix mypy
* return self for same dtype and copy=False
* whatsnew
1 parent f5d7213 commit df24c10

File tree

4 files changed

+62
-8
lines changed

4 files changed

+62
-8
lines changed

doc/source/whatsnew/v1.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,7 @@ ExtensionArray
11431143
- Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`)
11441144
- Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`)
11451145
- Fixed bug where ``DataFrame(columns=.., dtype='string')`` would fail (:issue:`27953`, :issue:`33623`)
1146+
- Fixed bug in ``IntegerArray.astype`` where the mask was not copied along with the data (:issue:`34931`)
11461147

11471148
Other
11481149
^^^^^

pandas/core/arrays/integer.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -448,18 +448,22 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
448448
if incompatible type with an IntegerDtype, equivalent of same_kind
449449
casting
450450
"""
451-
from pandas.core.arrays.boolean import BooleanDtype
451+
from pandas.core.arrays.masked import BaseMaskedDtype
452452
from pandas.core.arrays.string_ import StringDtype
453453

454454
dtype = pandas_dtype(dtype)
455455

456-
# if we are astyping to an existing IntegerDtype we can fastpath
457-
if isinstance(dtype, _IntegerDtype):
458-
result = self._data.astype(dtype.numpy_dtype, copy=False)
459-
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
460-
elif isinstance(dtype, BooleanDtype):
461-
result = self._data.astype("bool", copy=False)
462-
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
456+
# if the dtype is exactly the same, we can fastpath
457+
if self.dtype == dtype:
458+
# return the same object for copy=False
459+
return self.copy() if copy else self
460+
# if we are astyping to another nullable masked dtype, we can fastpath
461+
if isinstance(dtype, BaseMaskedDtype):
462+
data = self._data.astype(dtype.numpy_dtype, copy=copy)
463+
# mask is copied depending on whether the data was copied, and
464+
# not directly depending on the `copy` keyword
465+
mask = self._mask if data is self._data else self._mask.copy()
466+
return dtype.construct_array_type()(data, mask, copy=False)
463467
elif isinstance(dtype, StringDtype):
464468
return dtype.construct_array_type()._from_sequence(self, copy=False)
465469

pandas/core/arrays/masked.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@ class BaseMaskedDtype(ExtensionDtype):
4040
def numpy_dtype(self) -> np.dtype:
4141
raise AbstractMethodError
4242

43+
@classmethod
44+
def construct_array_type(cls) -> Type["BaseMaskedArray"]:
45+
"""
46+
Return the array type associated with this dtype.
47+
48+
Returns
49+
-------
50+
type
51+
"""
52+
raise NotImplementedError
53+
4354

4455
class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
4556
"""

pandas/tests/arrays/integer/test_dtypes.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,44 @@ def test_astype(all_data):
144144
tm.assert_series_equal(result, expected)
145145

146146

147+
def test_astype_copy():
148+
arr = pd.array([1, 2, 3, None], dtype="Int64")
149+
orig = pd.array([1, 2, 3, None], dtype="Int64")
150+
151+
# copy=True -> ensure both data and mask are actual copies
152+
result = arr.astype("Int64", copy=True)
153+
assert result is not arr
154+
assert not np.shares_memory(result._data, arr._data)
155+
assert not np.shares_memory(result._mask, arr._mask)
156+
result[0] = 10
157+
tm.assert_extension_array_equal(arr, orig)
158+
result[0] = pd.NA
159+
tm.assert_extension_array_equal(arr, orig)
160+
161+
# copy=False
162+
result = arr.astype("Int64", copy=False)
163+
assert result is arr
164+
assert np.shares_memory(result._data, arr._data)
165+
assert np.shares_memory(result._mask, arr._mask)
166+
result[0] = 10
167+
assert arr[0] == 10
168+
result[0] = pd.NA
169+
assert arr[0] is pd.NA
170+
171+
# astype to different dtype -> always needs a copy -> even with copy=False
172+
# we need to ensure that also the mask is actually copied
173+
arr = pd.array([1, 2, 3, None], dtype="Int64")
174+
orig = pd.array([1, 2, 3, None], dtype="Int64")
175+
176+
result = arr.astype("Int32", copy=False)
177+
assert not np.shares_memory(result._data, arr._data)
178+
assert not np.shares_memory(result._mask, arr._mask)
179+
result[0] = 10
180+
tm.assert_extension_array_equal(arr, orig)
181+
result[0] = pd.NA
182+
tm.assert_extension_array_equal(arr, orig)
183+
184+
147185
def test_astype_to_larger_numpy():
148186
a = pd.array([1, 2], dtype="Int32")
149187
result = a.astype("int64")

0 commit comments

Comments
 (0)