Skip to content

Commit 2a0e253

Browse files
phofl and mroeschke authored
Convert to compatible NumPy dtype for MaskedArray to_numpy (#55058)
* Convert masked arrays to valid numpy dtype * Convert ea to appropriate numpy dtype * Fix typing * Add whatsnew * Fix test * Try map implementation * Add comment * Try * Fix * Update * Update doc/source/whatsnew/v2.2.0.rst Co-authored-by: Matthew Roeschke <[email protected]> --------- Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 65af776 commit 2a0e253

File tree

14 files changed

+141
-33
lines changed

14 files changed

+141
-33
lines changed

doc/source/whatsnew/v2.2.0.rst

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,37 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
105105
Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
106106
documentation.
107107

108+
.. _whatsnew_220.enhancements.to_numpy_ea:
109+
110+
ExtensionArray.to_numpy converts to suitable NumPy dtype
111+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
112+
113+
:meth:`ExtensionArray.to_numpy` will now convert to a suitable NumPy dtype instead
114+
of ``object`` dtype for nullable extension dtypes.
115+
116+
*Old behavior:*
117+
118+
.. code-block:: ipython
119+
120+
In [1]: ser = pd.Series([1, 2, 3], dtype="Int64")
121+
In [2]: ser.to_numpy()
122+
Out[2]: array([1, 2, 3], dtype=object)
123+
124+
*New behavior:*
125+
126+
.. ipython:: python
127+
128+
ser = pd.Series([1, 2, 3], dtype="Int64")
129+
ser.to_numpy()
130+
131+
The default NumPy dtype (without any arguments) is determined as follows:
132+
133+
- float dtypes are cast to NumPy floats
134+
- integer dtypes without missing values are cast to NumPy integer dtypes
135+
- integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as missing value indicator
136+
- boolean dtypes without missing values are cast to NumPy bool dtype
137+
- boolean dtypes with missing values keep object dtype
138+
108139
.. _whatsnew_220.enhancements.struct_accessor:
109140

110141
Series.struct accessor for PyArrow structured data

pandas/_libs/parsers.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1477,7 +1477,7 @@ def _maybe_upcast(
14771477
import pyarrow as pa
14781478
if isinstance(arr, IntegerArray) and arr.isna().all():
14791479
# use null instead of int64 in pyarrow
1480-
arr = arr.to_numpy()
1480+
arr = arr.to_numpy(na_value=None)
14811481
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
14821482

14831483
return arr

pandas/core/arrays/masked.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,15 @@
3838
IS64,
3939
is_platform_windows,
4040
)
41-
from pandas.errors import AbstractMethodError
41+
from pandas.errors import (
42+
AbstractMethodError,
43+
LossySetitemError,
44+
)
4245
from pandas.util._decorators import doc
4346
from pandas.util._validators import validate_fillna_kwargs
4447

4548
from pandas.core.dtypes.base import ExtensionDtype
49+
from pandas.core.dtypes.cast import np_can_hold_element
4650
from pandas.core.dtypes.common import (
4751
is_bool,
4852
is_integer_dtype,
@@ -69,6 +73,7 @@
6973
from pandas.core.algorithms import (
7074
factorize_array,
7175
isin,
76+
map_array,
7277
mode,
7378
take,
7479
)
@@ -473,13 +478,35 @@ def to_numpy(
473478
>>> a.to_numpy(dtype="bool", na_value=False)
474479
array([ True, False, False])
475480
"""
476-
if na_value is lib.no_default:
477-
na_value = libmissing.NA
481+
hasna = self._hasna
482+
478483
if dtype is None:
479-
dtype = object
484+
dtype_given = False
485+
if hasna:
486+
if self.dtype.kind == "b":
487+
dtype = object
488+
else:
489+
if self.dtype.kind in "iu":
490+
dtype = np.dtype(np.float64)
491+
else:
492+
dtype = self.dtype.numpy_dtype
493+
if na_value is lib.no_default:
494+
na_value = np.nan
495+
else:
496+
dtype = self.dtype.numpy_dtype
480497
else:
481498
dtype = np.dtype(dtype)
482-
if self._hasna:
499+
dtype_given = True
500+
if na_value is lib.no_default:
501+
na_value = libmissing.NA
502+
503+
if not dtype_given and hasna:
504+
try:
505+
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
506+
except LossySetitemError:
507+
dtype = object
508+
509+
if hasna:
483510
if (
484511
dtype != object
485512
and not is_string_dtype(dtype)
@@ -506,7 +533,7 @@ def tolist(self):
506533
if self.ndim > 1:
507534
return [x.tolist() for x in self]
508535
dtype = None if self._hasna else self._data.dtype
509-
return self.to_numpy(dtype=dtype).tolist()
536+
return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist()
510537

511538
@overload
512539
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
@@ -1300,6 +1327,9 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
13001327
)
13011328
return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis)
13021329

1330+
def map(self, mapper, na_action=None):
1331+
return map_array(self.to_numpy(), mapper, na_action=None)
1332+
13031333
def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
13041334
"""
13051335
Return whether any element is truthy.

pandas/core/methods/to_dict.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,17 @@
99

1010
import numpy as np
1111

12+
from pandas._libs import (
13+
lib,
14+
missing as libmissing,
15+
)
1216
from pandas.util._exceptions import find_stack_level
1317

1418
from pandas.core.dtypes.cast import maybe_box_native
15-
from pandas.core.dtypes.dtypes import ExtensionDtype
19+
from pandas.core.dtypes.dtypes import (
20+
BaseMaskedDtype,
21+
ExtensionDtype,
22+
)
1623

1724
from pandas.core import common as com
1825

@@ -150,6 +157,10 @@ def to_dict(
150157
for i, col_dtype in enumerate(df.dtypes.values)
151158
if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
152159
]
160+
box_na_values = [
161+
lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
162+
for i, col_dtype in enumerate(df.dtypes.values)
163+
]
153164
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
154165

155166
if orient == "dict":
@@ -160,7 +171,11 @@ def to_dict(
160171
return into_c(
161172
(
162173
k,
163-
list(map(maybe_box_native, v.to_numpy().tolist()))
174+
list(
175+
map(
176+
maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist()
177+
)
178+
)
164179
if i in object_dtype_indices_as_set
165180
else v.to_numpy().tolist(),
166181
)

pandas/io/formats/format.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
)
6262

6363
from pandas.core.arrays import (
64+
BaseMaskedArray,
6465
Categorical,
6566
DatetimeArray,
6667
ExtensionArray,
@@ -1527,6 +1528,8 @@ def _format_strings(self) -> list[str]:
15271528
if isinstance(values, Categorical):
15281529
# Categorical is special for now, so that we can preserve tzinfo
15291530
array = values._internal_get_values()
1531+
elif isinstance(values, BaseMaskedArray):
1532+
array = values.to_numpy(dtype=object)
15301533
else:
15311534
array = np.asarray(values)
15321535

pandas/io/parsers/base_parser.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
from pandas.core import algorithms
6464
from pandas.core.arrays import (
6565
ArrowExtensionArray,
66+
BaseMaskedArray,
6667
BooleanArray,
6768
Categorical,
6869
ExtensionArray,
@@ -762,8 +763,15 @@ def _infer_types(
762763
pa = import_optional_dependency("pyarrow")
763764
if isinstance(result, np.ndarray):
764765
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
766+
elif isinstance(result, BaseMaskedArray):
767+
if result._mask.all():
768+
# We want an arrow null array here
769+
result = ArrowExtensionArray(pa.array([None] * len(result)))
770+
else:
771+
result = ArrowExtensionArray(
772+
pa.array(result._data, mask=result._mask)
773+
)
765774
else:
766-
# ExtensionArray
767775
result = ArrowExtensionArray(
768776
pa.array(result.to_numpy(), from_pandas=True)
769777
)

pandas/tests/arrays/boolean/test_construction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def test_coerce_to_numpy_array():
223223
# with no missing values -> bool dtype
224224
arr = pd.array([True, False, True], dtype="boolean")
225225
result = np.array(arr)
226-
expected = np.array([True, False, True], dtype="object")
226+
expected = np.array([True, False, True], dtype="bool")
227227
tm.assert_numpy_array_equal(result, expected)
228228

229229
# force bool dtype
@@ -263,7 +263,7 @@ def test_to_numpy(box):
263263
# default: without missing values -> bool dtype, with missing values -> object dtype
264264
arr = con([True, False, True], dtype="boolean")
265265
result = arr.to_numpy()
266-
expected = np.array([True, False, True], dtype="object")
266+
expected = np.array([True, False, True], dtype="bool")
267267
tm.assert_numpy_array_equal(result, expected)
268268

269269
arr = con([True, False, None], dtype="boolean")

pandas/tests/arrays/floating/test_to_numpy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@ def test_to_numpy(box):
1313
# default (with or without missing values) -> float64 dtype
1414
arr = con([0.1, 0.2, 0.3], dtype="Float64")
1515
result = arr.to_numpy()
16-
expected = np.array([0.1, 0.2, 0.3], dtype="object")
16+
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
1717
tm.assert_numpy_array_equal(result, expected)
1818

1919
arr = con([0.1, 0.2, None], dtype="Float64")
2020
result = arr.to_numpy()
21-
expected = np.array([0.1, 0.2, pd.NA], dtype="object")
21+
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
2222
tm.assert_numpy_array_equal(result, expected)
2323

2424

pandas/tests/arrays/integer/test_dtypes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def test_astype(all_data):
141141
# coerce to object
142142
s = pd.Series(mixed)
143143
result = s.astype("object")
144-
expected = pd.Series(np.asarray(mixed))
144+
expected = pd.Series(np.asarray(mixed, dtype=object))
145145
tm.assert_series_equal(result, expected)
146146

147147

pandas/tests/base/test_conversion.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ def test_array_multiindex_raises():
295295
pd.core.arrays.period_array(["2000", "2001"], freq="D"),
296296
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
297297
),
298-
(pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)),
298+
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
299299
(
300300
IntervalArray.from_breaks([0, 1, 2]),
301301
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
@@ -346,10 +346,6 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
346346
with tm.assert_produces_warning(None):
347347
thing = box(arr)
348348

349-
if arr.dtype.name == "int64" and box is pd.array:
350-
mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
351-
request.applymarker(mark)
352-
353349
result = thing.to_numpy()
354350
tm.assert_numpy_array_equal(result, expected)
355351

pandas/tests/copy_view/test_array.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,21 +133,25 @@ def test_series_array_ea_dtypes(using_copy_on_write):
133133
assert arr.flags.writeable is True
134134

135135
arr = np.asarray(ser)
136-
assert not np.shares_memory(arr, get_array(ser))
137-
assert arr.flags.writeable is True
136+
assert np.shares_memory(arr, get_array(ser))
137+
if using_copy_on_write:
138+
assert arr.flags.writeable is False
139+
else:
140+
assert arr.flags.writeable is True
138141

139142

140143
def test_dataframe_array_ea_dtypes(using_copy_on_write):
141144
df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
142145
arr = np.asarray(df, dtype="int64")
143-
# TODO: This should be able to share memory, but we are roundtripping
144-
# through object
145-
assert not np.shares_memory(arr, get_array(df, "a"))
146-
assert arr.flags.writeable is True
146+
assert np.shares_memory(arr, get_array(df, "a"))
147+
if using_copy_on_write:
148+
assert arr.flags.writeable is False
149+
else:
150+
assert arr.flags.writeable is True
147151

148152
arr = np.asarray(df)
153+
assert np.shares_memory(arr, get_array(df, "a"))
149154
if using_copy_on_write:
150-
# TODO(CoW): This should be True
151155
assert arr.flags.writeable is False
152156
else:
153157
assert arr.flags.writeable is True

pandas/tests/extension/test_masked.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,16 @@ def data_for_grouping(dtype):
169169

170170

171171
class TestMaskedArrays(base.ExtensionTests):
172+
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
    """Check that an identity ``map`` matches the array's ``to_numpy`` output.

    For ``Float32Dtype`` the comparison target is the float64 conversion
    with ``NaN`` as the missing-value marker; for every other dtype it is
    the default ``to_numpy()`` result.
    """
    is_float32 = data_missing.dtype == Float32Dtype()
    # map roundtrips through objects, which converts to float64
    conversion_kwargs = {"dtype": "float64", "na_value": np.nan} if is_float32 else {}
    expected = data_missing.to_numpy(**conversion_kwargs)

    result = data_missing.map(lambda x: x, na_action=na_action)
    tm.assert_numpy_array_equal(result, expected)
181+
172182
def _get_expected_exception(self, op_name, obj, other):
173183
try:
174184
dtype = tm.get_dtype(obj)

pandas/tests/frame/indexing/test_where.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -717,15 +717,12 @@ def test_where_ea_other(self):
717717

718718
# TODO: ideally we would get Int64 instead of float64
719719
result = df.where(mask, ser, axis=0)
720-
expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object)
720+
expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]})
721721
tm.assert_frame_equal(result, expected)
722722

723723
ser2 = Series(arr[:2], index=["A", "B"])
724-
expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]})
725-
expected["B"] = expected["B"].astype(object)
726-
msg = "Downcasting behavior in Series and DataFrame methods 'where'"
727-
with tm.assert_produces_warning(FutureWarning, match=msg):
728-
result = df.where(mask, ser2, axis=1)
724+
expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]})
725+
result = df.where(mask, ser2, axis=1)
729726
tm.assert_frame_equal(result, expected)
730727

731728
def test_where_interval_noop(self):

pandas/tests/frame/test_repr.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,20 @@ def test_to_records_with_inf_record(self):
456456
result = repr(df)
457457
assert result == expected
458458

459+
def test_masked_ea_with_formatter(self):
460+
# GH#39336
461+
df = DataFrame(
462+
{
463+
"a": Series([0.123456789, 1.123456789], dtype="Float64"),
464+
"b": Series([1, 2], dtype="Int64"),
465+
}
466+
)
467+
result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format])
468+
expected = """ a b
469+
0 0.12 1.00
470+
1 1.12 2.00"""
471+
assert result == expected
472+
459473
def test_repr_ea_columns(self, any_string_dtype):
460474
# GH#54797
461475
pytest.importorskip("pyarrow")

0 commit comments

Comments
 (0)