Convert to compatible NumPy dtype for MaskedArray to_numpy (#55058)

phofl · mroeschke · web-flow · commit 2a0e253688a4 · 2023-12-01T11:34:48.000-08:00
* Convert masked arrays to valid numpy dtype

* Convert ea to appropriate numpy dtype

* Fix typing

* Add whatsnew

* Fix test

* Try map implementation

* Add comment

* Try

* Fix

* Update

* Update doc/source/whatsnew/v2.2.0.rst

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -105,6 +105,37 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
 Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
 documentation.
 
+.. _whatsnew_220.enhancements.to_numpy_ea:
+
+ExtensionArray.to_numpy converts to suitable NumPy dtype
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`ExtensionArray.to_numpy` will now convert to a suitable NumPy dtype instead
+of ``object`` dtype for nullable extension dtypes.
+
+*Old behavior:*
+
+.. code-block:: ipython
+
+    In [1]: ser = pd.Series([1, 2, 3], dtype="Int64")
+    In [2]: ser.to_numpy()
+    Out[2]: array([1, 2, 3], dtype=object)
+
+*New behavior:*
+
+.. ipython:: python
+
+    ser = pd.Series([1, 2, 3], dtype="Int64")
+    ser.to_numpy()
+
+The default NumPy dtype (without any arguments) is determined as follows:
+
+- float dtypes are cast to NumPy floats
+- integer dtypes without missing values are cast to NumPy integer dtypes
+- integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as missing value indicator
+- boolean dtypes without missing values are cast to NumPy bool dtype
+- boolean dtypes with missing values keep object dtype
+
 .. _whatsnew_220.enhancements.struct_accessor:
 
 Series.struct accessor to with PyArrow structured data
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1477,7 +1477,7 @@ def _maybe_upcast(
         import pyarrow as pa
         if isinstance(arr, IntegerArray) and arr.isna().all():
             # use null instead of int64 in pyarrow
-            arr = arr.to_numpy()
+            arr = arr.to_numpy(na_value=None)
         arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
 
     return arr
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -38,11 +38,15 @@
     IS64,
     is_platform_windows,
 )
-from pandas.errors import AbstractMethodError
+from pandas.errors import (
+    AbstractMethodError,
+    LossySetitemError,
+)
 from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.cast import np_can_hold_element
 from pandas.core.dtypes.common import (
     is_bool,
     is_integer_dtype,
@@ -69,6 +73,7 @@
 from pandas.core.algorithms import (
     factorize_array,
     isin,
+    map_array,
     mode,
     take,
 )
@@ -473,13 +478,35 @@ def to_numpy(
         >>> a.to_numpy(dtype="bool", na_value=False)
         array([ True, False, False])
         """
-        if na_value is lib.no_default:
-            na_value = libmissing.NA
+        hasna = self._hasna
+
         if dtype is None:
-            dtype = object
+            dtype_given = False
+            if hasna:
+                if self.dtype.kind == "b":
+                    dtype = object
+                else:
+                    if self.dtype.kind in "iu":
+                        dtype = np.dtype(np.float64)
+                    else:
+                        dtype = self.dtype.numpy_dtype
+                    if na_value is lib.no_default:
+                        na_value = np.nan
+            else:
+                dtype = self.dtype.numpy_dtype
         else:
             dtype = np.dtype(dtype)
-        if self._hasna:
+            dtype_given = True
+        if na_value is lib.no_default:
+            na_value = libmissing.NA
+
+        if not dtype_given and hasna:
+            try:
+                np_can_hold_element(dtype, na_value)  # type: ignore[arg-type]
+            except LossySetitemError:
+                dtype = object
+
+        if hasna:
             if (
                 dtype != object
                 and not is_string_dtype(dtype)
@@ -506,7 +533,7 @@ def tolist(self):
         if self.ndim > 1:
             return [x.tolist() for x in self]
         dtype = None if self._hasna else self._data.dtype
-        return self.to_numpy(dtype=dtype).tolist()
+        return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist()
 
     @overload
     def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
@@ -1300,6 +1327,9 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         )
         return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis)
 
+    def map(self, mapper, na_action=None):  # na_action is passed on to map_array
+        return map_array(self.to_numpy(), mapper, na_action=na_action)
+
     def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         """
         Return whether any element is truthy.
diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py
@@ -9,10 +9,17 @@
 
 import numpy as np
 
+from pandas._libs import (
+    lib,
+    missing as libmissing,
+)
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.cast import maybe_box_native
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    BaseMaskedDtype,
+    ExtensionDtype,
+)
 
 from pandas.core import common as com
 
@@ -150,6 +157,10 @@ def to_dict(
         for i, col_dtype in enumerate(df.dtypes.values)
         if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
     ]
+    box_na_values = [
+        lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
+        for i, col_dtype in enumerate(df.dtypes.values)
+    ]
     are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
 
     if orient == "dict":
@@ -160,7 +171,11 @@ def to_dict(
         return into_c(
             (
                 k,
-                list(map(maybe_box_native, v.to_numpy().tolist()))
+                list(
+                    map(
+                        maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist()
+                    )
+                )
                 if i in object_dtype_indices_as_set
                 else v.to_numpy().tolist(),
             )
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -61,6 +61,7 @@
 )
 
 from pandas.core.arrays import (
+    BaseMaskedArray,
     Categorical,
     DatetimeArray,
     ExtensionArray,
@@ -1527,6 +1528,8 @@ def _format_strings(self) -> list[str]:
         if isinstance(values, Categorical):
             # Categorical is special for now, so that we can preserve tzinfo
             array = values._internal_get_values()
+        elif isinstance(values, BaseMaskedArray):
+            array = values.to_numpy(dtype=object)
         else:
             array = np.asarray(values)
 
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -63,6 +63,7 @@
 from pandas.core import algorithms
 from pandas.core.arrays import (
     ArrowExtensionArray,
+    BaseMaskedArray,
     BooleanArray,
     Categorical,
     ExtensionArray,
@@ -762,8 +763,15 @@ def _infer_types(
             pa = import_optional_dependency("pyarrow")
             if isinstance(result, np.ndarray):
                 result = ArrowExtensionArray(pa.array(result, from_pandas=True))
+            elif isinstance(result, BaseMaskedArray):
+                if result._mask.all():
+                    # We want an arrow null array here
+                    result = ArrowExtensionArray(pa.array([None] * len(result)))
+                else:
+                    result = ArrowExtensionArray(
+                        pa.array(result._data, mask=result._mask)
+                    )
             else:
-                # ExtensionArray
                 result = ArrowExtensionArray(
                     pa.array(result.to_numpy(), from_pandas=True)
                 )
diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py
@@ -223,7 +223,7 @@ def test_coerce_to_numpy_array():
     # also with no missing values -> object dtype
     arr = pd.array([True, False, True], dtype="boolean")
     result = np.array(arr)
-    expected = np.array([True, False, True], dtype="object")
+    expected = np.array([True, False, True], dtype="bool")
     tm.assert_numpy_array_equal(result, expected)
 
     # force bool dtype
@@ -263,7 +263,7 @@ def test_to_numpy(box):
     # default (with or without missing values) -> object dtype
     arr = con([True, False, True], dtype="boolean")
     result = arr.to_numpy()
-    expected = np.array([True, False, True], dtype="object")
+    expected = np.array([True, False, True], dtype="bool")
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([True, False, None], dtype="boolean")
diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py
@@ -13,12 +13,12 @@ def test_to_numpy(box):
     # default (with or without missing values) -> object dtype
     arr = con([0.1, 0.2, 0.3], dtype="Float64")
     result = arr.to_numpy()
-    expected = np.array([0.1, 0.2, 0.3], dtype="object")
+    expected = np.array([0.1, 0.2, 0.3], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([0.1, 0.2, None], dtype="Float64")
     result = arr.to_numpy()
-    expected = np.array([0.1, 0.2, pd.NA], dtype="object")
+    expected = np.array([0.1, 0.2, np.nan], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
 
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
@@ -141,7 +141,7 @@ def test_astype(all_data):
     # coerce to object
     s = pd.Series(mixed)
     result = s.astype("object")
-    expected = pd.Series(np.asarray(mixed))
+    expected = pd.Series(np.asarray(mixed, dtype=object))
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
@@ -295,7 +295,7 @@ def test_array_multiindex_raises():
             pd.core.arrays.period_array(["2000", "2001"], freq="D"),
             np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
         ),
-        (pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)),
+        (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
         (
             IntervalArray.from_breaks([0, 1, 2]),
             np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
@@ -346,10 +346,6 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
     with tm.assert_produces_warning(None):
         thing = box(arr)
 
-    if arr.dtype.name == "int64" and box is pd.array:
-        mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
-        request.applymarker(mark)
-
     result = thing.to_numpy()
     tm.assert_numpy_array_equal(result, expected)
 
diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
@@ -133,21 +133,25 @@ def test_series_array_ea_dtypes(using_copy_on_write):
         assert arr.flags.writeable is True
 
     arr = np.asarray(ser)
-    assert not np.shares_memory(arr, get_array(ser))
-    assert arr.flags.writeable is True
+    assert np.shares_memory(arr, get_array(ser))
+    if using_copy_on_write:
+        assert arr.flags.writeable is False
+    else:
+        assert arr.flags.writeable is True
 
 
 def test_dataframe_array_ea_dtypes(using_copy_on_write):
     df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
     arr = np.asarray(df, dtype="int64")
-    # TODO: This should be able to share memory, but we are roundtripping
-    # through object
-    assert not np.shares_memory(arr, get_array(df, "a"))
-    assert arr.flags.writeable is True
+    assert np.shares_memory(arr, get_array(df, "a"))
+    if using_copy_on_write:
+        assert arr.flags.writeable is False
+    else:
+        assert arr.flags.writeable is True
 
     arr = np.asarray(df)
+    assert np.shares_memory(arr, get_array(df, "a"))
     if using_copy_on_write:
-        # TODO(CoW): This should be True
         assert arr.flags.writeable is False
     else:
         assert arr.flags.writeable is True
diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py
@@ -169,6 +169,16 @@ def data_for_grouping(dtype):
 
 
 class TestMaskedArrays(base.ExtensionTests):
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data_missing, na_action):
+        result = data_missing.map(lambda x: x, na_action=na_action)
+        if data_missing.dtype == Float32Dtype():
+            # map roundtrips through objects, which converts to float64
+            expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
+        else:
+            expected = data_missing.to_numpy()
+        tm.assert_numpy_array_equal(result, expected)
+
     def _get_expected_exception(self, op_name, obj, other):
         try:
             dtype = tm.get_dtype(obj)
diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
@@ -717,15 +717,12 @@ def test_where_ea_other(self):
 
         # TODO: ideally we would get Int64 instead of object
         result = df.where(mask, ser, axis=0)
-        expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object)
+        expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]})
         tm.assert_frame_equal(result, expected)
 
         ser2 = Series(arr[:2], index=["A", "B"])
-        expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]})
-        expected["B"] = expected["B"].astype(object)
-        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = df.where(mask, ser2, axis=1)
+        expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]})
+        result = df.where(mask, ser2, axis=1)
         tm.assert_frame_equal(result, expected)
 
     def test_where_interval_noop(self):
diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py
@@ -456,6 +456,20 @@ def test_to_records_with_inf_record(self):
                 result = repr(df)
         assert result == expected
 
+    def test_masked_ea_with_formatter(self):
+        # GH#39336
+        df = DataFrame(
+            {
+                "a": Series([0.123456789, 1.123456789], dtype="Float64"),
+                "b": Series([1, 2], dtype="Int64"),
+            }
+        )
+        result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format])
+        expected = """      a     b
+0  0.12  1.00
+1  1.12  2.00"""
+        assert result == expected
+
     def test_repr_ea_columns(self, any_string_dtype):
         # GH#54797
         pytest.importorskip("pyarrow")