From e26e164262d54265c8186d73177c063655a3fab2 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 29 Aug 2023 07:19:54 -0400 Subject: [PATCH 1/5] ENH: ArrowExtensionArray.to_numpy to avoid object dtype when na_value provided --- pandas/core/arrays/arrow/array.py | 50 +++++++++++++++------------- pandas/tests/extension/test_arrow.py | 13 ++++++++ 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 70c59e94f6d27..87d452210c41b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -34,7 +34,6 @@ is_bool_dtype, is_integer, is_list_like, - is_object_dtype, is_scalar, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1243,48 +1242,53 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: + pa_type = self._pa_array.type + if ( + self._hasna + and na_value is not lib.no_default + and not isna(na_value) + and not pa.types.is_null(pa_type) + ): + data = self.fillna(na_value) + copy = False + else: + data = self + if dtype is not None: dtype = np.dtype(dtype) - elif self._hasna: + elif data._hasna and not (pa.types.is_floating(pa_type) and na_value is np.nan): dtype = np.dtype(object) if na_value is lib.no_default: na_value = self.dtype.na_value - pa_type = self._pa_array.type if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): - result = self._maybe_convert_datelike_array() + result = data._maybe_convert_datelike_array() if dtype is None or dtype.kind == "O": result = result.to_numpy(dtype=object, na_value=na_value) else: result = result.to_numpy(dtype=dtype) - return result elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): # convert to list of python datetime.time objects before # wrapping in ndarray - result = np.array(list(self), dtype=dtype) - elif is_object_dtype(dtype) and self._hasna: - result = np.empty(len(self), dtype=object) - mask = ~self.isna() - result[mask] = np.asarray(self[mask]._pa_array) - elif pa.types.is_null(self._pa_array.type): - fill_value = None if isna(na_value) else na_value - return np.full(len(self), fill_value=fill_value, dtype=dtype) - elif self._hasna: - data = self.fillna(na_value) + result = np.array(list(data), dtype=dtype) + if self._hasna: + result[self.isna()] = na_value + elif pa.types.is_null(pa_type): + if dtype != np.object_ and isna(na_value): + na_value = None + result = np.full(len(data), fill_value=na_value, dtype=dtype) + elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): result = data._pa_array.to_numpy() - if dtype is not None: - result = result.astype(dtype, copy=False) - return result - else: - result = self._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) if copy: result = result.copy() - return result - if self._hasna: - result[self.isna()] = na_value + else: + result = np.empty(len(data), dtype=dtype) + mask = data.isna() + result[mask] = na_value + result[~mask] = np.asarray(data[~mask]._pa_array) return result def unique(self) -> Self: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index cb79539cd2bd1..2339563e44133 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1595,6 +1595,19 @@ def test_to_numpy_null_array_no_dtype(): tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_without_dtype(): + # GH 54808 + arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]") + result = arr.to_numpy(na_value=False) + expected = np.array([True, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]") + result = arr.to_numpy(na_value=0.0) + expected = np.array([1.0, 0.0], dtype=np.float32) + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy() From 084c4c1fe13f7805237f9a555439e7afa19b0e23 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 30 Aug 2023 18:04:02 -0400 Subject: [PATCH 2/5] refactor --- pandas/core/arrays/arrow/array.py | 36 +++++++++++----------- pandas/tests/arrays/string_/test_string.py | 12 ++++++-- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d19677b541282..dfa7b6076d591 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -29,6 +29,7 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -1237,26 +1238,19 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - pa_type = self._pa_array.type - if ( - self._hasna - and na_value is not lib.no_default - and not isna(na_value) - and not pa.types.is_null(pa_type) - ): - data = self.fillna(na_value) - copy = False - else: - data = self - if dtype is not None: dtype = np.dtype(dtype) - elif data._hasna and not (pa.types.is_floating(pa_type) and na_value is np.nan): - dtype = np.dtype(object) if na_value is lib.no_default: na_value = self.dtype.na_value + pa_type = self._pa_array.type + if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): + data = self + else: + data = self.fillna(na_value) + copy = False + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): result = data._maybe_convert_datelike_array() if dtype is None or dtype.kind == "O": @@ -1267,10 +1261,10 @@ def to_numpy( # convert to list of python datetime.time objects before # wrapping in ndarray result = np.array(list(data), dtype=dtype) - if self._hasna: - result[self.isna()] = na_value + if data._hasna: + result[data.isna()] = na_value elif pa.types.is_null(pa_type): - if dtype != np.object_ and isna(na_value): + if dtype is not None and dtype.kind != "O" and isna(na_value): na_value = None result = np.full(len(data), fill_value=na_value, dtype=dtype) elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): @@ -1280,10 +1274,16 @@ def to_numpy( if copy: result = result.copy() else: + if dtype is None: + empty = data._pa_array[:0].to_numpy() + if can_hold_element(empty, na_value): + dtype = empty.dtype + else: + dtype = np.object_ result = np.empty(len(data), dtype=dtype) mask = data.isna() result[mask] = na_value - result[~mask] = np.asarray(data[~mask]._pa_array) + result[~mask] = data[~mask]._pa_array.to_numpy() return result def unique(self) -> Self: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 24d8e43708b91..89cc31ec5ecc8 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -377,8 +377,16 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" - with pytest.raises(TypeError, match=msg): + if dtype.storage == "pyarrow_numpy": + err = ValueError + msg = "cannot convert float NaN to integer" + else: + err = TypeError + msg = ( + r"int\(\) argument must be a string, a bytes-like " + r"object or a( real)? number" + ) + with pytest.raises(err, match=msg): arr.astype("int64") From 609b755b9b8dc481a29376c84f40cce5381cca01 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 31 Aug 2023 21:45:52 -0400 Subject: [PATCH 3/5] cleanup --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index dfa7b6076d591..e857174c21cf8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1264,7 +1264,7 @@ def to_numpy( if data._hasna: result[data.isna()] = na_value elif pa.types.is_null(pa_type): - if dtype is not None and dtype.kind != "O" and isna(na_value): + if dtype is not None and isna(na_value): na_value = None result = np.full(len(data), fill_value=na_value, dtype=dtype) elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): @@ -1275,7 +1275,7 @@ def to_numpy( result = result.copy() else: if dtype is None: - empty = data._pa_array[:0].to_numpy() + empty = self.dtype.empty(0)._pa_array.to_numpy() if can_hold_element(empty, na_value): dtype = empty.dtype else: From 0eb9ba00d10a3d8d0104a129d648d041d45e9bac Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 31 Aug 2023 22:36:48 -0400 Subject: [PATCH 4/5] mypy --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e857174c21cf8..d8fa0a27e4a5d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1275,7 +1275,7 @@ def to_numpy( result = result.copy() else: if dtype is None: - empty = self.dtype.empty(0)._pa_array.to_numpy() + empty = pa.array([], type=pa_type).to_numpy() if can_hold_element(empty, na_value): dtype = empty.dtype else: From 3dca86b967ae0d95c05b1f56bbfadd94727b81d4 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 1 Sep 2023 04:37:01 -0400 Subject: [PATCH 5/5] fix --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d8fa0a27e4a5d..4d887ecd1510f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1275,7 +1275,7 @@ def to_numpy( result = result.copy() else: if dtype is None: - empty = pa.array([], type=pa_type).to_numpy() + empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False) if can_hold_element(empty, na_value): dtype = empty.dtype else: