From d59dcd302817a292e68f7276b97c253c5708c487 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 11:50:13 +0100 Subject: [PATCH 1/6] Set correct missing value indicator in astype for categorical --- pandas/core/arrays/categorical.py | 10 ++++------ pandas/tests/arrays/categorical/test_dtypes.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0ce7e0fbfb80a..ea038de457245 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -108,7 +108,6 @@ ) import pandas.core.common as com from pandas.core.construction import ( - ensure_wrapped_if_datetimelike, extract_array, sanitize_array, ) @@ -539,14 +538,13 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array - if is_datetime64_dtype(self.categories): - new_cats = ensure_wrapped_if_datetimelike(self.categories._values) - else: - new_cats = np.asarray(self.categories) + new_cats = self.categories._values try: new_cats = new_cats.astype(dtype=dtype, copy=copy) - fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) + fill_value = lib.item_from_zerodim( + np.array(self.categories._na_value).astype(dtype) + ) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 8733bfccd9f9d..38998d89593db 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -182,7 +182,7 @@ def test_astype_object_datetime_categories(self): # GH#40754 cat = Categorical(to_datetime(["2021-03-27", NaT])) result = cat.astype(object) - expected = np.array([Timestamp("2021-03-27 00:00:00"), np.nan], dtype="object") + expected = np.array([Timestamp("2021-03-27 00:00:00"), NaT], dtype="object") tm.assert_numpy_array_equal(result, expected) def test_astype_object_timestamp_categories(self): From f4e8c9b6d51ead5422a1eb73726687122bd1a186 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 12:53:50 +0100 Subject: [PATCH 2/6] Try casting twice --- pandas/core/arrays/categorical.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ea038de457245..10c06e18c98d0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -542,9 +542,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: try: new_cats = new_cats.astype(dtype=dtype, copy=copy) - fill_value = lib.item_from_zerodim( - np.array(self.categories._na_value).astype(dtype) - ) + try: + fill_value = lib.item_from_zerodim( + np.array(self.categories._na_value).astype(dtype) + ) + except ValueError: + fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, From c21fdcf2edad542acc0cc70a78d36298feeb4cd8 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 12:57:17 +0100 Subject: [PATCH 3/6] Check if fill value is valid --- pandas/core/arrays/categorical.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 10c06e18c98d0..c9a5b9e6cec67 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -542,12 +542,11 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: try: new_cats = new_cats.astype(dtype=dtype, copy=copy) - try: + fill_value = self.categories._na_value + if not is_valid_na_for_dtype(fill_value, dtype): fill_value = lib.item_from_zerodim( np.array(self.categories._na_value).astype(dtype) ) - except ValueError: - fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, From afacccb4e037c1bb020639b51495e0939dd695c4 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 13:55:50 +0100 Subject: [PATCH 4/6] Catch np string dtype --- pandas/core/dtypes/missing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4e3306e84c1a1..2f2c4617061a6 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -640,6 +640,9 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: # Numeric return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64)) + elif dtype == np.dtype(str): + return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float)) + elif dtype == np.dtype("object"): # This is needed for Categorical, but is kind of weird return True From 7b5619f45d5bd369d1c39df8cfffe15698e8a435 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 28 Dec 2021 20:31:34 +0100 Subject: [PATCH 5/6] Add comment --- pandas/core/dtypes/missing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 2f2c4617061a6..2f6c7f7612b83 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -641,6 +641,7 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64)) elif dtype == np.dtype(str): + # numpy string dtypes to avoid float np.nan return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float)) elif dtype == np.dtype("object"): From 3991976576e5f1d43bbd8648c2c55e7b5cd39c7b Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 28 Dec 2021 20:32:42 +0100 Subject: [PATCH 6/6] Add dtypes to module level --- pandas/core/dtypes/missing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 2f6c7f7612b83..c854eb1ea322a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -56,6 +56,8 @@ nan_checker = np.isnan INF_AS_NA = False +_dtype_object = np.dtype("object") +_dtype_str = np.dtype(str) def isna(obj): @@ -640,11 +642,11 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: # Numeric return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64)) - elif dtype == np.dtype(str): + elif dtype == _dtype_str: # numpy string dtypes to avoid float np.nan return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float)) - elif dtype == np.dtype("object"): + elif dtype == _dtype_object: # This is needed for Categorical, but is kind of weird return True