From 03b8b47c482a7538589ea11ffebfb5b25621be32 Mon Sep 17 00:00:00 2001 From: Simon Boehm Date: Mon, 15 Mar 2021 19:06:08 +0100 Subject: [PATCH 1/8] Fix StringArray.astype for category dtype --- pandas/core/arrays/string_.py | 5 ++++- pandas/tests/series/methods/test_astype.py | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 67cd6c63c1faa..ff2141605a7bf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -36,10 +36,12 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions from pandas.core.arrays import ( + Categorical, FloatingArray, IntegerArray, PandasArray, ) +from pandas.core.arrays.categorical import CategoricalDtype from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -307,7 +309,6 @@ def __setitem__(self, key, value): def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) - if is_dtype_equal(dtype, self.dtype): if copy: return self.copy() @@ -327,6 +328,8 @@ def astype(self, dtype, copy=True): arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return FloatingArray(values, mask, copy=False) + elif isinstance(dtype, CategoricalDtype): + return Categorical(self, dtype=dtype, copy=copy) elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4e068690c41e5..31cd641aa2561 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -470,6 +470,13 @@ def test_astype_categories_raises(self): with pytest.raises(TypeError, match="got an unexpected"): s.astype("category", categories=["a", "b"], ordered=True) + def test_astype_str_to_categorical(self): + # GH-40351 + s = Series(["A", np.NaN], dtype="string") + result = 
s.astype("category") + expected = Series(["A", np.NaN], dtype="category") + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]]) def test_astype_from_categorical(self, items): ser = Series(items) From 333e24b25e945ddf0ac54d6b6469aada9aa72a6d Mon Sep 17 00:00:00 2001 From: Simon Boehm Date: Tue, 16 Mar 2021 07:49:28 +0100 Subject: [PATCH 2/8] Fix str to ExtensionDtype conversion --- pandas/core/arrays/string_.py | 7 +++---- pandas/tests/series/methods/test_astype.py | 7 ++++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ff2141605a7bf..8e0f319552923 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -36,12 +36,10 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions from pandas.core.arrays import ( - Categorical, FloatingArray, IntegerArray, PandasArray, ) -from pandas.core.arrays.categorical import CategoricalDtype from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -328,8 +326,9 @@ def astype(self, dtype, copy=True): arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return FloatingArray(values, mask, copy=False) - elif isinstance(dtype, CategoricalDtype): - return Categorical(self, dtype=dtype, copy=copy) + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(self, dtype=dtype, copy=copy) elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 31cd641aa2561..e46aa6cc8763f 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -470,13 +470,18 @@ def test_astype_categories_raises(self): with 
pytest.raises(TypeError, match="got an unexpected"): s.astype("category", categories=["a", "b"], ordered=True) - def test_astype_str_to_categorical(self): + def test_astype_str_to_extension_dtype(self): # GH-40351 s = Series(["A", np.NaN], dtype="string") result = s.astype("category") expected = Series(["A", np.NaN], dtype="category") tm.assert_series_equal(result, expected) + s = Series(["1/1/2021", "2/1/2021"], dtype="string") + result = s.astype("period[M]") + expected = Series(["1/1/2021", "2/1/2021"], dtype="period[M]") + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]]) def test_astype_from_categorical(self, items): ser = Series(items) From c29c9dd0f3f2282e166147cef45cf832729cac54 Mon Sep 17 00:00:00 2001 From: Simon Boehm Date: Wed, 17 Mar 2021 17:55:25 +0100 Subject: [PATCH 3/8] Use is_extension_array_dtype, add more tests --- pandas/core/arrays/string_.py | 3 +- pandas/tests/series/methods/test_astype.py | 33 ++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8e0f319552923..ea8d0fcb0bd30 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -27,6 +27,7 @@ is_array_like, is_bool_dtype, is_dtype_equal, + is_extension_array_dtype, is_integer_dtype, is_object_dtype, is_string_dtype, @@ -326,7 +327,7 @@ def astype(self, dtype, copy=True): arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return FloatingArray(values, mask, copy=False) - elif isinstance(dtype, ExtensionDtype): + elif is_extension_array_dtype(dtype): cls = dtype.construct_array_type() return cls._from_sequence(self, dtype=dtype, copy=copy) elif np.issubdtype(dtype, np.floating): diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index e46aa6cc8763f..6615e11356444 100644 --- a/pandas/tests/series/methods/test_astype.py +++ 
b/pandas/tests/series/methods/test_astype.py @@ -350,6 +350,39 @@ def test_astype_bytes(self): assert result.dtypes == np.dtype("S3") +class TestAstypeString: + @pytest.mark.parametrize( + "data, dtype", + [ + (["A", NA], "category"), + (["2020-10-10", "2020-10-10"], "datetime64[ns]"), + (["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"), + ( + ["2012-01-01 00:00:00-05:00", NaT], + "datetime64[ns, US/Eastern]", + ), + ([1, None], "UInt16"), + (["1/1/2021", "2/1/2021"], "period[M]"), + (["1/1/2021", "2/1/2021", NaT], "period[M]"), + (["1 Day", "59 Days", NaT], "timedelta64[ns]"), + # currently no way to parse BooleanArray, IntervalArray from a + # list of strings + ], + ) + def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): + if dtype in ("timedelta64[ns]"): + mark = pytest.mark.xfail(reason="TODO fix is_extension_array_dtype GH40478") + request.node.add_marker(mark) + if NaT in data and dtype in ("period[M]", "datetime64[ns]"): + mark = pytest.mark.xfail( + reason="TODO StringArray.astype() None to dtype.na_value conversion" + ) + request.node.add_marker(mark) + # GH-40351 + s = Series(data, dtype=dtype) + tm.assert_series_equal(s, s.astype("string").astype(dtype)) + + class TestAstypeCategorical: def test_astype_categorical_to_other(self): cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) From 94a8b580ebaed8a2d2c8dc81b3eb1d5333ccd1d1 Mon Sep 17 00:00:00 2001 From: Simon Boehm Date: Sun, 21 Mar 2021 19:27:24 +0100 Subject: [PATCH 4/8] String.astype() uses _from_sequence_of_strings --- pandas/core/arrays/categorical.py | 6 ++++++ pandas/core/arrays/datetimes.py | 9 ++++++++ pandas/core/arrays/period.py | 4 +++- pandas/core/arrays/string_.py | 22 ++++++++++++++----- pandas/core/arrays/timedeltas.py | 13 ++++++++++- pandas/tests/series/methods/test_astype.py | 25 +++------------------- 6 files changed, 50 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/categorical.py 
b/pandas/core/arrays/categorical.py index 769ae52744c74..eb918f754c6f5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -478,6 +478,12 @@ def _constructor(self) -> Type[Categorical]: def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): return Categorical(scalars, dtype=dtype, copy=copy) + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Optional[Dtype] = None, copy=False + ): + return cls._from_sequence(scalars=strings, dtype=dtype, copy=copy) + def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index c0a8c20832fa8..8a6fd90bc59ba 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -38,6 +38,7 @@ to_offset, tzconversion, ) +from pandas._typing import Dtype from pandas.errors import PerformanceWarning from pandas.core.dtypes.cast import astype_dt64_to_dt64tz @@ -65,6 +66,7 @@ from pandas.core.dtypes.missing import isna from pandas.core.algorithms import checked_add_with_arr +from pandas.core.api import NA from pandas.core.arrays import ( ExtensionArray, datetimelike as dtl, @@ -334,6 +336,13 @@ def _simple_new( def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Optional[Dtype] = None, copy=False + ): + scalars = [NaT if s is NA else s for s in strings] + return cls._from_sequence(scalars, dtype=dtype, copy=copy) + @classmethod def _from_sequence_not_strict( cls, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d91522a9e1bb6..8a02528d18401 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -73,6 +73,7 @@ ) import pandas.core.algorithms as algos +from pandas.core.api import NA from pandas.core.arrays 
import datetimelike as dtl import pandas.core.common as com @@ -252,7 +253,8 @@ def _from_sequence( def _from_sequence_of_strings( cls, strings, *, dtype: Optional[Dtype] = None, copy=False ) -> PeriodArray: - return cls._from_sequence(strings, dtype=dtype, copy=copy) + scalars = [NaT if s is NA else s for s in strings] + return cls._from_sequence(scalars, dtype=dtype, copy=copy) @classmethod def _from_datetime64(cls, data, freq, tz=None) -> PeriodArray: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ea8d0fcb0bd30..c74ba25054780 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -26,23 +26,29 @@ from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_datetime64_any_dtype, is_dtype_equal, is_extension_array_dtype, is_integer_dtype, is_object_dtype, is_string_dtype, + is_timedelta64_dtype, pandas_dtype, ) from pandas.core import ops from pandas.core.array_algos import masked_reductions -from pandas.core.arrays import ( +from pandas.core.arrays import PandasArray +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.floating import ( FloatingArray, + FloatingDtype, +) +from pandas.core.arrays.integer import ( IntegerArray, - PandasArray, + _IntegerDtype, ) -from pandas.core.arrays.floating import FloatingDtype -from pandas.core.arrays.integer import _IntegerDtype +from pandas.core.arrays.timedeltas import TimedeltaArray from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna @@ -329,7 +335,13 @@ def astype(self, dtype, copy=True): return FloatingArray(values, mask, copy=False) elif is_extension_array_dtype(dtype): cls = dtype.construct_array_type() - return cls._from_sequence(self, dtype=dtype, copy=copy) + return cls._from_sequence_of_strings(self, dtype=dtype, copy=copy) + elif is_datetime64_any_dtype(dtype): + return DatetimeArray._from_sequence_of_strings(self, 
dtype=dtype, copy=copy) + elif is_timedelta64_dtype(dtype): + return TimedeltaArray._from_sequence_of_strings( + self, dtype=dtype, copy=copy + ) elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f3889ff360aa8..b3106f0d30d96 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -35,7 +35,10 @@ ints_to_pytimedelta, parse_timedelta_unit, ) -from pandas._typing import NpDtype +from pandas._typing import ( + Dtype, + NpDtype, +) from pandas.compat.numpy import function as nv from pandas.core.dtypes.cast import astype_td64_unit_conversion @@ -60,6 +63,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr +from pandas.core.api import NA from pandas.core.arrays import ( ExtensionArray, IntegerArray, @@ -254,6 +258,13 @@ def _from_sequence( return cls._simple_new(data, freq=freq) + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Optional[Dtype] = None, copy=False + ): + scalars = [NaT if s is NA else s for s in strings] + return cls._from_sequence(scalars, dtype=dtype, copy=copy) + @classmethod def _from_sequence_not_strict( cls, diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 6615e11356444..2cd11f381792b 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -354,6 +354,8 @@ class TestAstypeString: @pytest.mark.parametrize( "data, dtype", [ + ([True, False, NA], "boolean"), + # GH-40351 (["A", NA], "category"), (["2020-10-10", "2020-10-10"], "datetime64[ns]"), (["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"), @@ -365,19 +367,10 @@ class TestAstypeString: (["1/1/2021", "2/1/2021"], "period[M]"), (["1/1/2021", "2/1/2021", NaT], "period[M]"), (["1 Day", "59 Days", NaT], "timedelta64[ns]"), - # currently no way to parse BooleanArray, 
IntervalArray from a - # list of strings + # currently no way to parse IntervalArray from strings ], ) def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): - if dtype in ("timedelta64[ns]"): - mark = pytest.mark.xfail(reason="TODO fix is_extension_array_dtype GH40478") - request.node.add_marker(mark) - if NaT in data and dtype in ("period[M]", "datetime64[ns]"): - mark = pytest.mark.xfail( - reason="TODO StringArray.astype() None to dtype.na_value conversion" - ) - request.node.add_marker(mark) # GH-40351 s = Series(data, dtype=dtype) tm.assert_series_equal(s, s.astype("string").astype(dtype)) @@ -503,18 +496,6 @@ def test_astype_categories_raises(self): with pytest.raises(TypeError, match="got an unexpected"): s.astype("category", categories=["a", "b"], ordered=True) - def test_astype_str_to_extension_dtype(self): - # GH-40351 - s = Series(["A", np.NaN], dtype="string") - result = s.astype("category") - expected = Series(["A", np.NaN], dtype="category") - tm.assert_series_equal(result, expected) - - s = Series(["1/1/2021", "2/1/2021"], dtype="string") - result = s.astype("period[M]") - expected = Series(["1/1/2021", "2/1/2021"], dtype="period[M]") - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]]) def test_astype_from_categorical(self, items): ser = Series(items) From f71afa2fba7c09baa2433a8cf5b9ea42fd091f5b Mon Sep 17 00:00:00 2001 From: Simon Boehm Date: Mon, 22 Mar 2021 08:33:28 +0100 Subject: [PATCH 5/8] Revert "String.astype() uses _from_sequence_of_strings" This reverts commit 94a8b580ebaed8a2d2c8dc81b3eb1d5333ccd1d1. 
--- pandas/core/arrays/categorical.py | 6 ------ pandas/core/arrays/datetimes.py | 9 -------- pandas/core/arrays/period.py | 4 +--- pandas/core/arrays/string_.py | 22 +++++-------------- pandas/core/arrays/timedeltas.py | 13 +---------- pandas/tests/series/methods/test_astype.py | 25 +++++++++++++++++++--- 6 files changed, 29 insertions(+), 50 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eb918f754c6f5..769ae52744c74 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -478,12 +478,6 @@ def _constructor(self) -> Type[Categorical]: def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): return Categorical(scalars, dtype=dtype, copy=copy) - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False - ): - return cls._from_sequence(scalars=strings, dtype=dtype, copy=copy) - def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8a6fd90bc59ba..c0a8c20832fa8 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -38,7 +38,6 @@ to_offset, tzconversion, ) -from pandas._typing import Dtype from pandas.errors import PerformanceWarning from pandas.core.dtypes.cast import astype_dt64_to_dt64tz @@ -66,7 +65,6 @@ from pandas.core.dtypes.missing import isna from pandas.core.algorithms import checked_add_with_arr -from pandas.core.api import NA from pandas.core.arrays import ( ExtensionArray, datetimelike as dtl, @@ -336,13 +334,6 @@ def _simple_new( def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False - ): - scalars = [NaT if s is NA else s for s in strings] - 
return cls._from_sequence(scalars, dtype=dtype, copy=copy) - @classmethod def _from_sequence_not_strict( cls, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8a02528d18401..d91522a9e1bb6 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -73,7 +73,6 @@ ) import pandas.core.algorithms as algos -from pandas.core.api import NA from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com @@ -253,8 +252,7 @@ def _from_sequence( def _from_sequence_of_strings( cls, strings, *, dtype: Optional[Dtype] = None, copy=False ) -> PeriodArray: - scalars = [NaT if s is NA else s for s in strings] - return cls._from_sequence(scalars, dtype=dtype, copy=copy) + return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod def _from_datetime64(cls, data, freq, tz=None) -> PeriodArray: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c74ba25054780..ea8d0fcb0bd30 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -26,29 +26,23 @@ from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, - is_datetime64_any_dtype, is_dtype_equal, is_extension_array_dtype, is_integer_dtype, is_object_dtype, is_string_dtype, - is_timedelta64_dtype, pandas_dtype, ) from pandas.core import ops from pandas.core.array_algos import masked_reductions -from pandas.core.arrays import PandasArray -from pandas.core.arrays.datetimes import DatetimeArray -from pandas.core.arrays.floating import ( +from pandas.core.arrays import ( FloatingArray, - FloatingDtype, -) -from pandas.core.arrays.integer import ( IntegerArray, - _IntegerDtype, + PandasArray, ) -from pandas.core.arrays.timedeltas import TimedeltaArray +from pandas.core.arrays.floating import FloatingDtype +from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import 
isna @@ -335,13 +329,7 @@ def astype(self, dtype, copy=True): return FloatingArray(values, mask, copy=False) elif is_extension_array_dtype(dtype): cls = dtype.construct_array_type() - return cls._from_sequence_of_strings(self, dtype=dtype, copy=copy) - elif is_datetime64_any_dtype(dtype): - return DatetimeArray._from_sequence_of_strings(self, dtype=dtype, copy=copy) - elif is_timedelta64_dtype(dtype): - return TimedeltaArray._from_sequence_of_strings( - self, dtype=dtype, copy=copy - ) + return cls._from_sequence(self, dtype=dtype, copy=copy) elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b3106f0d30d96..f3889ff360aa8 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -35,10 +35,7 @@ ints_to_pytimedelta, parse_timedelta_unit, ) -from pandas._typing import ( - Dtype, - NpDtype, -) +from pandas._typing import NpDtype from pandas.compat.numpy import function as nv from pandas.core.dtypes.cast import astype_td64_unit_conversion @@ -63,7 +60,6 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr -from pandas.core.api import NA from pandas.core.arrays import ( ExtensionArray, IntegerArray, @@ -258,13 +254,6 @@ def _from_sequence( return cls._simple_new(data, freq=freq) - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False - ): - scalars = [NaT if s is NA else s for s in strings] - return cls._from_sequence(scalars, dtype=dtype, copy=copy) - @classmethod def _from_sequence_not_strict( cls, diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 2cd11f381792b..6615e11356444 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -354,8 +354,6 @@ class TestAstypeString: @pytest.mark.parametrize( "data, dtype", [ - ([True, 
False, NA], "boolean"), - # GH-40351 (["A", NA], "category"), (["2020-10-10", "2020-10-10"], "datetime64[ns]"), (["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"), @@ -367,10 +365,19 @@ class TestAstypeString: (["1/1/2021", "2/1/2021"], "period[M]"), (["1/1/2021", "2/1/2021", NaT], "period[M]"), (["1 Day", "59 Days", NaT], "timedelta64[ns]"), - # currently no way to parse IntervalArray from strings + # currently no way to parse BooleanArray, IntervalArray from a + # list of strings ], ) def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): + if dtype in ("timedelta64[ns]"): + mark = pytest.mark.xfail(reason="TODO fix is_extension_array_dtype GH40478") + request.node.add_marker(mark) + if NaT in data and dtype in ("period[M]", "datetime64[ns]"): + mark = pytest.mark.xfail( + reason="TODO StringArray.astype() None to dtype.na_value conversion" + ) + request.node.add_marker(mark) # GH-40351 s = Series(data, dtype=dtype) tm.assert_series_equal(s, s.astype("string").astype(dtype)) @@ -496,6 +503,18 @@ def test_astype_categories_raises(self): with pytest.raises(TypeError, match="got an unexpected"): s.astype("category", categories=["a", "b"], ordered=True) + def test_astype_str_to_extension_dtype(self): + # GH-40351 + s = Series(["A", np.NaN], dtype="string") + result = s.astype("category") + expected = Series(["A", np.NaN], dtype="category") + tm.assert_series_equal(result, expected) + + s = Series(["1/1/2021", "2/1/2021"], dtype="string") + result = s.astype("period[M]") + expected = Series(["1/1/2021", "2/1/2021"], dtype="period[M]") + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]]) def test_astype_from_categorical(self, items): ser = Series(items) From d8ff71657b4d1c7c18c2d47ee6e16bdee7f613ab Mon Sep 17 00:00:00 2001 From: Simon Boehm Date: Mon, 22 Mar 2021 08:33:28 +0100 Subject: [PATCH 6/8] Add Xfailing Boolean roundtrip --- pandas/core/arrays/string_.py | 3 +-- 
pandas/tests/series/methods/test_astype.py | 13 ++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ea8d0fcb0bd30..8e0f319552923 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -27,7 +27,6 @@ is_array_like, is_bool_dtype, is_dtype_equal, - is_extension_array_dtype, is_integer_dtype, is_object_dtype, is_string_dtype, @@ -327,7 +326,7 @@ def astype(self, dtype, copy=True): arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return FloatingArray(values, mask, copy=False) - elif is_extension_array_dtype(dtype): + elif isinstance(dtype, ExtensionDtype): cls = dtype.construct_array_type() return cls._from_sequence(self, dtype=dtype, copy=copy) elif np.issubdtype(dtype, np.floating): diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 6615e11356444..ad5c3bcc18ddb 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -354,6 +354,7 @@ class TestAstypeString: @pytest.mark.parametrize( "data, dtype", [ + ([True, NA], "boolean"), (["A", NA], "category"), (["2020-10-10", "2020-10-10"], "datetime64[ns]"), (["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"), @@ -365,17 +366,15 @@ class TestAstypeString: (["1/1/2021", "2/1/2021"], "period[M]"), (["1/1/2021", "2/1/2021", NaT], "period[M]"), (["1 Day", "59 Days", NaT], "timedelta64[ns]"), - # currently no way to parse BooleanArray, IntervalArray from a - # list of strings + # currently no way to parse IntervalArray from a list of strings ], ) def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): - if dtype in ("timedelta64[ns]"): - mark = pytest.mark.xfail(reason="TODO fix is_extension_array_dtype GH40478") - request.node.add_marker(mark) - if NaT in data and dtype in ("period[M]", "datetime64[ns]"): + if dtype == "boolean" or ( + dtype in ("period[M]", "datetime64[ns]", 
"timedelta64[ns]") and NaT in data + ): mark = pytest.mark.xfail( - reason="TODO StringArray.astype() None to dtype.na_value conversion" + reason="TODO StringArray.astype() with missing values #GH40566" ) request.node.add_marker(mark) # GH-40351 From 857d2e22511fa0848b5d2c8da14add58781fa0e2 Mon Sep 17 00:00:00 2001 From: Simon Boehm Date: Sat, 27 Mar 2021 11:09:58 +0100 Subject: [PATCH 7/8] Remove duplicate test --- pandas/core/arrays/string_.py | 1 + pandas/tests/series/methods/test_astype.py | 12 ------------ 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8e0f319552923..73cf6f39513c5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -307,6 +307,7 @@ def __setitem__(self, key, value): def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) + if is_dtype_equal(dtype, self.dtype): if copy: return self.copy() diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ad5c3bcc18ddb..d23c44733949a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -502,18 +502,6 @@ def test_astype_categories_raises(self): with pytest.raises(TypeError, match="got an unexpected"): s.astype("category", categories=["a", "b"], ordered=True) - def test_astype_str_to_extension_dtype(self): - # GH-40351 - s = Series(["A", np.NaN], dtype="string") - result = s.astype("category") - expected = Series(["A", np.NaN], dtype="category") - tm.assert_series_equal(result, expected) - - s = Series(["1/1/2021", "2/1/2021"], dtype="string") - result = s.astype("period[M]") - expected = Series(["1/1/2021", "2/1/2021"], dtype="period[M]") - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]]) def test_astype_from_categorical(self, items): ser = Series(items) From d9bb52dd417ef8f1987afea922dffc57724f4ecd Mon Sep 17 00:00:00 
2001 From: Simon Boehm Date: Mon, 29 Mar 2021 20:20:13 +0200 Subject: [PATCH 8/8] add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1e723493a4cc8..745a52bba3f93 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -546,6 +546,7 @@ Conversion - Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`) - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`) - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`) +- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='category'`` (:issue:`40450`) Strings ^^^^^^^