diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1bb5556663c29..4b29663adda23 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -30,6 +30,7 @@ coerce_indexer_dtype, maybe_cast_to_extension_array, maybe_infer_to_datetimelike, + sanitize_to_nanoseconds, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -366,6 +367,9 @@ def __init__( values = [values[idx] for idx in np.where(~null_mask)[0]] values = sanitize_array(values, None, dtype=sanitize_dtype) + else: + values = sanitize_to_nanoseconds(values) + if dtype.categories is None: try: codes, categories = factorize(values, sort=True) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 25259093f9fba..08e193acdf5ea 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1521,13 +1521,7 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): # catch a datetime/timedelta that is not of ns variety # and no coercion specified if is_array and value.dtype.kind in ["M", "m"]: - dtype = value.dtype - - if dtype.kind == "M" and dtype != DT64NS_DTYPE: - value = conversion.ensure_datetime64ns(value) - - elif dtype.kind == "m" and dtype != TD64NS_DTYPE: - value = conversion.ensure_timedelta64ns(value) + value = sanitize_to_nanoseconds(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this @@ -1543,6 +1537,20 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): return value +def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray: + """ + Safely convert non-nanosecond datetime64 or timedelta64 values to nanosecond. + """ + dtype = values.dtype + if dtype.kind == "M" and dtype != DT64NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + + elif dtype.kind == "m" and dtype != TD64NS_DTYPE: + values = conversion.ensure_timedelta64ns(values) + + return values + + def find_common_type(types: List[DtypeObj]) -> DtypeObj: """ Find a common data type among the given dtypes. diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 924a20c7e6490..556f8c24f2ab1 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import IS64, is_platform_windows + from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -723,3 +725,14 @@ def test_from_sequence_copy(self): result = Categorical._from_sequence(cat, dtype=None, copy=True) assert not np.shares_memory(result._codes, cat._codes) + + @pytest.mark.xfail( + not IS64 or is_platform_windows(), + reason="Incorrectly raising in ensure_datetime64ns", + ) + def test_constructor_datetime64_non_nano(self): + categories = np.arange(10).view("M8[D]") + values = categories[::2].copy() + + cat = Categorical(values, categories=categories) + assert (cat == values).all() diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 6eb0e09f12658..fe4bcb44d5e61 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -67,72 +67,124 @@ def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): class TestSeriesDropDuplicates: - @pytest.mark.parametrize( - "dtype", - ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], + @pytest.fixture( + params=["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"] ) - def test_drop_duplicates_categorical_non_bool(self, dtype, ordered): - cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + def dtype(self, request): + return request.param + @pytest.fixture + def cat_series1(self, dtype, ordered): # Test case 1 + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) - tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc1 values are seemingly-random - if not (np.array(tc1) == input1).all(): - pytest.xfail(reason="GH#7996") + cat = Categorical(input1, categories=cat_array, ordered=ordered) + tc1 = Series(cat) + return tc1 + + def test_drop_duplicates_categorical_non_bool(self, cat_series1): + tc1 = cat_series1 expected = Series([False, False, False, True]) - tm.assert_series_equal(tc1.duplicated(), expected) - tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) + + result = tc1.duplicated() + tm.assert_series_equal(result, expected) + + result = tc1.drop_duplicates() + tm.assert_series_equal(result, tc1[~expected]) + sc = tc1.copy() return_value = sc.drop_duplicates(inplace=True) assert return_value is None tm.assert_series_equal(sc, tc1[~expected]) + def test_drop_duplicates_categorical_non_bool_keeplast(self, cat_series1): + tc1 = cat_series1 + expected = Series([False, False, True, False]) - tm.assert_series_equal(tc1.duplicated(keep="last"), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) + + result = tc1.duplicated(keep="last") + tm.assert_series_equal(result, expected) + + result = tc1.drop_duplicates(keep="last") + tm.assert_series_equal(result, tc1[~expected]) + sc = tc1.copy() return_value = sc.drop_duplicates(keep="last", inplace=True) assert return_value is None tm.assert_series_equal(sc, tc1[~expected]) + def test_drop_duplicates_categorical_non_bool_keepfalse(self, cat_series1): + tc1 = cat_series1 + expected = Series([False, False, True, True]) - tm.assert_series_equal(tc1.duplicated(keep=False), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) + + result = tc1.duplicated(keep=False) + tm.assert_series_equal(result, expected) + + result = tc1.drop_duplicates(keep=False) + tm.assert_series_equal(result, tc1[~expected]) + sc = tc1.copy() return_value = sc.drop_duplicates(keep=False, inplace=True) assert return_value is None tm.assert_series_equal(sc, tc1[~expected]) - # Test case 2 + @pytest.fixture + def cat_series2(self, dtype, ordered): + # Test case 2; TODO: better name + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc2 values are seemingly-random - if not (np.array(tc2) == input2).all(): - pytest.xfail(reason="GH#7996") + cat = Categorical(input2, categories=cat_array, ordered=ordered) + tc2 = Series(cat) + return tc2 + + def test_drop_duplicates_categorical_non_bool2(self, cat_series2): + # Test case 2; TODO: better name + tc2 = cat_series2 expected = Series([False, False, False, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(), expected) - tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) + + result = tc2.duplicated() + tm.assert_series_equal(result, expected) + + result = tc2.drop_duplicates() + tm.assert_series_equal(result, tc2[~expected]) + sc = tc2.copy() return_value = sc.drop_duplicates(inplace=True) assert return_value is None tm.assert_series_equal(sc, tc2[~expected]) + def test_drop_duplicates_categorical_non_bool2_keeplast(self, cat_series2): + tc2 = cat_series2 + expected = Series([False, True, True, False, False, False, False]) - tm.assert_series_equal(tc2.duplicated(keep="last"), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) + + result = tc2.duplicated(keep="last") + tm.assert_series_equal(result, expected) + + result = tc2.drop_duplicates(keep="last") + tm.assert_series_equal(result, tc2[~expected]) + sc = tc2.copy() return_value = sc.drop_duplicates(keep="last", inplace=True) assert return_value is None tm.assert_series_equal(sc, tc2[~expected]) + def test_drop_duplicates_categorical_non_bool2_keepfalse(self, cat_series2): + tc2 = cat_series2 + expected = Series([False, True, True, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(keep=False), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) + + result = tc2.duplicated(keep=False) + tm.assert_series_equal(result, expected) + + result = tc2.drop_duplicates(keep=False) + tm.assert_series_equal(result, tc2[~expected]) + sc = tc2.copy() return_value = sc.drop_duplicates(keep=False, inplace=True) assert return_value is None