Skip to content

BUG: Categorical with non-nano dt64 #38791

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
coerce_indexer_dtype,
maybe_cast_to_extension_array,
maybe_infer_to_datetimelike,
sanitize_to_nanoseconds,
)
from pandas.core.dtypes.common import (
ensure_int64,
Expand Down Expand Up @@ -366,6 +367,9 @@ def __init__(
values = [values[idx] for idx in np.where(~null_mask)[0]]
values = sanitize_array(values, None, dtype=sanitize_dtype)

else:
values = sanitize_to_nanoseconds(values)

if dtype.categories is None:
try:
codes, categories = factorize(values, sort=True)
Expand Down
22 changes: 15 additions & 7 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,13 +1521,7 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]):
# catch a datetime/timedelta that is not of ns variety
# and no coercion specified
if is_array and value.dtype.kind in ["M", "m"]:
dtype = value.dtype

if dtype.kind == "M" and dtype != DT64NS_DTYPE:
value = conversion.ensure_datetime64ns(value)

elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
value = conversion.ensure_timedelta64ns(value)
value = sanitize_to_nanoseconds(value)

# only do this if we have an array and the dtype of the array is not
# setup already we are not an integer/object, so don't bother with this
Expand All @@ -1543,6 +1537,20 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]):
return value


def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray:
    """
    Coerce datetime64/timedelta64 arrays that are not nanosecond-based to
    their nanosecond dtype.

    Parameters
    ----------
    values : np.ndarray
        Array whose dtype is inspected.

    Returns
    -------
    np.ndarray
        The result of ``conversion.ensure_datetime64ns`` for a datetime64
        array whose dtype differs from ``DT64NS_DTYPE``, the result of
        ``conversion.ensure_timedelta64ns`` for a timedelta64 array whose
        dtype differs from ``TD64NS_DTYPE``, and otherwise the very same
        ``values`` object that was passed in.
    """
    kind = values.dtype.kind

    if kind == "M":
        # datetime64: only convert when the unit is something other than ns
        if values.dtype != DT64NS_DTYPE:
            return conversion.ensure_datetime64ns(values)
        return values

    if kind == "m":
        # timedelta64: only convert when the unit is something other than ns
        if values.dtype != TD64NS_DTYPE:
            return conversion.ensure_timedelta64ns(values)

    # already nanosecond, or not a datetime-like dtype at all: pass through
    return values


def find_common_type(types: List[DtypeObj]) -> DtypeObj:
"""
Find a common data type among the given dtypes.
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

from pandas.compat import IS64, is_platform_windows

from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
from pandas.core.dtypes.dtypes import CategoricalDtype

Expand Down Expand Up @@ -723,3 +725,14 @@ def test_from_sequence_copy(self):
result = Categorical._from_sequence(cat, dtype=None, copy=True)

assert not np.shares_memory(result._codes, cat._codes)

@pytest.mark.xfail(
    not IS64 or is_platform_windows(),
    reason="Incorrectly raising in ensure_datetime64ns",
)
def test_constructor_datetime64_non_nano(self):
    """
    Build a Categorical from day-unit datetime64 values and categories and
    check that it compares equal, element-wise, to the values it was built
    from.
    """
    # ten integers reinterpreted as day-unit datetime64 serve as categories
    day_categories = np.arange(10).view("M8[D]")
    # every second category, copied so it does not alias the categories
    every_other = day_categories[::2].copy()

    result = Categorical(every_other, categories=day_categories)

    matches = result == every_other
    assert matches.all()
108 changes: 80 additions & 28 deletions pandas/tests/series/methods/test_drop_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,72 +67,124 @@ def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):


class TestSeriesDropDuplicates:
@pytest.mark.parametrize(
"dtype",
["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"],
@pytest.fixture(
params=["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"]
)
def test_drop_duplicates_categorical_non_bool(self, dtype, ordered):
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
def dtype(self, request):
return request.param

@pytest.fixture
def cat_series1(self, dtype, ordered):
# Test case 1
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered))
if dtype == "datetime64[D]":
# pre-empty flaky xfail, tc1 values are seemingly-random
if not (np.array(tc1) == input1).all():
pytest.xfail(reason="GH#7996")
cat = Categorical(input1, categories=cat_array, ordered=ordered)
tc1 = Series(cat)
return tc1

def test_drop_duplicates_categorical_non_bool(self, cat_series1):
tc1 = cat_series1

expected = Series([False, False, False, True])
tm.assert_series_equal(tc1.duplicated(), expected)
tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])

result = tc1.duplicated()
tm.assert_series_equal(result, expected)

result = tc1.drop_duplicates()
tm.assert_series_equal(result, tc1[~expected])

sc = tc1.copy()
return_value = sc.drop_duplicates(inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])

def test_drop_duplicates_categorical_non_bool_keeplast(self, cat_series1):
tc1 = cat_series1

expected = Series([False, False, True, False])
tm.assert_series_equal(tc1.duplicated(keep="last"), expected)
tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected])

result = tc1.duplicated(keep="last")
tm.assert_series_equal(result, expected)

result = tc1.drop_duplicates(keep="last")
tm.assert_series_equal(result, tc1[~expected])

sc = tc1.copy()
return_value = sc.drop_duplicates(keep="last", inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])

def test_drop_duplicates_categorical_non_bool_keepfalse(self, cat_series1):
tc1 = cat_series1

expected = Series([False, False, True, True])
tm.assert_series_equal(tc1.duplicated(keep=False), expected)
tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])

result = tc1.duplicated(keep=False)
tm.assert_series_equal(result, expected)

result = tc1.drop_duplicates(keep=False)
tm.assert_series_equal(result, tc1[~expected])

sc = tc1.copy()
return_value = sc.drop_duplicates(keep=False, inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])

# Test case 2
@pytest.fixture
def cat_series2(self, dtype, ordered):
# Test case 2; TODO: better name
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered))
if dtype == "datetime64[D]":
# pre-empty flaky xfail, tc2 values are seemingly-random
if not (np.array(tc2) == input2).all():
pytest.xfail(reason="GH#7996")
cat = Categorical(input2, categories=cat_array, ordered=ordered)
tc2 = Series(cat)
return tc2

def test_drop_duplicates_categorical_non_bool2(self, cat_series2):
# Test case 2; TODO: better name
tc2 = cat_series2

expected = Series([False, False, False, False, True, True, False])
tm.assert_series_equal(tc2.duplicated(), expected)
tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])

result = tc2.duplicated()
tm.assert_series_equal(result, expected)

result = tc2.drop_duplicates()
tm.assert_series_equal(result, tc2[~expected])

sc = tc2.copy()
return_value = sc.drop_duplicates(inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])

def test_drop_duplicates_categorical_non_bool2_keeplast(self, cat_series2):
tc2 = cat_series2

expected = Series([False, True, True, False, False, False, False])
tm.assert_series_equal(tc2.duplicated(keep="last"), expected)
tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected])

result = tc2.duplicated(keep="last")
tm.assert_series_equal(result, expected)

result = tc2.drop_duplicates(keep="last")
tm.assert_series_equal(result, tc2[~expected])

sc = tc2.copy()
return_value = sc.drop_duplicates(keep="last", inplace=True)
assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])

def test_drop_duplicates_categorical_non_bool2_keepfalse(self, cat_series2):
tc2 = cat_series2

expected = Series([False, True, True, False, True, True, False])
tm.assert_series_equal(tc2.duplicated(keep=False), expected)
tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])

result = tc2.duplicated(keep=False)
tm.assert_series_equal(result, expected)

result = tc2.drop_duplicates(keep=False)
tm.assert_series_equal(result, tc2[~expected])

sc = tc2.copy()
return_value = sc.drop_duplicates(keep=False, inplace=True)
assert return_value is None
Expand Down