diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0dc139781f58d..675288e20d1f8 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -43,6 +43,8 @@ from pandas._libs.missing cimport ( is_matching_na, ) +from decimal import InvalidOperation + # Defines shift of MultiIndex codes to avoid negative codes (missing values) multiindex_nulls_shift = 2 @@ -248,6 +250,10 @@ cdef class IndexEngine: @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: self._do_unique_check() @@ -281,7 +287,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, InvalidOperation): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 @@ -843,6 +849,10 @@ cdef class SharedEngine: @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: arr = self.values.unique() self.unique = len(arr) == len(self.values) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e2e589440bd9..11d17066ebebd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3711,7 +3711,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5244 + 5136 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 88a08dd55f739..998c29fb3227c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3382,9 +3382,7 @@ def _union(self, other: Index, sort: bool | None): if ( sort in (None, True) - and self.is_monotonic_increasing - and other.is_monotonic_increasing - and not (self.has_duplicates and other.has_duplicates) + and (self.is_unique or other.is_unique) and self._can_use_libjoin and other._can_use_libjoin ): @@ -3536,12 +3534,7 @@ def _intersection(self, other: Index, sort: bool = False): """ intersection specialized to the case with matching dtypes. """ - if ( - self.is_monotonic_increasing - and other.is_monotonic_increasing - and self._can_use_libjoin - and other._can_use_libjoin - ): + if self._can_use_libjoin and other._can_use_libjoin: try: res_indexer, indexer, _ = self._inner_indexer(other) except TypeError: @@ -4980,7 +4973,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - # We only get here with matching dtypes and both monotonic increasing + # We only get here with (caller is responsible for ensuring): + # 1) matching dtypes + # 2) both monotonic increasing + # 3) other.is_unique or self.is_unique assert other.dtype == self.dtype assert self._can_use_libjoin and other._can_use_libjoin @@ -5062,6 +5058,10 @@ def _can_use_libjoin(self) -> bool: making a copy. If we cannot, this negates the performance benefit of using libjoin. """ + if not self.is_monotonic_increasing: + # The libjoin functions all assume monotonicity. + return False + if type(self) is Index: # excludes EAs, but include masks, we get here with monotonic # values only, meaning no NA diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6f33b18b19c51..1b322b1797144 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -75,11 +75,6 @@ def data_for_grouping(): class TestCategorical(base.ExtensionTests): - @pytest.mark.xfail(reason="Memory usage doesn't match") - def test_memory_usage(self, data): - # TODO: Is this deliberate? - super().test_memory_usage(data) - def test_contains(self, data, data_missing): # GH-37867 # na value handling in Categorical.__contains__ is deprecated.