diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index d3d755b205878..9d9bf4174a3b8 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -21,6 +21,7 @@ find_common_type, ) from pandas.core.dtypes.common import ( + is_categorical_dtype, is_dtype_equal, is_sparse, ) @@ -112,6 +113,30 @@ def is_nonempty(x) -> bool: if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 + cats = [x for x in to_concat if is_categorical_dtype(x.dtype)] + if len(cats): + # TODO: Ideally this shouldn't be order-dependent + first = cats[0] + from pandas import ( + CategoricalIndex, + Index, + ) + + ci = CategoricalIndex(first) + + try: + codes = np.concatenate( + [ci._is_dtype_compat(Index._with_infer(c)).codes for c in to_concat] + ) + except TypeError: + # not all to_concat elements are among our categories (or NA) + pass + else: + cat = first._from_backing_data(codes) + if all(x.dtype.ordered for x in cats): + cat = cat.as_ordered() + return cat + if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) target_dtype = common_dtype_categorical_compat(to_concat, target_dtype) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c2bcd90ff10fb..d7a7d699e7167 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -285,7 +285,8 @@ def _is_dtype_compat(self, other) -> Categorical: "categories must match existing categories when appending" ) - return other + # TODO: this is a lot like the non-coercing constructor + return other.astype(self.dtype, copy=False) @doc(Index.astype) def astype(self, dtype: Dtype, copy: bool = True) -> Index: @@ -567,17 +568,3 @@ def map(self, mapper): """ mapped = self._values.map(mapper) return Index(mapped, name=self.name) - - def _concat(self, to_concat: list[Index], name: Hashable) -> Index: - # if calling index is category, don't check dtype of others - try: - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) - except TypeError: - # not all to_concat elements are among our categories (or NA) - from pandas.core.dtypes.concat import concat_compat - - res = concat_compat([x._values for x in to_concat]) - return Index(res, name=name) - else: - cat = self._data._from_backing_data(codes) - return type(self)._simple_new(cat, name=name) diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py index 0f09af269148b..f18aab416a2ea 100644 --- a/pandas/tests/arrays/sparse/test_combine_concat.py +++ b/pandas/tests/arrays/sparse/test_combine_concat.py @@ -53,10 +53,17 @@ def test_concat_with_non_sparse(other, expected_dtype): # https://github.com/pandas-dev/pandas/issues/34336 s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0)) - result = pd.concat([s_sparse, other], ignore_index=True) + msg = "passing a SparseArray to pd.Index" + warn = FutureWarning + if isinstance(expected_dtype, pd.SparseDtype): + warn = None + + with tm.assert_produces_warning(warn, match=msg): + result = pd.concat([s_sparse, other], ignore_index=True) expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype) tm.assert_series_equal(result, expected) - result = pd.concat([other, s_sparse], ignore_index=True) + with tm.assert_produces_warning(warn, match=msg): + result = pd.concat([other, s_sparse], ignore_index=True) expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 13bf096cfe167..3d56d13d827da 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -42,7 +42,8 @@ def test_value_counts(index_or_series_obj): @pytest.mark.parametrize("null_obj", [np.nan, None]) -def test_value_counts_null(null_obj, index_or_series_obj): +@pytest.mark.parametrize("dropna", [True, False]) +def test_value_counts_null(null_obj, dropna, index_or_series_obj): orig = index_or_series_obj obj = orig.copy() @@ -66,7 +67,11 @@ def test_value_counts_null(null_obj, index_or_series_obj): expected = Series(dict(counter.most_common()), dtype=np.int64) expected.index = expected.index.astype(obj.dtype) - result = obj.value_counts() + result = obj.value_counts(dropna=dropna) + + if not dropna: + expected[null_obj] = 3 + if obj.duplicated().any(): # TODO(GH#32514): # Order of entries with the same count is inconsistent on CI (gh-32449) @@ -76,16 +81,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): if not isinstance(result.dtype, np.dtype): # i.e IntegerDtype expected = expected.astype("Int64") - tm.assert_series_equal(result, expected) - - expected[null_obj] = 3 - result = obj.value_counts(dropna=False) - if obj.duplicated().any(): - # TODO(GH#32514): - # Order of entries with the same count is inconsistent on CI (gh-32449) - expected = expected.sort_index() - result = result.sort_index() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5e2f452009e92..82bb444d5a0ae 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -135,7 +135,7 @@ class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): class TestReshaping(BaseSparseTests, base.BaseReshapingTests): - def test_concat_mixed_dtypes(self, data): + def test_concat_mixed_dtypes(self, data, using_array_manager): # https://github.com/pandas-dev/pandas/issues/20762 # This should be the same, aside from concat([sparse, float]) df1 = pd.DataFrame({"A": data[:3]}) @@ -144,7 +144,10 @@ def test_concat_mixed_dtypes(self, data): dfs = [df1, df2, df3] # dataframes - result = pd.concat(dfs) + msg = "passing a SparseArray to pd.Index" + warn = None if using_array_manager else FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = pd.concat(dfs) expected = pd.concat( [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs] ) diff --git a/pandas/tests/indexes/categorical/test_append.py b/pandas/tests/indexes/categorical/test_append.py index b48c3219f5111..2a99f72de947b 100644 --- a/pandas/tests/indexes/categorical/test_append.py +++ b/pandas/tests/indexes/categorical/test_append.py @@ -48,9 +48,10 @@ def test_append_non_categories(self, ci): tm.assert_index_equal(result, expected, exact=True) def test_append_object(self, ci): - # GH#14298 - if base object is not categorical -> coerce to object + # GH#14298 - if base object and all entries are among + # categories -> cast to categorical (GH#41626) result = Index(["c", "a"]).append(ci) - expected = Index(list("caaabbca")) + expected = Index(list("caaabbca"), dtype=ci.dtype) tm.assert_index_equal(result, expected, exact=True) def test_append_to_another(self): diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 36bca1c2b654e..93c8a746e8e9d 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -523,18 +523,18 @@ def test_union_categorical_same_categories_different_order(self): tm.assert_series_equal(result, expected) def test_concat_categorical_coercion(self): - # GH 13524 + # GH 13524, GH#41626 - # category + not-category => not-category + # category + not-category (but all-castable/nan) => category s1 = Series([1, 2, np.nan], dtype="category") s2 = Series([2, 1, 2]) - exp = Series([1, 2, np.nan, 2, 1, 2], dtype=np.float64) + exp = Series([1, 2, np.nan, 2, 1, 2], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype - exp = Series([2, 1, 2, 1, 2, np.nan], dtype=np.float64) + exp = Series([2, 1, 2, 1, 2, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) @@ -574,31 +574,31 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) - # if normal series only contains NaN-likes => not-category + # if normal series only contains NaN-likes => category (GH#41626) s1 = Series([10, 11], dtype="category") s2 = Series([np.nan, np.nan, np.nan]) - exp = Series([10, 11, np.nan, np.nan, np.nan]) + exp = Series([10, 11, np.nan, np.nan, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) - exp = Series([np.nan, np.nan, np.nan, 10, 11]) + exp = Series([np.nan, np.nan, np.nan, 10, 11], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) def test_concat_categorical_3elem_coercion(self): - # GH 13524 + # GH 13524, GH#41626 - # mixed dtypes => not-category + # mixed dtypes, all castable to our categories => category (GH#41626) s1 = Series([1, 2, np.nan], dtype="category") s2 = Series([2, 1, 2], dtype="category") s3 = Series([1, 2, 1, 2, np.nan]) - exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") + exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp) - exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") + exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp) @@ -666,7 +666,7 @@ def test_concat_categorical_ordered(self): tm.assert_series_equal(s1._append([s2, s1], ignore_index=True), exp) def test_concat_categorical_coercion_nan(self): - # GH 13524 + # GH 13524, GH#41626 # some edge cases # category + not-category => not category @@ -677,18 +677,19 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) + # all elements of s2 are nan => category (GH#41626) s1 = Series([1, np.nan], dtype="category") s2 = Series([np.nan, np.nan]) - exp = Series([1, np.nan, np.nan, np.nan], dtype="float") + exp = Series([1, np.nan, np.nan, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) - # mixed dtype, all nan-likes => not-category + # mixed dtype, all nan-likes => category (GH#41626) s1 = Series([np.nan, np.nan], dtype="category") s2 = Series([np.nan, np.nan]) - exp = Series([np.nan, np.nan, np.nan, np.nan]) + exp = Series([np.nan, np.nan, np.nan, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) @@ -704,7 +705,7 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) def test_concat_categorical_empty(self): - # GH 13524 + # GH 13524, GH#41626 s1 = Series([], dtype="category") s2 = Series([1, 2], dtype="category") @@ -724,11 +725,11 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([], dtype="object") - # different dtype => not-category - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1._append(s2, ignore_index=True), s2) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2._append(s1, ignore_index=True), s2) + # different dtype, but all castable (bc empty) => category (GH#41626) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s1) + tm.assert_series_equal(s1._append(s2, ignore_index=True), s1) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s1) + tm.assert_series_equal(s2._append(s1, ignore_index=True), s1) s1 = Series([], dtype="category") s2 = Series([np.nan, np.nan]) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 82d2a8a2b1fd2..d5b129441e84c 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -105,7 +105,7 @@ def test_concat_empty_series_timelike(self, tz, values): ("M8[ns]", np.int64, np.object_), # categorical ("category", "category", "category"), - ("category", "object", "object"), + ("category", "object", "category"), # GH#41626 ], ) def test_concat_empty_series_dtypes(self, left, right, expected): @@ -182,12 +182,12 @@ def test_concat_empty_series_dtypes_triple(self): ) def test_concat_empty_series_dtype_category_with_array(self): - # GH#18515 + # GH#18515, GH#41626 assert ( concat( [Series(np.array([]), dtype="category"), Series(dtype="float64")] ).dtype - == "float64" + == "category" ) def test_concat_empty_series_dtypes_sparse(self):