diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index a77bdcec2ce7a..4b3c96da10efd 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -180,7 +180,7 @@ Bug Fixes - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) - Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) - +- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 7e0c094aec4c2..8d237016d1b33 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -162,9 +162,12 @@ def get_result(self): # may need to coerce categoricals here if self.is_categorical is not None: - values = [Categorical.from_array( - values[:, i], categories=self.is_categorical.categories, - ordered=True) for i in range(values.shape[-1])] + categories = self.is_categorical.categories + ordered = self.is_categorical.ordered + values = [Categorical.from_array(values[:, i], + categories=categories, + ordered=ordered) + for i in range(values.shape[-1])] return DataFrame(values, index=index, columns=columns) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 4ff0363d07df6..7f2813d5281cb 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -108,6 +108,13 @@ def compare_series_dt_tz(self, result, expected, typ, version): else: tm.assert_series_equal(result, expected) + def compare_series_cat(self, result, expected, typ, version): + # Categorical.ordered is changed in < 0.16.0 + if LooseVersion(version) < '0.16.0': + tm.assert_series_equal(result, expected, check_categorical=False) + else: + tm.assert_series_equal(result, expected) + def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): # 8260 # dtype is object < 0.17.0 @@ -117,6 +124,16 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) + def compare_frame_cat_onecol(self, result, expected, typ, version): + # Categorical.ordered is changed in < 0.16.0 + if LooseVersion(version) < '0.16.0': + tm.assert_frame_equal(result, expected, check_categorical=False) + else: + tm.assert_frame_equal(result, expected) + + def compare_frame_cat_and_float(self, result, expected, typ, version): + self.compare_frame_cat_onecol(result, expected, typ, version) + def compare_index_period(self, result, expected, typ, version): tm.assert_index_equal(result, expected) tm.assertIsInstance(result.freq, MonthEnd) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 6bf0175526424..5ee84ce97979a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1004,7 +1004,7 @@ def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): nan_rep=nan_rep) retr = read_hdf(store, key) s_nan = s.replace(nan_rep, np.nan) - assert_series_equal(s_nan, retr) + assert_series_equal(s_nan, retr, check_categorical=False) for s in examples: roundtrip(s) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fe782bb86d1be..17f74d5789298 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -234,10 +234,11 @@ def test_read_dta4(self): expected = pd.concat([expected[col].astype('category') for col in expected], axis=1) - tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected) - tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_117, expected) + # stata doesn't save .category metadata + tm.assert_frame_equal(parsed_113, expected, check_categorical=False) + tm.assert_frame_equal(parsed_114, expected, check_categorical=False) + tm.assert_frame_equal(parsed_115, expected, check_categorical=False) + tm.assert_frame_equal(parsed_117, expected, check_categorical=False) # File containing strls def test_read_dta12(self): @@ -872,8 +873,8 @@ def test_categorical_writing(self): # Silence warnings original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), expected) + res = written_and_read_again.set_index('index') + tm.assert_frame_equal(res, expected, check_categorical=False) def test_categorical_warnings_and_errors(self): # Warning for non-string labels @@ -915,8 +916,8 @@ def test_categorical_with_stata_missing_values(self): with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), original) + res = written_and_read_again.set_index('index') + tm.assert_frame_equal(res, original, check_categorical=False) def test_categorical_order(self): # Directly construct using expected codes @@ -945,8 +946,8 @@ def test_categorical_order(self): # Read with and with out categoricals, ensure order is identical parsed_115 = read_stata(self.dta19_115) parsed_117 = read_stata(self.dta19_117) - tm.assert_frame_equal(expected, parsed_115) - tm.assert_frame_equal(expected, parsed_117) + tm.assert_frame_equal(expected, parsed_115, check_categorical=False) + tm.assert_frame_equal(expected, parsed_117, check_categorical=False) # Check identity of codes for col in expected: @@ -969,8 +970,10 @@ def test_categorical_sorting(self): categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] cat = pd.Categorical.from_codes(codes=codes, categories=categories) expected = pd.Series(cat, name='srh') - tm.assert_series_equal(expected, parsed_115["srh"]) - tm.assert_series_equal(expected, parsed_117["srh"]) + tm.assert_series_equal(expected, parsed_115["srh"], + check_categorical=False) + tm.assert_series_equal(expected, parsed_117["srh"], + check_categorical=False) def test_categorical_ordering(self): parsed_115 = read_stata(self.dta19_115) @@ -1021,7 +1024,8 @@ def test_read_chunks_117(self): from_frame = parsed.iloc[pos:pos + chunksize, :] tm.assert_frame_equal( from_frame, chunk, check_dtype=False, - check_datetimelike_compat=True) + check_datetimelike_compat=True, + check_categorical=False) pos += chunksize itr.close() @@ -1087,7 +1091,8 @@ def test_read_chunks_115(self): from_frame = parsed.iloc[pos:pos + chunksize, :] tm.assert_frame_equal( from_frame, chunk, check_dtype=False, - check_datetimelike_compat=True) + check_datetimelike_compat=True, + check_categorical=False) pos += chunksize itr.close() diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index e7d64324e6590..43c288162b134 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -158,6 +158,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z'], dtype=np.float) assert_frame_equal(result, expected) + def test_unstack_fill_frame(self): + # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list('AB'), dtype=np.int32) @@ -190,6 +192,8 @@ def test_unstack_fill(self): [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected) + def test_unstack_fill_frame_datetime(self): + # Test unstacking with date times dv = pd.date_range('2012-01-01', periods=4).values data = Series(dv) @@ -208,6 +212,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z']) assert_frame_equal(result, expected) + def test_unstack_fill_frame_timedelta(self): + # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) @@ -226,6 +232,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z']) assert_frame_equal(result, expected) + def test_unstack_fill_frame_period(self): + # Test unstacking with period periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), Period('2012-04')] @@ -245,6 +253,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z']) assert_frame_equal(result, expected) + def test_unstack_fill_frame_categorical(self): + # Test unstacking with categorical data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') data.index = pd.MultiIndex.from_tuples( @@ -273,27 +283,20 @@ def test_unstack_fill(self): assert_frame_equal(result, expected) def test_stack_ints(self): - df = DataFrame( - np.random.randn(30, 27), - columns=MultiIndex.from_tuples( - list(itertools.product(range(3), repeat=3)) - ) - ) - assert_frame_equal( - df.stack(level=[1, 2]), - df.stack(level=1).stack(level=1) - ) - assert_frame_equal( - df.stack(level=[-2, -1]), - df.stack(level=1).stack(level=1) - ) + columns = MultiIndex.from_tuples(list(itertools.product(range(3), + repeat=3))) + df = DataFrame(np.random.randn(30, 27), columns=columns) + + assert_frame_equal(df.stack(level=[1, 2]), + df.stack(level=1).stack(level=1)) + assert_frame_equal(df.stack(level=[-2, -1]), + df.stack(level=1).stack(level=1)) df_named = df.copy() df_named.columns.set_names(range(3), inplace=True) - assert_frame_equal( - df_named.stack(level=[1, 2]), - df_named.stack(level=1).stack(level=1) - ) + + assert_frame_equal(df_named.stack(level=[1, 2]), + df_named.stack(level=1).stack(level=1)) def test_stack_mixed_levels(self): columns = MultiIndex.from_tuples( diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 53ab9aca03f6c..2cb62a60f885b 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -108,15 +108,17 @@ def test_loc_listlike_dtypes(self): # unique slice res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 2], - 'B': [4, 5]}, index=pd.CategoricalIndex(['a', 'b'])) + exp_index = pd.CategoricalIndex(['a', 'b'], + categories=index.categories) + exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] - exp = DataFrame({'A': [1, 1, 2], - 'B': [4, 4, 5]}, - index=pd.CategoricalIndex(['a', 'a', 'b'])) + + exp_index = pd.CategoricalIndex(['a', 'a', 'b'], + categories=index.categories) + exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assertRaisesRegexp( @@ -194,12 +196,15 @@ def test_ix_categorical_index(self): expect = pd.Series(df.ix[:, 'X'], index=cdf.index, name='X') assert_series_equal(cdf.ix[:, 'X'], expect) + exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, - index=pd.CategoricalIndex(list('AB'))) + index=exp_index) assert_frame_equal(cdf.ix[['A', 'B'], :], expect) + exp_columns = pd.CategoricalIndex(list('XY'), + categories=['X', 'Y', 'Z']) expect = pd.DataFrame(df.ix[:, ['X', 'Y']], index=cdf.index, - columns=pd.CategoricalIndex(list('XY'))) + columns=exp_columns) assert_frame_equal(cdf.ix[:, ['X', 'Y']], expect) # non-unique @@ -209,12 +214,14 @@ def test_ix_categorical_index(self): cdf.index = pd.CategoricalIndex(df.index) cdf.columns = pd.CategoricalIndex(df.columns) + exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B']) expect = pd.DataFrame(df.ix['A', :], columns=cdf.columns, - index=pd.CategoricalIndex(list('AA'))) + index=exp_index) assert_frame_equal(cdf.ix['A', :], expect) + exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y']) expect = pd.DataFrame(df.ix[:, 'X'], index=cdf.index, - columns=pd.CategoricalIndex(list('XX'))) + columns=exp_columns) assert_frame_equal(cdf.ix[:, 'X'], expect) expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 6e0a0175b403f..9cb1e9dd93d16 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -187,7 +187,8 @@ def test_map(self): index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e'])) - exp = Series([np.nan, 'B', 'C', 'D'], dtype='category') + exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'], + categories=['B', 'C', 'D', 'E'])) self.assert_series_equal(a.map(b), exp) exp = Series([np.nan, 'B', 'C', 'D']) self.assert_series_equal(a.map(c), exp) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5a6667e57ce9d..40ef5354e91bd 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -556,28 +556,35 @@ def test_categories_none(self): def test_describe(self): # string type desc = self.factor.describe() + self.assertTrue(self.factor.ordered) + exp_index = pd.CategoricalIndex(['a', 'b', 'c'], name='categories', + ordered=self.factor.ordered) expected = DataFrame({'counts': [3, 2, 3], 'freqs': [3 / 8., 2 / 8., 3 / 8.]}, - index=pd.CategoricalIndex(['a', 'b', 'c'], - name='categories')) + index=exp_index) tm.assert_frame_equal(desc, expected) # check unused categories cat = self.factor.copy() cat.set_categories(["a", "b", "c", "d"], inplace=True) desc = cat.describe() + + exp_index = pd.CategoricalIndex(['a', 'b', 'c', 'd'], + ordered=self.factor.ordered, + name='categories') expected = DataFrame({'counts': [3, 2, 3, 0], 'freqs': [3 / 8., 2 / 8., 3 / 8., 0]}, - index=pd.CategoricalIndex(['a', 'b', 'c', 'd'], - name='categories')) + index=exp_index) tm.assert_frame_equal(desc, expected) # check an integer one - desc = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]).describe() + cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) + desc = cat.describe() + exp_index = pd.CategoricalIndex([1, 2, 3], ordered=cat.ordered, + name='categories') expected = DataFrame({'counts': [5, 3, 3], 'freqs': [5 / 11., 3 / 11., 3 / 11.]}, - index=pd.CategoricalIndex([1, 2, 3], - name='categories')) + index=exp_index) tm.assert_frame_equal(desc, expected) # https://github.com/pydata/pandas/issues/3678 @@ -601,7 +608,7 @@ def test_describe(self): columns=['counts', 'freqs'], index=pd.CategoricalIndex(['b', 'a', 'c', np.nan], name='categories')) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_categorical=False) # NA as an unused category with tm.assert_produces_warning(FutureWarning): @@ -613,7 +620,7 @@ def test_describe(self): ['b', 'a', 'c', np.nan], name='categories') expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]], columns=['counts', 'freqs'], index=exp_idx) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_categorical=False) def test_print(self): expected = ["[a, b, b, a, a, c, c, c]", @@ -2885,13 +2892,17 @@ def test_value_counts(self): categories=["c", "a", "b", "d"]) s = pd.Series(cats, name='xxx') res = s.value_counts(sort=False) - exp = Series([3, 1, 2, 0], name='xxx', - index=pd.CategoricalIndex(["c", "a", "b", "d"])) + + exp_index = pd.CategoricalIndex(["c", "a", "b", "d"], + categories=cats.categories) + exp = Series([3, 1, 2, 0], name='xxx', index=exp_index) tm.assert_series_equal(res, exp) res = s.value_counts(sort=True) - exp = Series([3, 2, 1, 0], name='xxx', - index=pd.CategoricalIndex(["c", "b", "a", "d"])) + + exp_index = pd.CategoricalIndex(["c", "b", "a", "d"], + categories=cats.categories) + exp = Series([3, 2, 1, 0], name='xxx', index=exp_index) tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same @@ -2927,38 +2938,39 @@ def test_value_counts_with_nan(self): index=pd.CategoricalIndex(["a", "b", np.nan]))) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical( - ["a", "b", "a"], categories=["a", "b", np.nan])) - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([2, 1, 0], - index=pd.CategoricalIndex(["a", "b", np.nan]))) + s = pd.Series(pd.Categorical(["a", "b", "a"], + categories=["a", "b", np.nan])) + + # internal categories are different because of NaN + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=True), exp, + check_categorical=False) + exp = pd.Series([2, 1, 0], + index=pd.CategoricalIndex(["a", "b", np.nan])) + tm.assert_series_equal(s.value_counts(dropna=False), exp, + check_categorical=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical( - ["a", "b", None, "a", None, None], categories=["a", "b", np.nan - ])) - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([3, 2, 1], - index=pd.CategoricalIndex([np.nan, "a", "b"]))) + s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], + categories=["a", "b", np.nan])) + + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=True), exp, + check_categorical=False) + exp = pd.Series([3, 2, 1], + index=pd.CategoricalIndex([np.nan, "a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=False), exp, + check_categorical=False) def test_groupby(self): - cats = Categorical( - ["a", "a", "a", "b", "b", "b", "c", "c", "c" - ], categories=["a", "b", "c", "d"], ordered=True) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - expected = DataFrame({'a': Series( - [1, 2, 4, np.nan], index=pd.CategoricalIndex( - ['a', 'b', 'c', 'd'], name='b'))}) + exp_index = pd.CategoricalIndex(['a', 'b', 'c', 'd'], name='b', + ordered=True) + expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) @@ -2970,17 +2982,19 @@ def test_groupby(self): # single grouper gb = df.groupby("A") - exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A') + exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # multiple groupers gb = df.groupby(['A', 'B']) - expected = DataFrame({'values': Series( - [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan - ], index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y']], names=['A', 'B']))}) + exp_index = pd.MultiIndex.from_product([['a', 'b', 'z'], + ['c', 'd', 'y']], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, + np.nan, np.nan, np.nan]}, + index=exp_index) result = gb.sum() tm.assert_frame_equal(result, expected) @@ -3054,8 +3068,10 @@ def f(x): df = pd.DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4]) result = df.groupby(c).apply(len) - expected = pd.Series([1, 0, 0, 0], - index=pd.CategoricalIndex(c.values.categories)) + + exp_index = pd.CategoricalIndex(c.values.categories, + ordered=c.values.ordered) + expected = pd.Series([1, 0, 0, 0], index=exp_index) expected.index.name = 'a' tm.assert_series_equal(result, expected) @@ -3369,30 +3385,28 @@ def test_assigning_ops(self): # assign a part of a column with dtype != categorical -> # exp_parts_cats_col - cats = pd.Categorical( - ["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], + categories=["a", "b"]) idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 1, 1, 1, 1, 1, 1] orig = pd.DataFrame({"cats": cats, "values": values}, index=idx) # the expected values # changed single row - cats1 = pd.Categorical( - ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + cats1 = pd.Categorical(["a", "a", "b", "a", "a", "a", "a"], + categories=["a", "b"]) idx1 = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = pd.DataFrame( - {"cats": cats1, - "values": values1}, index=idx1) + exp_single_row = pd.DataFrame({"cats": cats1, + "values": values1}, index=idx1) # changed multiple rows - cats2 = pd.Categorical( - ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + cats2 = pd.Categorical(["a", "a", "b", "b", "a", "a", "a"], + categories=["a", "b"]) idx2 = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = pd.DataFrame( - {"cats": cats2, - "values": values2}, index=idx2) + exp_multi_row = pd.DataFrame({"cats": cats2, + "values": values2}, index=idx2) # changed part of the cats column cats3 = pd.Categorical( @@ -3653,7 +3667,8 @@ def f(): exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) df[df["cats"] == "c"] = ["b", 2] - tm.assert_frame_equal(df, exp_multi_row) + # category c is kept in .categories + tm.assert_frame_equal(df, exp_fancy) # set_value df = orig.copy() @@ -3708,7 +3723,7 @@ def f(): # ensure that one can set something to np.nan s = Series(Categorical([1, 2, 3])) - exp = Series(Categorical([1, np.nan, 3])) + exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3])) s[1] = np.nan tm.assert_series_equal(s, exp) @@ -4083,10 +4098,12 @@ def f(): c = Categorical(["a", "b", np.nan]) with tm.assert_produces_warning(FutureWarning): c.set_categories(["a", "b", np.nan], rename=True, inplace=True) + c[0] = np.nan df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) - df_exp = pd.DataFrame({"cats": Categorical(["a", "b", "a"]), - "vals": [1, 2, 3]}) + + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b", np.nan]) + df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) res = df.fillna("a") tm.assert_frame_equal(res, df_exp) @@ -4128,7 +4145,9 @@ def cmp(a, b): ]: result = valid(s) - tm.assert_series_equal(result, s) + # compare series values + # internal .categories can't be compared because it is sorted + tm.assert_series_equal(result, s, check_categorical=False) # invalid conversion (these are NOT a dtype) for invalid in [lambda x: x.astype(pd.Categorical), diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 2bad2fabcfc57..794b5e8aa5650 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -847,7 +847,7 @@ def test_to_xarray(self): assert_almost_equal(list(result.coords.keys()), ['foo']) self.assertIsInstance(result, DataArray) - def testit(index, check_index_type=True): + def testit(index, check_index_type=True, check_categorical=True): s = Series(range(6), index=index(6)) s.index.name = 'foo' result = s.to_xarray() @@ -859,7 +859,8 @@ def testit(index, check_index_type=True): # idempotency assert_series_equal(result.to_series(), s, - check_index_type=check_index_type) + check_index_type=check_index_type, + check_categorical=check_categorical) for index in [tm.makeFloatIndex, tm.makeIntIndex, tm.makeStringIndex, tm.makeUnicodeIndex, @@ -868,7 +869,8 @@ def testit(index, check_index_type=True): testit(index) # not idempotent - testit(tm.makeCategoricalIndex, check_index_type=False) + testit(tm.makeCategoricalIndex, check_index_type=False, + check_categorical=False) s = Series(range(6)) s.index.name = 'foo' @@ -1409,9 +1411,8 @@ def test_to_xarray(self): expected['f'] = expected['f'].astype(object) expected['h'] = expected['h'].astype('datetime64[ns]') expected.columns.name = None - assert_frame_equal(result.to_dataframe(), - expected, - check_index_type=False) + assert_frame_equal(result.to_dataframe(), expected, + check_index_type=False, check_categorical=False) # available in 0.7.1 # MultiIndex diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 5dfe88d04309e..38e6a066d3eea 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3868,8 +3868,8 @@ def test_groupby_sort_categorical(self): ['(0, 2.5]', 1, 60], ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) df['range'] = Categorical(df['range'], ordered=True) - index = CategoricalIndex( - ['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range') + index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', + '(7.5, 10]'], name='range', ordered=True) result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'], index=index) @@ -3879,13 +3879,15 @@ def test_groupby_sort_categorical(self): assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) df['range'] = Categorical(df['range'], ordered=False) - index = CategoricalIndex( - ['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range') + index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', + '(7.5, 10]'], name='range') result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'], index=index) - index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', - '(5, 7.5]', '(0, 2.5]'], + index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', + '(0, 2.5]'], + categories=['(7.5, 10]', '(2.5, 5]', + '(5, 7.5]', '(0, 2.5]'], name='range') result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=['foo', 'bar']) @@ -3975,7 +3977,8 @@ def test_groupby_categorical(self): result = data.groupby(cats).mean() expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, ordered=True) + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) @@ -3986,14 +3989,16 @@ def test_groupby_categorical(self): idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) - expected = ord_data.groupby( - Categorical(ord_labels), sort=False).describe() + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 'bar', 'baz', 'qux']) + expected = ord_data.groupby(exp_cats, sort=False).describe() expected.index.names = [None, None] assert_frame_equal(desc_result, expected) # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) exp = CategoricalIndex(expc) self.assert_index_equal(desc_result.index.get_level_values(0), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', @@ -6266,8 +6271,11 @@ def test_groupby_categorical_two_columns(self): # Grouping on a single column groups_single_key = test.groupby("cat") res = groups_single_key.agg('mean') + + exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", + ordered=True) exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=pd.CategoricalIndex(["a", "b", "c"], name="cat")) + index=exp_index) tm.assert_frame_equal(res, exp) # Grouping on two columns diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 862e2282bae2f..7136d7effc1fc 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -239,26 +239,16 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) - exp = DataFrame({'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}) + exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) - exp_na = DataFrame({nan: {0: 0.0, - 1: 0.0, - 2: 1.0}, - 'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}).reindex_axis( - ['a', 'b', nan], 1) + exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, + 'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 8682302b542be..0ec2c96dbbd7d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -25,7 +25,7 @@ from pandas.core.common import (is_sequence, array_equivalent, is_list_like, is_datetimelike_v_numeric, is_datetimelike_v_object, is_number, - needs_i8_conversion) + needs_i8_conversion, is_categorical_dtype) from pandas.formats.printing import pprint_thing from pandas.core.algorithms import take_1d @@ -657,7 +657,7 @@ def assert_equal(a, b, msg=""): def assert_index_equal(left, right, exact='equiv', check_names=True, check_less_precise=False, check_exact=True, - obj='Index'): + check_categorical=True, obj='Index'): """Check that left and right Index are equal. Parameters @@ -675,6 +675,8 @@ def assert_index_equal(left, right, exact='equiv', check_names=True, 5 digits (False) or 3 digits (True) after decimal points are compared. check_exact : bool, default True Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message @@ -752,6 +754,11 @@ def _get_ilevel_values(index, level): if check_names: assert_attr_equal('names', left, right, obj=obj) + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, + obj='{0} category'.format(obj)) + def assert_class_equal(left, right, exact=True, obj='Input'): """checks classes are equal.""" @@ -999,6 +1006,7 @@ def assert_series_equal(left, right, check_dtype=True, check_names=True, check_exact=False, check_datetimelike_compat=False, + check_categorical=True, obj='Series'): """Check that left and right Series are equal. @@ -1023,6 +1031,8 @@ def assert_series_equal(left, right, check_dtype=True, Whether to check the Series and Index names attribute. check_dateteimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message @@ -1049,6 +1059,7 @@ def assert_series_equal(left, right, check_dtype=True, check_names=check_names, check_less_precise=check_less_precise, check_exact=check_exact, + check_categorical=check_categorical, obj='{0}.index'.format(obj)) if check_dtype: @@ -1085,6 +1096,11 @@ def assert_series_equal(left, right, check_dtype=True, if check_names: assert_attr_equal('name', left, right, obj=obj) + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, + obj='{0} category'.format(obj)) + # This could be refactored to use the NDFrame.equals method def assert_frame_equal(left, right, check_dtype=True, @@ -1096,6 +1112,7 @@ def assert_frame_equal(left, right, check_dtype=True, by_blocks=False, check_exact=False, check_datetimelike_compat=False, + check_categorical=True, check_like=False, obj='DataFrame'): @@ -1127,6 +1144,8 @@ def assert_frame_equal(left, right, check_dtype=True, Whether to compare number exactly. check_dateteimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. check_like : bool, default False If true, then reindex_like operands obj : str, default 'DataFrame' @@ -1168,6 +1187,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_names=check_names, check_less_precise=check_less_precise, check_exact=check_exact, + check_categorical=check_categorical, obj='{0}.index'.format(obj)) # column comparison @@ -1175,6 +1195,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_names=check_names, check_less_precise=check_less_precise, check_exact=check_exact, + check_categorical=check_categorical, obj='{0}.columns'.format(obj)) # compare by blocks @@ -1199,6 +1220,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_less_precise=check_less_precise, check_exact=check_exact, check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, obj='DataFrame.iloc[:, {0}]'.format(i))