From b5cbe2ca172669041ba468502f1ef7dcf738240d Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 23 May 2016 08:06:50 +0900 Subject: [PATCH] BUG: remove_unused_categories dtype coerces to int64 --- doc/source/whatsnew/v0.18.2.txt | 2 ++ pandas/core/categorical.py | 2 +- pandas/tests/test_categorical.py | 21 ++++++++++++--------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 4b3c96da10efd..de987edcdc679 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -184,3 +184,5 @@ Bug Fixes - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) + +- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 44c91862227d8..ea6e9012f7e8a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -883,8 +883,8 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 - cat._codes = inv cat._categories = cat.categories.take(idx) + cat._codes = _coerce_indexer_dtype(inv, self._categories) if not inplace: return cat diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 40ef5354e91bd..5a0d079efb4c2 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1022,14 +1022,14 @@ def f(): def test_remove_unused_categories(self): c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"]) - exp_categories_all = np.array(["a", "b", "c", "d", "e"]) - exp_categories_dropped = np.array(["a", "b", "c", "d"]) + exp_categories_all = Index(["a", "b", "c", "d", "e"]) + exp_categories_dropped = Index(["a", "b", "c", "d"]) self.assert_numpy_array_equal(c.categories, exp_categories_all) res = c.remove_unused_categories() - self.assert_numpy_array_equal(res.categories, exp_categories_dropped) - self.assert_numpy_array_equal(c.categories, exp_categories_all) + self.assert_index_equal(res.categories, exp_categories_dropped) + self.assert_index_equal(c.categories, exp_categories_all) res = c.remove_unused_categories(inplace=True) self.assert_numpy_array_equal(c.categories, exp_categories_dropped) @@ -1039,15 +1039,18 @@ def test_remove_unused_categories(self): c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"]) res = c.remove_unused_categories() - self.assert_numpy_array_equal(res.categories, - np.array(["a", "b", "c"])) - self.assert_numpy_array_equal(c.categories, exp_categories_all) + self.assert_index_equal(res.categories, + Index(np.array(["a", "b", "c"]))) + exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(res.codes, exp_codes) + self.assert_index_equal(c.categories, exp_categories_all) val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan] cat = pd.Categorical(values=val, categories=list('ABCDEFG')) out = cat.remove_unused_categories() - self.assert_numpy_array_equal(out.categories, ['B', 'D', 'F']) - self.assert_numpy_array_equal(out.codes, [2, -1, 1, 0, 1, 2, -1]) + self.assert_index_equal(out.categories, Index(['B', 'D', 'F'])) + exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(out.codes, exp_codes) self.assertEqual(out.get_values().tolist(), val) alpha = list('abcdefghijklmnopqrstuvwxyz')