BUG: Handle fill_value in Categorical.take

TomAugspurger · TomAugspurger · commit 8ec774639d93 · 2018-10-23T09:03:51.000-05:00
Closes pandas-dev#23296
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -973,6 +973,7 @@ Categorical
 - Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
 - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
 - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
+- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1019,15 +1019,7 @@ def add_categories(self, new_categories, inplace=False):
         set_categories
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        if not is_list_like(new_categories):
-            new_categories = [new_categories]
-        already_included = set(new_categories) & set(self.dtype.categories)
-        if len(already_included) != 0:
-            msg = ("new categories must not include old categories: "
-                   "{already_included!s}")
-            raise ValueError(msg.format(already_included=already_included))
-        new_categories = list(self.dtype.categories) + list(new_categories)
-        new_dtype = CategoricalDtype(new_categories, self.ordered)
+        new_dtype = self.dtype._add_categories(new_categories)
 
         cat = self if inplace else self.copy()
         cat._dtype = new_dtype
@@ -1768,8 +1760,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
 
         Parameters
         ----------
-        indexer : sequence of integers
-        allow_fill : bool, default None.
+        indexer : sequence of int
+            The indices in `self` to take. The meaning of negative values in
+            `indexer` depends on the value of `allow_fill`.
+        allow_fill : bool, default None
             How to handle negative values in `indexer`.
 
             * False: negative values in `indices` indicate positional indices
@@ -1786,26 +1780,79 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
                default is ``True``. In the future, this will change to
                ``False``.
 
+        fill_value : object
+            The value to use for `indices` that are missing (-1), when
+            ``allow_fill=True``. This should be the category, i.e. a value
+            in ``self.categories``, not a code.
+
+            Specifying a `fill_value` that's not in ``self.categories`` is
+            allowed. The new category is added to the end of the existing
+            categories.
+
         Returns
         -------
         Categorical
             This Categorical will have the same categories and ordered as
             `self`.
+
+        See Also
+        --------
+        Series.take : Similar method for Series.
+        numpy.ndarray.take : Similar method for NumPy arrays.
+
+        Examples
+        --------
+        >>> cat = pd.Categorical(['a', 'a', 'b'])
+        >>> cat
+        [a, a, b]
+        Categories (2, object): [a, b]
+
+        Specify ``allow_fill==False`` to have negative indices mean indexing
+        from the right.
+
+        >>> cat.take([0, -1, -2], allow_fill=False)
+        [a, b, a]
+        Categories (2, object): [a, b]
+
+        With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
+        values that should be filled with the `fill_value`, which is
+        ``np.nan`` by default.
+
+        >>> cat.take([0, -1, -1], allow_fill=True)
+        [a, NaN, NaN]
+        Categories (2, object): [a, b]
+
+        The fill value can be specified. Notice that if the `fill_value` was
+        not previously present in ``self.categories``, it is added to the end
+        of the categories in the output Categorical.
+
+        >>> cat.take([0, -1, -1], allow_fill=True, fill_value='c')
+        [a, c, c]
+        Categories (3, object): [a, b, c]
         """
         indexer = np.asarray(indexer, dtype=np.intp)
         if allow_fill is None:
             if (indexer < 0).any():
                 warn(_take_msg, FutureWarning, stacklevel=2)
                 allow_fill = True
 
+        dtype = self.dtype
+
         if isna(fill_value):
-            # For categorical, any NA value is considered a user-facing
-            # NA value. Our storage NA value is -1.
             fill_value = -1
+        elif allow_fill and fill_value is not None:
+            # convert user-provided `fill_value` to codes
+            if fill_value in self.categories:
+                fill_value = self.categories.get_loc(fill_value)
+            else:
+                dtype = self.dtype._add_categories(fill_value)
+                fill_value = dtype.categories.get_loc(fill_value)
 
         codes = take(self._codes, indexer, allow_fill=allow_fill,
                      fill_value=fill_value)
-        result = self._constructor(codes, dtype=self.dtype, fastpath=True)
+        result = type(self).from_codes(codes,
+                                       categories=dtype.categories,
+                                       ordered=dtype.ordered)
         return result
 
     take = take_nd
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -469,6 +469,23 @@ def _is_boolean(self):
 
         return is_bool_dtype(self.categories)
 
+    def _add_categories(self, new_categories):
+        """
+        Return a new CategoricalDtype with new categories added at the end.
+
+        """
+        from pandas.core.dtypes.common import is_list_like
+
+        if not is_list_like(new_categories):
+            new_categories = [new_categories]
+        already_included = set(new_categories) & set(self.categories)
+        if len(already_included) != 0:
+            msg = ("new categories must not include old categories: "
+                   "{already_included!s}")
+            raise ValueError(msg.format(already_included=already_included))
+        new_categories = list(self.categories) + list(new_categories)
+        return CategoricalDtype(new_categories, self.ordered)
+
 
 class DatetimeTZDtype(PandasExtensionDtype):
 
diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py
@@ -111,3 +111,37 @@ def test_positional_take_unobserved(self, ordered):
         expected = pd.Categorical(['b', 'a'], categories=cat.categories,
                                   ordered=ordered)
         tm.assert_categorical_equal(result, expected)
+
+    def test_take_allow_fill(self):
+        cat = pd.Categorical(['a', 'a', 'b'])
+        result = cat.take([0, -1, -1], allow_fill=True)
+        expected = pd.Categorical(['a', np.nan, np.nan],
+                                  categories=['a', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_take_fill_with_negative_one(self):
+        # -1 was a category
+        cat = pd.Categorical([-1, 0, 1])
+        result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
+        expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
+        tm.assert_categorical_equal(result, expected)
+
+        # -1 was not a category
+        cat = pd.Categorical([0, 1])
+        result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
+        expected = pd.Categorical([0, -1, 1], categories=[0, 1, -1])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_take_fill_value(self):
+        # https://github.com/pandas-dev/pandas/issues/23296
+        cat = pd.Categorical(['a', 'b', 'c'])
+        result = cat.take([0, 1, -1], fill_value='a', allow_fill=True)
+        expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_take_fill_value_adds_categories(self):
+        # https://github.com/pandas-dev/pandas/issues/23296
+        cat = pd.Categorical(['a', 'b', 'c'])
+        result = cat.take([0, 1, -1], fill_value='d', allow_fill=True)
+        expected = pd.Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c', 'd'])
+        tm.assert_categorical_equal(result, expected)