Skip to content

Commit 8ec7746

Browse files
committed
BUG: Handle fill_value in Categorical.take
Closes pandas-dev#23296
1 parent 104ccfd commit 8ec7746

File tree

4 files changed

+113
-14
lines changed

4 files changed

+113
-14
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,7 @@ Categorical
973973
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
974974
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
975975
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
976+
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
976977

977978
Datetimelike
978979
^^^^^^^^^^^^

pandas/core/arrays/categorical.py

Lines changed: 61 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,15 +1019,7 @@ def add_categories(self, new_categories, inplace=False):
10191019
set_categories
10201020
"""
10211021
inplace = validate_bool_kwarg(inplace, 'inplace')
1022-
if not is_list_like(new_categories):
1023-
new_categories = [new_categories]
1024-
already_included = set(new_categories) & set(self.dtype.categories)
1025-
if len(already_included) != 0:
1026-
msg = ("new categories must not include old categories: "
1027-
"{already_included!s}")
1028-
raise ValueError(msg.format(already_included=already_included))
1029-
new_categories = list(self.dtype.categories) + list(new_categories)
1030-
new_dtype = CategoricalDtype(new_categories, self.ordered)
1022+
new_dtype = self.dtype._add_categories(new_categories)
10311023

10321024
cat = self if inplace else self.copy()
10331025
cat._dtype = new_dtype
@@ -1768,8 +1760,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
17681760
17691761
Parameters
17701762
----------
1771-
indexer : sequence of integers
1772-
allow_fill : bool, default None.
1763+
indexer : sequence of int
1764+
The indices in `self` to take. The meaning of negative values in
1765+
`indexer` depends on the value of `allow_fill`.
1766+
allow_fill : bool, default None
17731767
How to handle negative values in `indexer`.
17741768
17751769
* False: negative values in `indexer` indicate positional indices
@@ -1786,26 +1780,79 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
17861780
default is ``True``. In the future, this will change to
17871781
``False``.
17881782
1783+
fill_value : object
1784+
The value to use for entries of `indexer` that are missing (-1), when
1785+
``allow_fill=True``. This should be the category, i.e. a value
1786+
in ``self.categories``, not a code.
1787+
1788+
Specifying a `fill_value` that's not in ``self.categories`` is
1789+
allowed. The new category is added to the end of the existing
1790+
categories.
1791+
17891792
Returns
17901793
-------
17911794
Categorical
17921795
This Categorical will have the same ordered flag as `self`, and the
17931796
same categories unless a new `fill_value` category was appended.
1797+
1798+
See Also
1799+
--------
1800+
Series.take : Similar method for Series.
1801+
numpy.ndarray.take : Similar method for NumPy arrays.
1802+
1803+
Examples
1804+
--------
1805+
>>> cat = pd.Categorical(['a', 'a', 'b'])
1806+
>>> cat
1807+
[a, a, b]
1808+
Categories (2, object): [a, b]
1809+
1810+
Specify ``allow_fill=False`` to have negative indices mean indexing
1811+
from the right.
1812+
1813+
>>> cat.take([0, -1, -2], allow_fill=False)
1814+
[a, b, a]
1815+
Categories (2, object): [a, b]
1816+
1817+
With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
1818+
values that should be filled with the `fill_value`, which is
1819+
``np.nan`` by default.
1820+
1821+
>>> cat.take([0, -1, -1], allow_fill=True)
1822+
[a, NaN, NaN]
1823+
Categories (2, object): [a, b]
1824+
1825+
The fill value can be specified. Notice that if the `fill_value` was
1826+
not previously present in ``self.categories``, it is added to the end
1827+
of the categories in the output Categorical.
1828+
1829+
>>> cat.take([0, -1, -1], allow_fill=True, fill_value='c')
1830+
[a, c, c]
1831+
Categories (3, object): [a, b, c]
17941832
"""
17951833
indexer = np.asarray(indexer, dtype=np.intp)
17961834
if allow_fill is None:
17971835
if (indexer < 0).any():
17981836
warn(_take_msg, FutureWarning, stacklevel=2)
17991837
allow_fill = True
18001838

1839+
dtype = self.dtype
1840+
18011841
if isna(fill_value):
1802-
# For categorical, any NA value is considered a user-facing
1803-
# NA value. Our storage NA value is -1.
18041842
fill_value = -1
1843+
elif allow_fill and fill_value is not None:
1844+
# convert user-provided `fill_value` to codes
1845+
if fill_value in self.categories:
1846+
fill_value = self.categories.get_loc(fill_value)
1847+
else:
1848+
dtype = self.dtype._add_categories(fill_value)
1849+
fill_value = dtype.categories.get_loc(fill_value)
18051850

18061851
codes = take(self._codes, indexer, allow_fill=allow_fill,
18071852
fill_value=fill_value)
1808-
result = self._constructor(codes, dtype=self.dtype, fastpath=True)
1853+
result = type(self).from_codes(codes,
1854+
categories=dtype.categories,
1855+
ordered=dtype.ordered)
18091856
return result
18101857

18111858
take = take_nd

pandas/core/dtypes/dtypes.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,23 @@ def _is_boolean(self):
469469

470470
return is_bool_dtype(self.categories)
471471

472+
def _add_categories(self, new_categories):
473+
"""
474+
Return a new CategoricalDtype with new categories added at the end.
475+
476+
"""
477+
from pandas.core.dtypes.common import is_list_like
478+
479+
if not is_list_like(new_categories):
480+
new_categories = [new_categories]
481+
already_included = set(new_categories) & set(self.categories)
482+
if len(already_included) != 0:
483+
msg = ("new categories must not include old categories: "
484+
"{already_included!s}")
485+
raise ValueError(msg.format(already_included=already_included))
486+
new_categories = list(self.categories) + list(new_categories)
487+
return CategoricalDtype(new_categories, self.ordered)
488+
472489

473490
class DatetimeTZDtype(PandasExtensionDtype):
474491

pandas/tests/arrays/categorical/test_algos.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,37 @@ def test_positional_take_unobserved(self, ordered):
111111
expected = pd.Categorical(['b', 'a'], categories=cat.categories,
112112
ordered=ordered)
113113
tm.assert_categorical_equal(result, expected)
114+
115+
def test_take_allow_fill(self):
116+
cat = pd.Categorical(['a', 'a', 'b'])
117+
result = cat.take([0, -1, -1], allow_fill=True)
118+
expected = pd.Categorical(['a', np.nan, np.nan],
119+
categories=['a', 'b'])
120+
tm.assert_categorical_equal(result, expected)
121+
122+
def test_take_fill_with_negative_one(self):
123+
# -1 was a category
124+
cat = pd.Categorical([-1, 0, 1])
125+
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
126+
expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
127+
tm.assert_categorical_equal(result, expected)
128+
129+
# -1 was not a category
130+
cat = pd.Categorical([0, 1])
131+
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
132+
expected = pd.Categorical([0, -1, 1], categories=[0, 1, -1])
133+
tm.assert_categorical_equal(result, expected)
134+
135+
def test_take_fill_value(self):
136+
# https://github.com/pandas-dev/pandas/issues/23296
137+
cat = pd.Categorical(['a', 'b', 'c'])
138+
result = cat.take([0, 1, -1], fill_value='a', allow_fill=True)
139+
expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
140+
tm.assert_categorical_equal(result, expected)
141+
142+
def test_take_fill_value_adds_categories(self):
143+
# https://github.com/pandas-dev/pandas/issues/23296
144+
cat = pd.Categorical(['a', 'b', 'c'])
145+
result = cat.take([0, 1, -1], fill_value='d', allow_fill=True)
146+
expected = pd.Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c', 'd'])
147+
tm.assert_categorical_equal(result, expected)

0 commit comments

Comments
 (0)