Skip to content

Commit c9b091f

Browse files
committed
BUG: Categoricals shouldn't allow non-strings when object dtype is passed (pandas-dev#13919)
1 parent 453bc26 commit c9b091f

File tree

3 files changed

+41
-3
lines changed

3 files changed

+41
-3
lines changed

doc/source/whatsnew/v0.19.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,3 +1075,4 @@ Bug Fixes
10751075
- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
10761076
- Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
10771077
- Bug in ``.to_string()`` where calling it with an integer ``line_width`` and ``index=False`` raised an ``UnboundLocalError`` because ``idx`` was referenced before assignment.
1078+
- Bug in ``Categorical`` which allowed creation with ``object`` dtype categories whose values were neither all strings nor all periods (:issue:`13919`)

pandas/core/categorical.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
is_categorical_dtype,
2121
is_integer_dtype, is_bool,
2222
is_list_like, is_sequence,
23-
is_scalar)
23+
is_scalar,
24+
is_object_dtype)
2425
from pandas.core.common import is_null_slice
2526

2627
from pandas.core.algorithms import factorize, take_1d
@@ -191,6 +192,8 @@ class Categorical(PandasObject):
191192
If an explicit ``ordered=True`` is given but no `categories` and the
192193
`values` are not sortable.
193194
195+
If the categories have `object` dtype and their non-null values
196+
are neither all strings nor all periods.
194197
195198
Examples
196199
--------
@@ -324,6 +327,18 @@ def __init__(self, values, categories=None, ordered=False,
324327
"mean to use\n'Categorical.from_codes(codes, "
325328
"categories)'?", RuntimeWarning, stacklevel=2)
326329

330+
# TODO: disallow period when they stop being handled as object dtype
331+
# categoricals w/ object dtype shouldn't allow non-strings
332+
if is_object_dtype(categories) and len(categories) > 0:
333+
from pandas.lib import infer_dtype
334+
mask = notnull(categories)
335+
if infer_dtype(categories[mask]) not in ['period',
336+
'unicode',
337+
'string']:
338+
raise TypeError(
339+
"Categoricals cannot be object dtype unless"
340+
" all values are strings or all are periods.")
341+
327342
self.set_ordered(ordered or False, inplace=True)
328343
self._categories = categories
329344
self._codes = _coerce_indexer_dtype(codes, categories)

pandas/tests/test_categorical.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,35 @@ def test_constructor_unsortable(self):
9494

9595
# object dtype with mixed non-string values is rejected
9696
arr = np.array([1, 2, 3, datetime.now()], dtype='O')
97-
factor = Categorical.from_array(arr, ordered=False)
98-
self.assertFalse(factor.ordered)
97+
msg = "Categoricals cannot be object dtype unless all values are " \
98+
"strings or all are periods."
99+
with tm.assertRaisesRegexp(TypeError, msg):
100+
factor = Categorical.from_array(arr, ordered=False)
99101

100102
# this however will raise as cannot be sorted
101103
self.assertRaises(
102104
TypeError, lambda: Categorical.from_array(arr, ordered=True))
103105

106+
def test_constructor_object_dtype(self):
107+
# GH 13919
108+
109+
# categories must be of a single dtype
110+
arr = np.array([1, 2, 3, 's'], dtype=object)
111+
msg = "Categoricals cannot be object dtype unless all values are " \
112+
"strings or all are periods."
113+
with tm.assertRaisesRegexp(TypeError, msg):
114+
c = Categorical.from_array(arr)
115+
116+
# object dtype allowed when all strs
117+
exp_arr = np.array(list('abcd'), dtype=object)
118+
c = Categorical.from_array(exp_arr)
119+
tm.assert_numpy_array_equal(c.__array__(), exp_arr)
120+
121+
# object dtype also allowed when all periods
122+
idx = pd.period_range('1/1/2000', freq='D', periods=5)
123+
c = Categorical(idx)
124+
tm.assert_index_equal(c.categories, idx)
125+
104126
def test_is_equal_dtype(self):
105127

106128
# test dtype comparisons between cats

0 commit comments

Comments
 (0)