Skip to content

add sort_categories argument #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ Unioning
.. versionadded:: 0.19.0

If you want to combine categoricals that do not necessarily have
the same categories, the `union_categorical` function will
the same categories, the ``union_categoricals`` function will
combine a list-like of categoricals. The new categories
will be the union of the categories being combined.

Expand All @@ -667,10 +667,20 @@ will be the union of the categories being combined.
b = pd.Categorical(["a", "b"])
union_categoricals([a, b])

By default, the resulting categories will be ordered as
they appear in the data. If you want the categories to
be lexsorted, use the ``sort_categories=True`` argument.

.. ipython:: python

union_categoricals([a, b], sort_categories=True)

.. note::

`union_categoricals` only works with unordered categoricals
and will raise if any are ordered.
``union_categoricals`` only works with:

- unordered categoricals
- ordered categoricals which have the same categories



Getting Data In/Out
Expand Down
97 changes: 89 additions & 8 deletions pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,23 +870,26 @@ def test_union_categorical(self):
# new categories ordered by appearance
s = Categorical(['x', 'y', 'z'])
s2 = Categorical(['a', 'b', 'c'])
result = union_categoricals([s, s2]).categories
expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_index_equal(result, expected)
result = union_categoricals([s, s2])
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
categories=['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)

# can't be ordered
s = Categorical([0, 1.2, 2], ordered=True)
s2 = Categorical([0, 1.2, 2], ordered=True)
with tm.assertRaises(TypeError):
union_categoricals([s, s2])
result = union_categoricals([s, s2])
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
tm.assert_categorical_equal(result, expected)

# must exactly match types
s = Categorical([0, 1.2, 2])
s2 = Categorical([2, 3, 4])
with tm.assertRaises(TypeError):
msg = 'dtype of categories must be the same'
with tm.assertRaisesRegexp(TypeError, msg):
union_categoricals([s, s2])

with tm.assertRaises(ValueError):
msg = 'No Categoricals to union'
with tm.assertRaisesRegexp(ValueError, msg):
union_categoricals([])

def test_union_categoricals_nan(self):
Expand Down Expand Up @@ -942,6 +945,84 @@ def test_union_categoricals_empty(self):
pd.Categorical([])])
tm.assert_categorical_equal(res, nanc)

def test_union_categorical_same_category(self):
    """Union of inputs whose categories are identical (fastpath)."""
    # integer categories; the second input contains a missing value
    int_cats = [1, 2, 3, 4]
    left = Categorical([1, 2, 3, 4], categories=int_cats)
    right = Categorical([3, 2, 1, np.nan], categories=int_cats)
    expected = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                           categories=int_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]),
                                expected)

    # string categories; 'y' is declared but never appears as a value
    str_cats = ['x', 'y', 'z']
    left = Categorical(['z', 'z', 'z'], categories=str_cats)
    right = Categorical(['x', 'x', 'x'], categories=str_cats)
    expected = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                           categories=str_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]),
                                expected)

def test_union_categoricals_ordered(self):
    """Ordered inputs union only when ordered-ness and categories agree."""
    ordered_cat = Categorical([1, 2, 3], ordered=True)
    unordered_cat = Categorical([1, 2, 3], ordered=False)

    # mixing ordered and unordered inputs is rejected
    with tm.assertRaisesRegexp(TypeError,
                               'Categorical.ordered must be the same'):
        union_categoricals([ordered_cat, unordered_cat])

    # an ordered categorical unioned with itself stays ordered
    expected = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
    tm.assert_categorical_equal(
        union_categoricals([ordered_cat, ordered_cat]), expected)

    # same ordered categories, one input containing NaN
    with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
    subset = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
    expected = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
    tm.assert_categorical_equal(union_categoricals([with_nan, subset]),
                                expected)

    # both ordered, but the category order differs
    ascending = Categorical([1, 2, 3], ordered=True)
    descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
    expected_msg = ("to union ordered Categoricals, all categories must be "
                    "the same")
    with tm.assertRaisesRegexp(TypeError, expected_msg):
        union_categoricals([ascending, descending])

def test_union_categoricals_sort(self):
    """Exercise ``sort_categories=True`` (GH 13763)."""
    # disjoint categories: values keep input order, categories are sorted
    left = Categorical(['x', 'y', 'z'])
    right = Categorical(['a', 'b', 'c'])
    expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                           categories=['a', 'b', 'c', 'x', 'y', 'z'])
    tm.assert_categorical_equal(
        union_categoricals([left, right], sort_categories=True), expected)

    # fastpath: identical but unsorted categories on both inputs
    shared = ['b', 'a', 'c']
    left = Categorical(['a', 'b'], categories=shared)
    right = Categorical(['b', 'c'], categories=shared)
    expected = Categorical(['a', 'b', 'b', 'c'],
                           categories=['a', 'b', 'c'])
    tm.assert_categorical_equal(
        union_categoricals([left, right], sort_categories=True), expected)

    # missing values mixed in with real values
    left = Categorical(['x', np.nan])
    right = Categorical([np.nan, 'b'])
    expected = Categorical(['x', np.nan, np.nan, 'b'],
                           categories=['b', 'x'])
    tm.assert_categorical_equal(
        union_categoricals([left, right], sort_categories=True), expected)

    # inputs made up only of missing values
    left = Categorical([np.nan])
    right = Categorical([np.nan])
    expected = Categorical([np.nan, np.nan], categories=[])
    tm.assert_categorical_equal(
        union_categoricals([left, right], sort_categories=True), expected)

    # empty inputs
    left = Categorical([])
    right = Categorical([])
    expected = Categorical([])
    tm.assert_categorical_equal(
        union_categoricals([left, right], sort_categories=True), expected)

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
70 changes: 46 additions & 24 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,28 +211,30 @@ def convert_categorical(x):
return Categorical(concatted, rawcats)


def union_categoricals(to_union):
def union_categoricals(to_union, sort_categories=False):
"""
Combine list-like of Categoricals, unioning categories. All
must have the same dtype, and none can be ordered.
categories must have the same dtype.

.. versionadded:: 0.19.0

Parameters
----------
to_union : list-like of Categoricals
sort_categories : boolean, default False
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just call this sort

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't that kind of ambiguous as to whether it will sort the categories or the values in the resulting Categorical?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also prefer sort_categories to be explicit.

BTW, I recommend @chris-b1 to open a separate PR on pandas master for open discussion.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure once you merge pandas-dev#13763, I can submit as a standalone PR

If true, resulting categories will be lexsorted, otherwise
they will be ordered as they appear in the data

Returns
-------
Categorical
A single array, categories will be ordered as they
appear in the list
result : Categorical

Raises
------
TypeError
If any of the categoricals are ordered or all do not
have the same dtype
- all inputs do not have the same dtype
- all inputs do not have the same ordered property
- all inputs are ordered and their categories are not identical
ValueError
Empty list of categoricals passed
"""
Expand All @@ -242,28 +244,48 @@ def union_categoricals(to_union):
raise ValueError('No Categoricals to union')

first = to_union[0]
if any(c.ordered for c in to_union):
raise TypeError("Can only combine unordered Categoricals")

if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
for c in to_union):
if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
for other in to_union[1:]):
raise TypeError("dtype of categories must be the same")

cats = first.categories
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
categories = Index(unique_cats)

new_codes = []
for c in to_union:
if len(c.categories) > 0:
indexer = categories.get_indexer(c.categories)
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered
new_codes = np.concatenate([c.codes for c in to_union])

if sort_categories:
categories = categories.sort_values()
indexer = first.categories.get_indexer(categories)
new_codes = take_1d(indexer, new_codes, fill_value=-1)
elif all(not c.ordered for c in to_union):
# different categories - union and recode
cats = first.categories.append([c.categories for c in to_union[1:]])
categories = Index(cats.unique())
if sort_categories:
categories = categories.sort_values()

new_codes = []
for c in to_union:
if len(c.categories) > 0:
indexer = categories.get_indexer(c.categories)
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
else:
# must be all NaN
new_codes.append(c.codes)
new_codes = np.concatenate(new_codes)
else:
# ordered - to show a proper error message
if all(c.ordered for c in to_union):
msg = ("to union ordered Categoricals, "
"all categories must be the same")
raise TypeError(msg)
else:
# must be all NaN
new_codes.append(c.codes)
raise TypeError('Categorical.ordered must be the same')

new_codes = np.concatenate(new_codes)
return Categorical(new_codes, categories=categories, ordered=False,
return Categorical(new_codes, categories=categories, ordered=ordered,
fastpath=True)


Expand Down