Skip to content

BUG: fix degenerate MultiIndex sorting #16092

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ DataFrame.sort_index changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
This would happen with ``lexsorted``, but non-monotonic, levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`, :issue:`15797`)

This is *unchanged* from prior versions, but shown for illustration purposes:

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3364,6 +3364,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
axis=baxis,
convert=False, verify=False)

# reconstruct axis if needed
new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()

if inplace:
return self._update_inplace(new_data)
else:
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,10 @@ def _update_inplace(self, result, **kwargs):
# guard when called from IndexOpsMixin
raise TypeError("Index can't be updated inplace")

def _sort_levels_monotonic(self):
""" compat with MultiIndex; nothing is reordered here, self is returned as-is """
return self

_index_shared_docs['_get_grouper_for_level'] = """
Get index grouper corresponding to an index level

Expand Down
9 changes: 8 additions & 1 deletion pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,8 +650,15 @@ def _convert_level_number(level_num, columns):
drop_cols = []
for key in unique_groups:
loc = this.columns.get_loc(key)
slice_len = loc.stop - loc.start

# can make more efficient?
# we almost always return a slice
# but if unsorted can get a boolean
# indexer
if not isinstance(loc, slice):
slice_len = len(loc)
else:
slice_len = loc.stop - loc.start

if slice_len == 0:
drop_cols.append(key)
Expand Down
1 change: 1 addition & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1773,6 +1773,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,

indexer = _ensure_platform_int(indexer)
new_index = index.take(indexer)
new_index = new_index._sort_levels_monotonic()

new_values = self._values.take(indexer)
result = self._constructor(new_values, index=new_index)
Expand Down
33 changes: 19 additions & 14 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from pandas.core.index import Index, MultiIndex
from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp

from pandas.core.common import UnsortedIndexError
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
import pandas.core.common as com
import pandas.util.testing as tm
Expand Down Expand Up @@ -938,7 +937,7 @@ def test_stack_mixed_dtype(self):
df = df.sort_index(level=1, axis=1)

stacked = df.stack()
result = df['foo'].stack()
result = df['foo'].stack().sort_index()
tm.assert_series_equal(stacked['foo'], result, check_names=False)
self.assertIs(result.name, None)
self.assertEqual(stacked['bar'].dtype, np.float_)
Expand Down Expand Up @@ -2456,11 +2455,11 @@ def test_frame_getitem_not_sorted2(self):

assert df2_original.index.equals(df2.index)
expected = df2.sort_index()
assert not expected.index.is_lexsorted()
assert expected.index.is_lexsorted()
assert expected.index.is_monotonic

result = df2.sort_index(level=0)
assert not result.index.is_lexsorted()
assert result.index.is_lexsorted()
assert result.index.is_monotonic
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -2536,8 +2535,7 @@ def test_sort_index_and_reconstruction(self):
concatted = pd.concat([df, df], keys=[0.8, 0.5])
result = concatted.sort_index()

# this will be monotonic, but not lexsorted!
assert not result.index.is_lexsorted()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -2576,7 +2574,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
levels=[['a', 'b'], ['aa', 'bb']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
result = df.sort_index()
assert not result.index.is_lexsorted()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -2618,22 +2616,29 @@ def my_func(group):
def test_sort_non_lexsorted(self):
# degenerate case: the index is neither lexsorted nor monotonic,
# so sort_index has to reconstruct it for the result to be sliceable

# GH 15797
idx = MultiIndex([['A', 'B', 'C'],
['c', 'b', 'a']],
[[0, 1, 2, 0, 1, 2],
[0, 2, 1, 1, 0, 2]])

df = DataFrame({'col': range(len(idx))}, index=idx)
df = DataFrame({'col': range(len(idx))},
index=idx,
dtype='int64')
assert df.index.is_lexsorted() is False
assert df.index.is_monotonic is False

result = df.sort_index()
assert result.index.is_lexsorted() is False
assert result.index.is_monotonic is True
sorted = df.sort_index()
assert sorted.index.is_lexsorted() is True
assert sorted.index.is_monotonic is True

with pytest.raises(UnsortedIndexError):
result.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
expected = DataFrame(
{'col': [1, 4, 5, 2]},
index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'),
('C', 'a'), ('C', 'b')]),
dtype='int64')
result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
tm.assert_frame_equal(result, expected)

def test_sort_index_nan(self):
# GH 14784
Expand Down