From 80516ff80d3674b36ec8405558f4c7bea25cb838 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 21 Apr 2017 19:25:43 -0400 Subject: [PATCH] BUG: fix degenerate MultiIndex sorting xref #15694 closes #15797 --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/frame.py | 3 +++ pandas/core/indexes/base.py | 4 ++++ pandas/core/reshape/reshape.py | 9 ++++++++- pandas/core/series.py | 1 + pandas/tests/test_multilevel.py | 33 +++++++++++++++++++-------------- 6 files changed, 36 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 781e90a39e48f..945922b5f9ba8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -870,7 +870,7 @@ DataFrame.sort_index changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort. -This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`) +This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`, :issue:`15797`) This is *unchanged* from prior versions, but shown for illustration purposes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dd2b975560186..b3da897b97e5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3364,6 +3364,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis=baxis, convert=False, verify=False) + # reconstruct axis if needed + new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + if inplace: return self._update_inplace(new_data) else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dcb9f9a144f39..04458d684d795 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -465,6 +465,10 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") + def _sort_levels_monotonic(self): + """ compat with MultiIndex """ + return self + _index_shared_docs['_get_grouper_for_level'] = """ Get index grouper corresponding to an index level diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index bfd5320af13fb..a3cf80d758b7b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -650,8 +650,15 @@ def _convert_level_number(level_num, columns): drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) - slice_len = loc.stop - loc.start + # can make more efficient? + # we almost always return a slice + # but if unsorted can get a boolean + # indexer + if not isinstance(loc, slice): + slice_len = len(loc) + else: + slice_len = loc.stop - loc.start if slice_len == 0: drop_cols.append(key) diff --git a/pandas/core/series.py b/pandas/core/series.py index e0364ad629c5d..d4511fb58b2f3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1773,6 +1773,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, indexer = _ensure_platform_int(indexer) new_index = index.take(indexer) + new_index = new_index._sort_levels_monotonic() new_values = self._values.take(indexer) result = self._constructor(new_values, index=new_index) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index d7ba7f1c6fac6..a7a80c635a364 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -11,7 +11,6 @@ from pandas.core.index import Index, MultiIndex from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp -from pandas.core.common import UnsortedIndexError from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas.core.common as com import pandas.util.testing as tm @@ -938,7 +937,7 @@ def test_stack_mixed_dtype(self): df = df.sort_index(level=1, axis=1) stacked = df.stack() - result = df['foo'].stack() + result = df['foo'].stack().sort_index() tm.assert_series_equal(stacked['foo'], result, check_names=False) self.assertIs(result.name, None) self.assertEqual(stacked['bar'].dtype, np.float_) @@ -2456,11 +2455,11 @@ def test_frame_getitem_not_sorted2(self): assert df2_original.index.equals(df2.index) expected = df2.sort_index() - assert not expected.index.is_lexsorted() + assert expected.index.is_lexsorted() assert expected.index.is_monotonic result = df2.sort_index(level=0) - assert not result.index.is_lexsorted() + assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) @@ -2536,8 +2535,7 @@ def test_sort_index_and_reconstruction(self): concatted = pd.concat([df, df], keys=[0.8, 0.5]) result = concatted.sort_index() - # this will be monotonic, but not lexsorted! - assert not result.index.is_lexsorted() + assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) @@ -2576,7 +2574,7 @@ def test_sort_index_and_reconstruction_doc_example(self): levels=[['a', 'b'], ['aa', 'bb']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) result = df.sort_index() - assert not result.index.is_lexsorted() + assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) @@ -2618,22 +2616,29 @@ def my_func(group): def test_sort_non_lexsorted(self): # degenerate case where we sort but don't # have a satisfying result :< - + # GH 15797 idx = MultiIndex([['A', 'B', 'C'], ['c', 'b', 'a']], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]]) - df = DataFrame({'col': range(len(idx))}, index=idx) + df = DataFrame({'col': range(len(idx))}, + index=idx, + dtype='int64') assert df.index.is_lexsorted() is False assert df.index.is_monotonic is False - result = df.sort_index() - assert result.index.is_lexsorted() is False - assert result.index.is_monotonic is True + sorted = df.sort_index() + assert sorted.index.is_lexsorted() is True + assert sorted.index.is_monotonic is True - with pytest.raises(UnsortedIndexError): - result.loc[pd.IndexSlice['B':'C', 'a':'c'], :] + expected = DataFrame( + {'col': [1, 4, 5, 2]}, + index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'), + ('C', 'a'), ('C', 'b')]), + dtype='int64') + result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :] + tm.assert_frame_equal(result, expected) def test_sort_index_nan(self): # GH 14784