-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: MultiIndex.from_frame #23141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: MultiIndex.from_frame #23141
Changes from 38 commits
79bdecb
fa82618
64b45d6
64c7bb1
3ee676c
fd266f5
4bc8f5b
9d92b70
45595ad
3530cd3
1c22791
cf78780
64c2750
ede030b
190c341
e0df632
78ff5c2
0252db9
d98c8a9
8a1906e
08c120f
8353c3f
9df3c11
6d4915e
b5df7b2
ab3259c
cf95261
63051d7
a75a4a5
8d23df9
c8d696d
7cf82d1
1a282e5
b3c6a90
c760359
bb69314
9e11180
96c6af3
a5236bf
c78f364
14bfea8
6960804
11c5947
904644a
30fe0df
ec60563
8fc6609
9b906c6
e416122
4ef9ec4
4240a1e
9159b2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
|
||
# pylint: disable=E1101,E1103,W0232 | ||
import datetime | ||
from collections import OrderedDict | ||
from sys import getsizeof | ||
import warnings | ||
|
||
|
@@ -19,6 +20,7 @@ | |
is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, | ||
pandas_dtype) | ||
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype | ||
from pandas.core.dtypes.generic import ABCDataFrame | ||
from pandas.core.dtypes.missing import array_equivalent, isna | ||
|
||
import pandas.core.algorithms as algos | ||
|
@@ -180,6 +182,7 @@ class MultiIndex(Index): | |
from_arrays | ||
from_tuples | ||
from_product | ||
from_frame | ||
set_levels | ||
set_labels | ||
to_frame | ||
|
@@ -1184,11 +1187,17 @@ def to_frame(self, index=True, name=None): | |
else: | ||
idx_names = self.names | ||
|
||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
result = DataFrame({(name or level): | ||
self._get_level_values(level) | ||
for name, level in | ||
zip(idx_names, range(len(self.levels)))}, | ||
copy=False) | ||
# Guarantee resulting column order | ||
result = DataFrame( | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
OrderedDict([ | ||
((level if name is None else name), | ||
self._get_level_values(level)) | ||
for name, level in zip(idx_names, range(len(self.levels))) | ||
]), | ||
copy=False | ||
) | ||
|
||
|
||
if index: | ||
result.index = self | ||
return result | ||
|
@@ -1317,6 +1326,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): | |
MultiIndex.from_tuples : Convert list of tuples to MultiIndex. | ||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||
of iterables. | ||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||
""" | ||
if not is_list_like(arrays): | ||
raise TypeError("Input must be a list / sequence of array-likes.") | ||
|
@@ -1366,6 +1376,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): | |
MultiIndex.from_arrays : Convert list of arrays to MultiIndex | ||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||
of iterables | ||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||
""" | ||
if not is_list_like(tuples): | ||
raise TypeError('Input must be a list / sequence of tuple-likes.') | ||
|
@@ -1422,6 +1433,7 @@ def from_product(cls, iterables, sortorder=None, names=None): | |
-------- | ||
MultiIndex.from_arrays : Convert list of arrays to MultiIndex. | ||
MultiIndex.from_tuples : Convert list of tuples to MultiIndex. | ||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||
""" | ||
from pandas.core.arrays.categorical import _factorize_from_iterables | ||
from pandas.core.reshape.util import cartesian_product | ||
|
@@ -1435,6 +1447,89 @@ def from_product(cls, iterables, sortorder=None, names=None): | |
labels = cartesian_product(labels) | ||
return MultiIndex(levels, labels, sortorder=sortorder, names=names) | ||
|
||
@classmethod | ||
def from_frame(cls, df, sortorder=None, names=None): | ||
""" | ||
Make a MultiIndex from a DataFrame. | ||
|
||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
.. versionadded:: 0.24.0 | ||
|
||
Parameters | ||
---------- | ||
df : pd.DataFrame | ||
DataFrame to be converted to MultiIndex. | ||
sortorder : int or None | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Level of sortedness (must be lexicographically sorted by that | ||
level). | ||
names : list-like, optional | ||
If no names are provided, use the column names, or tuples of column | ||
names if the columns are a MultiIndex. If a sequence, overwrite | ||
names with the given sequence. | ||
|
||
Returns | ||
------- | ||
MultiIndex or Index | ||
The MultiIndex representation of the given DataFrame. | ||
|
||
Examples | ||
-------- | ||
>>> df = pd.DataFrame([[0, 'happy'], [0, 'jolly'], [1, 'happy'], | ||
... [1, 'jolly'], [2, 'joy'], [2, 'joy']], | ||
... columns=['will_be', 'used']) | ||
>>> df | ||
will_be used | ||
0 0 happy | ||
1 0 jolly | ||
2 1 happy | ||
3 1 jolly | ||
4 2 joy | ||
5 2 joy | ||
>>> pd.MultiIndex.from_frame(df) | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], | ||
labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
names=['will_be', 'used']) | ||
|
||
>>> df = pd.DataFrame([['ahc', 'iam'], ['ahc', 'wim'], ['boh', 'amg'], | ||
... ['boh', 'iam'], ['oil', 'wim'], ['oil', 'amg']], | ||
... columns=['will_be', 'overridden']) | ||
>>> df | ||
will_be overridden | ||
0 ahc iam | ||
1 ahc wim | ||
2 boh amg | ||
3 boh iam | ||
4 oil wim | ||
5 oil amg | ||
>>> pd.MultiIndex.from_frame(df, names=['sure', 'will']) | ||
MultiIndex(levels=[['ahc', 'boh', 'oil'], ['amg', 'iam', 'wim']], | ||
labels=[[0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]], | ||
names=['sure', 'will']) | ||
|
||
See Also | ||
-------- | ||
MultiIndex.from_arrays : Convert list of arrays to MultiIndex. | ||
MultiIndex.from_tuples : Convert list of tuples to MultiIndex. | ||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||
of iterables. | ||
""" | ||
if not isinstance(df, ABCDataFrame): | ||
raise TypeError("Input must be a DataFrame") | ||
|
||
column_names, columns = lzip(*df.iteritems()) | ||
|
||
# Get MultiIndex names | ||
if names is None: | ||
names = column_names | ||
elif is_list_like(names): | ||
if len(names) != len(df.columns): | ||
raise ValueError("'names' should have same length as " | ||
"number of columns in df.") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggested in my previous comment that all checks on
sufficient?! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup, removed the redundant code. Thanks. |
||
else: | ||
raise TypeError("'names' must be a list / sequence of column " | ||
"names.") | ||
|
||
return cls.from_arrays(columns, sortorder=sortorder, names=names) | ||
|
||
def _sort_levels_monotonic(self): | ||
""" | ||
.. versionadded:: 0.20.0 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import re | ||
from collections import OrderedDict | ||
|
||
import numpy as np | ||
import pytest | ||
|
@@ -99,6 +100,9 @@ def test_copy_in_constructor(): | |
assert mi.levels[0][0] == val | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# from_arrays | ||
# ---------------------------------------------------------------------------- | ||
def test_from_arrays(idx): | ||
arrays = [] | ||
for lev, lab in zip(idx.levels, idx.labels): | ||
|
@@ -271,6 +275,9 @@ def test_from_arrays_different_lengths(idx1, idx2): | |
MultiIndex.from_arrays([idx1, idx2]) | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# from_tuples | ||
# ---------------------------------------------------------------------------- | ||
def test_from_tuples(): | ||
msg = 'Cannot infer number of levels from empty list' | ||
with pytest.raises(TypeError, match=msg): | ||
|
@@ -314,6 +321,28 @@ def test_from_tuples_index_values(idx): | |
assert (result.values == idx.values).all() | ||
|
||
|
||
def test_tuples_with_name_string(): | ||
# GH 15110 and GH 14848 | ||
|
||
li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='abc') | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='a') | ||
|
||
|
||
def test_from_tuples_with_tuple_label(): | ||
# GH 15457 | ||
expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], | ||
columns=['a', 'b', 'c']).set_index(['a', 'b']) | ||
idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) | ||
result = pd.DataFrame([2, 3], columns=['c'], index=idx) | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# from_product | ||
# ---------------------------------------------------------------------------- | ||
def test_from_product_empty_zero_levels(): | ||
# 0 levels | ||
msg = "Must pass non-zero number of levels/labels" | ||
|
@@ -463,20 +492,71 @@ def test_create_index_existing_name(idx): | |
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_tuples_with_name_string(): | ||
# GH 15110 and GH 14848 | ||
|
||
li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='abc') | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='a') | ||
|
||
|
||
def test_from_tuples_with_tuple_label(): | ||
# GH 15457 | ||
expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], | ||
columns=['a', 'b', 'c']).set_index(['a', 'b']) | ||
idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) | ||
result = pd.DataFrame([2, 3], columns=['c'], index=idx) | ||
tm.assert_frame_equal(expected, result) | ||
# ---------------------------------------------------------------------------- | ||
# from_frame | ||
# ---------------------------------------------------------------------------- | ||
def test_from_frame(): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# GH 22420 | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=['L1', 'L2']) | ||
expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), | ||
('b', 'a'), ('b', 'b')], | ||
names=['L1', 'L2']) | ||
result = pd.MultiIndex.from_frame(df) | ||
tm.assert_index_equal(expected, result) | ||
|
||
|
||
@pytest.mark.parametrize('non_frame', [ | ||
pd.Series([1, 2, 3, 4]), | ||
[1, 2, 3, 4], | ||
[[1, 2], [3, 4], [5, 6]], | ||
pd.Index([1, 2, 3, 4]), | ||
np.array([[1, 2], [3, 4], [5, 6]]), | ||
27 | ||
]) | ||
def test_from_frame_non_frame(non_frame): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rename to test_from_frame_error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. |
||
# GH 22420 | ||
with tm.assert_raises_regex(TypeError, 'Input must be a DataFrame'): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed, thanks. |
||
pd.MultiIndex.from_frame(non_frame) | ||
|
||
|
||
def test_from_frame_dtype_fidelity(): | ||
# GH 22420 | ||
df = pd.DataFrame(OrderedDict([ | ||
('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), | ||
('a', [1, 1, 1, 2, 2, 2]), | ||
('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), | ||
('c', ['x', 'x', 'y', 'z', 'x', 'y']) | ||
])) | ||
original_dtypes = df.dtypes.to_dict() | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
expected_mi = pd.MultiIndex.from_arrays([ | ||
pd.date_range('19910905', periods=6, tz='US/Eastern'), | ||
[1, 1, 1, 2, 2, 2], | ||
pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), | ||
['x', 'x', 'y', 'z', 'x', 'y'] | ||
], names=['dates', 'a', 'b', 'c']) | ||
mi = pd.MultiIndex.from_frame(df) | ||
mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} | ||
|
||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tm.assert_index_equal(expected_mi, mi) | ||
assert original_dtypes == mi_dtypes | ||
|
||
|
||
@pytest.mark.parametrize('names_in,names_out', [ | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(None, [('L1', 'x'), ('L2', 'y')]), | ||
(['x', 'y'], ['x', 'y']), | ||
('bad_input', None), | ||
]) | ||
def test_from_frame_names(names_in, names_out): | ||
# GH 22420 | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=pd.MultiIndex.from_tuples([('L1', 'x'), | ||
('L2', 'y')])) | ||
if names_out is None: | ||
with tm.assert_raises_regex(TypeError, "'names' must be a list / " | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"sequence of column names."): | ||
pd.MultiIndex.from_frame(df, names=names_in) | ||
else: | ||
mi = pd.MultiIndex.from_frame(df, names=names_in) | ||
assert mi.names == names_out |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from collections import OrderedDict | ||
|
||
import pytest | ||
import numpy as np | ||
|
||
|
@@ -83,6 +85,39 @@ def test_to_frame(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_to_frame_dtype_fidelity(): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# GH 22420 | ||
mi = pd.MultiIndex.from_arrays([ | ||
pd.date_range('19910905', periods=6, tz='US/Eastern'), | ||
[1, 1, 1, 2, 2, 2], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this a repeated test of the above, if so, then not necessary here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test was at the suggestion of @TomAugspurger |
||
pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), | ||
['x', 'x', 'y', 'z', 'x', 'y'] | ||
], names=['dates', 'a', 'b', 'c']) | ||
original_dtypes = {name: mi.levels[i].dtype | ||
for i, name in enumerate(mi.names)} | ||
|
||
expected_df = pd.DataFrame(OrderedDict([ | ||
('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), | ||
('a', [1, 1, 1, 2, 2, 2]), | ||
('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), | ||
('c', ['x', 'x', 'y', 'z', 'x', 'y']) | ||
])) | ||
df = mi.to_frame(index=False) | ||
df_dtypes = df.dtypes.to_dict() | ||
|
||
tm.assert_frame_equal(df, expected_df) | ||
assert original_dtypes == df_dtypes | ||
|
||
|
||
def test_to_frame_resulting_column_order(): | ||
# GH 22420 | ||
expected = ['z', 0, 'a'] | ||
mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'], | ||
['q', 'w', 'e']], names=expected) | ||
result = mi.to_frame().columns.tolist() | ||
assert result == expected | ||
|
||
|
||
def test_to_hierarchical(): | ||
index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( | ||
2, 'two')]) | ||
|
Uh oh!
There was an error while loading. Please reload this page.