-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: MultiIndex not dropping nan level and invalid code value #26408
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
e0d3bb4
af2ad91
248177b
b6d29bd
041b4cd
933615b
d4227c6
b66d3a4
81ebfda
7dfd698
ec265fa
af35fa3
6ec67b3
28c953f
055999d
d9db853
a141e40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -242,11 +242,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, | |
result.sortorder = sortorder | ||
|
||
if verify_integrity: | ||
result._verify_integrity() | ||
new_codes = result._verify_integrity() | ||
result._codes = new_codes | ||
|
||
if _set_identity: | ||
result._reset_identity() | ||
|
||
return result | ||
|
||
def _validate_codes(self, level: list, code: list): | ||
""" | ||
Reassign code values as -1 if their corresponding levels are NaN. | ||
|
||
Parameters | ||
---------- | ||
code : list | ||
Code to reassign. | ||
level : list | ||
Level to check for missing values (NaN, NaT, None). | ||
|
||
Returns | ||
------- | ||
code : new code where code value = -1 if it corresponds | ||
to a level with missing values (NaN, NaT, None). | ||
""" | ||
null_mask = isna(level) | ||
if np.any(null_mask): | ||
code = np.where(null_mask[code], -1, code) | ||
return code | ||
|
||
def _verify_integrity(self, codes=None, levels=None): | ||
""" | ||
|
||
|
@@ -262,6 +286,11 @@ def _verify_integrity(self, codes=None, levels=None): | |
ValueError | ||
If length of levels and codes don't match, if the codes for any | ||
level would exceed level bounds, or there are any duplicate levels. | ||
|
||
Returns | ||
------- | ||
codes : new codes where code value = -1 if it corresponds to a | ||
NaN level. | ||
""" | ||
# NOTE: Currently does not check, among other things, that cached | ||
# nlevels matches nor that sortorder matches actually sortorder. | ||
|
@@ -271,22 +300,33 @@ def _verify_integrity(self, codes=None, levels=None): | |
if len(levels) != len(codes): | ||
raise ValueError("Length of levels and codes must match. NOTE:" | ||
" this index is in an inconsistent state.") | ||
codes_length = len(self.codes[0]) | ||
codes_length = len(codes[0]) | ||
for i, (level, level_codes) in enumerate(zip(levels, codes)): | ||
if len(level_codes) != codes_length: | ||
raise ValueError("Unequal code lengths: %s" % | ||
([len(code_) for code_ in codes])) | ||
if len(level_codes) and level_codes.max() >= len(level): | ||
raise ValueError("On level %d, code max (%d) >= length of" | ||
" level (%d). NOTE: this index is in an" | ||
" inconsistent state" % (i, level_codes.max(), | ||
len(level))) | ||
msg = ("On level {level}, code max ({max_code}) >= length of " | ||
"level ({level_len}). NOTE: this index is in an " | ||
"inconsistent state".format( | ||
level=i, max_code=level_codes.max(), | ||
level_len=len(level))) | ||
raise ValueError(msg) | ||
if len(level_codes) and level_codes.min() < -1: | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
raise ValueError("On level {level}, code value ({code})" | ||
" < -1".format( | ||
level=i, code=level_codes.min())) | ||
if not level.is_unique: | ||
raise ValueError("Level values must be unique: {values} on " | ||
"level {level}".format( | ||
values=[value for value in level], | ||
level=i)) | ||
|
||
codes = [self._validate_codes(level, code) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for level, code in zip(levels, codes)] | ||
new_codes = FrozenList(codes) | ||
return new_codes | ||
|
||
@classmethod | ||
def from_arrays(cls, arrays, sortorder=None, names=None): | ||
""" | ||
|
@@ -585,7 +625,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, | |
new_levels = FrozenList(new_levels) | ||
|
||
if verify_integrity: | ||
self._verify_integrity(levels=new_levels) | ||
new_codes = self._verify_integrity(levels=new_levels) | ||
self._codes = new_codes | ||
|
||
names = self.names | ||
self._levels = new_levels | ||
|
@@ -675,7 +716,6 @@ def labels(self): | |
|
||
def _set_codes(self, codes, level=None, copy=False, validate=True, | ||
verify_integrity=False): | ||
|
||
if validate and level is None and len(codes) != self.nlevels: | ||
raise ValueError("Length of codes must match number of levels") | ||
if validate and level is not None and len(codes) != len(level): | ||
|
@@ -695,9 +735,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, | |
new_codes = FrozenList(new_codes) | ||
|
||
if verify_integrity: | ||
self._verify_integrity(codes=new_codes) | ||
new_codes = self._verify_integrity(codes=new_codes) | ||
|
||
self._codes = new_codes | ||
|
||
self._tuples = None | ||
self._reset_cache() | ||
|
||
|
@@ -1760,9 +1801,10 @@ def __setstate__(self, state): | |
|
||
self._set_levels([Index(x) for x in levels], validate=False) | ||
self._set_codes(codes) | ||
new_codes = self._verify_integrity() | ||
self._set_codes(new_codes) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't we end up calling verify_integrity twice? here (and in _set_codes)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that the set_codes are called with default verify_integrity=False, verification would not be called twice |
||
self._set_names(names) | ||
self.sortorder = sortorder | ||
self._verify_integrity() | ||
self._reset_identity() | ||
|
||
def __getitem__(self, key): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,9 +64,10 @@ def test_constructor_mismatched_codes_levels(idx): | |
with pytest.raises(ValueError, match=msg): | ||
MultiIndex(levels=levels, codes=codes) | ||
|
||
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." | ||
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." | ||
" NOTE: this index is in an inconsistent state") | ||
label_error = r"Unequal code lengths: \[4, 2\]" | ||
code_value_error = r"On level 0, code value \(-2\) < -1" | ||
|
||
# important to check that it's looking at the right thing. | ||
with pytest.raises(ValueError, match=length_error): | ||
|
@@ -83,6 +84,44 @@ def test_constructor_mismatched_codes_levels(idx): | |
with pytest.raises(ValueError, match=label_error): | ||
idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) | ||
|
||
# test set_codes with verify_integrity=False | ||
# the setting should not raise any value error | ||
idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], | ||
verify_integrity=False) | ||
|
||
# code value smaller than -1 | ||
with pytest.raises(ValueError, match=code_value_error): | ||
MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) | ||
|
||
|
||
def test_na_levels(): | ||
# GH26408 | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# test if codes are re-assigned value -1 for levels | ||
# with mising values (NaN, NaT, None) | ||
result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], | ||
codes=[[0, -1, 1, 2, 3, 4]]) | ||
expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], | ||
codes=[[-1, -1, -1, -1, 3, 4]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[0, -1, 1, 2, 3, 4]]) | ||
expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[-1, -1, 1, -1, 3, -1]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
# verify set_levels and set_codes | ||
result = MultiIndex( | ||
levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( | ||
[[np.nan, 's', pd.NaT, 128, None]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = MultiIndex( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you replace the actual OP tests as well (e.g. the .dropna()); I don't think they are very different that what you have, but they also attempt to .dropna() There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dropna tests added, and green now. |
||
levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[1, 2, 2, 2, 2, 2]]).set_codes( | ||
[[0, -1, 1, 2, 3, 4]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_labels_deprecated(idx): | ||
# GH23752 | ||
|
Uh oh!
There was an error while loading. Please reload this page.