-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
TST/CLN: deduplicate troublesome rank values #38894
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
b6e52de
c43318c
863e4d6
da4c76b
b816586
c77a84f
4961e38
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,8 @@ | |
import pytest | ||
from pytz import FixedOffset, utc | ||
|
||
from pandas._libs import iNaT | ||
from pandas._libs.algos import Infinity, NegInfinity | ||
import pandas.util._test_decorators as td | ||
|
||
from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype | ||
|
@@ -591,6 +593,91 @@ def narrow_series(request): | |
return _narrow_series[request.param].copy() | ||
|
||
|
||
_dtype_nuisance_arr_map = { | ||
Review comment: put rank in the name. Reply: Done. |
||
"float64": [ | ||
-np.inf, | ||
-50, | ||
-1, | ||
-1e-20, | ||
-1e-25, | ||
-1e-50, | ||
0, | ||
1e-40, | ||
1e-20, | ||
1e-10, | ||
2, | ||
40, | ||
np.inf, | ||
], | ||
"float32": [ | ||
-np.inf, | ||
-50, | ||
-1, | ||
-1e-20, | ||
-1e-25, | ||
-1e-45, | ||
0, | ||
1e-40, | ||
1e-20, | ||
1e-10, | ||
2, | ||
40, | ||
np.inf, | ||
], | ||
"uint8": [np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], | ||
"int64": [ | ||
np.iinfo(np.int64).min, | ||
-100, | ||
0, | ||
1, | ||
9999, | ||
100000, | ||
1e10, | ||
np.iinfo(np.int64).max, | ||
], | ||
"object": [NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], | ||
} | ||
|
||
_dtype_na_map = { | ||
Review comment: can you put this inside the fixture? This is generally a special case. Reply: Done. |
||
"float64": np.nan, | ||
"float32": np.nan, | ||
"int64": iNaT, | ||
"object": None, | ||
} | ||
|
||
|
||
@pytest.fixture(params=list(_dtype_nuisance_arr_map))
def nuisance_rank_series_and_expected(request):
    """
    Fixture for Series with troublesome values for rank algorithms.

    Parametrized over the dtype names that are keys of
    ``_dtype_nuisance_arr_map``. For each dtype the listed values are taken
    in their written order, which defines the expected ranks ``1.0 .. n``.

    Returns
    -------
    tuple of (Series, Series)
        ``(values, expected)`` where ``values`` holds the shuffled test data
        (with missing values mixed in when the dtype has an entry in
        ``_dtype_na_map``) and ``expected`` holds the matching float64
        ranks, with NaN at the positions of the inserted missing values.
    """
    dtype = request.param
    if dtype == "int64":
        # The int64 case is marked as an expected failure on the requesting
        # test. Note the trailing space so the two literals join correctly.
        mark = pytest.mark.xfail(
            reason="iNaT is equivalent to minimum value of dtype "
            "int64 pending issue GH#16674"
        )
        request.node.add_marker(mark)

    values = np.array(_dtype_nuisance_arr_map[dtype], dtype=dtype)
    exp_order = np.arange(1, len(values) + 1, dtype="float64")

    # Insert nans at random positions if underlying dtype has missing
    # value. Then adjust the expected order by adding nans accordingly.
    # This is for testing whether rank calculation is affected
    # when values are intertwined with nan values.
    if dtype in _dtype_na_map:
        na_value = _dtype_na_map[dtype]
        # The same insertion points are applied to both arrays so that the
        # values and their expected ranks stay aligned.
        nan_indices = np.random.choice(len(values), 5)
        values = np.insert(values, nan_indices, na_value)
        exp_order = np.insert(exp_order, nan_indices, np.nan)

    # shuffle the testing array and expected results in the same way
    random_order = np.random.permutation(len(values))
    iseries = Series(values[random_order])
    exp = Series(exp_order[random_order], dtype="float64")
    return iseries, exp
|
||
|
||
_index_or_series_objs = {**indices_dict, **_series, **_narrow_series} | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,6 @@ | |
import numpy as np | ||
import pytest | ||
|
||
from pandas._libs import iNaT | ||
from pandas._libs.algos import Infinity, NegInfinity | ||
import pandas.util._test_decorators as td | ||
|
||
|
@@ -206,90 +205,10 @@ def test_rank_signature(self): | |
with pytest.raises(ValueError, match=msg): | ||
s.rank("average") | ||
|
||
@pytest.mark.parametrize(
    "contents,dtype",
    [
        (
            [
                -np.inf,
                -50,
                -1,
                -1e-20,
                -1e-25,
                -1e-50,
                0,
                1e-40,
                1e-20,
                1e-10,
                2,
                40,
                np.inf,
            ],
            "float64",
        ),
        (
            [
                -np.inf,
                -50,
                -1,
                -1e-20,
                -1e-25,
                -1e-45,
                0,
                1e-40,
                1e-20,
                1e-10,
                2,
                40,
                np.inf,
            ],
            "float32",
        ),
        ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
        pytest.param(
            [
                np.iinfo(np.int64).min,
                -100,
                0,
                1,
                9999,
                100000,
                1e10,
                np.iinfo(np.int64).max,
            ],
            "int64",
            # Trailing space keeps the two literals from fusing into
            # "dtypeint64" in the reported reason.
            marks=pytest.mark.xfail(
                reason="iNaT is equivalent to minimum value of dtype "
                "int64 pending issue GH#16674"
            ),
        ),
        ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
    ],
)
def test_rank_inf(self, contents, dtype):
    """
    Check ``Series.rank()`` on troublesome values mixed with missing values.

    ``contents`` is taken in its written order, which defines the expected
    ranks ``1.0 .. n``; the data and expected ranks are then shuffled with
    the same permutation before comparing.
    """
    dtype_na_map = {
        "float64": np.nan,
        "float32": np.nan,
        "int64": iNaT,
        "object": None,
    }
    # Insert nans at random positions if underlying dtype has missing
    # value. Then adjust the expected order by adding nans accordingly.
    # This is for testing whether rank calculation is affected
    # when values are intertwined with nan values.
    values = np.array(contents, dtype=dtype)
    exp_order = np.arange(1, len(values) + 1, dtype="float64")
    if dtype in dtype_na_map:
        na_value = dtype_na_map[dtype]
        # The same insertion points are applied to both arrays so that the
        # values and their expected ranks stay aligned.
        nan_indices = np.random.choice(len(values), 5)
        values = np.insert(values, nan_indices, na_value)
        exp_order = np.insert(exp_order, nan_indices, np.nan)
    # shuffle the testing array and expected results in the same way
    random_order = np.random.permutation(len(values))
    iseries = Series(values[random_order])
    exp = Series(exp_order[random_order], dtype="float64")
    iranks = iseries.rank()
    tm.assert_series_equal(iranks, exp)
def test_rank_inf(self, nuisance_rank_series_and_expected): | ||
series, expected = nuisance_rank_series_and_expected | ||
Review comment: actually an alternative here is simply to use [suggestion text missing from this capture]. Reply: yah i think this makes a lot more sense than a relatively-complicated fixture in conftest. Reply: Thanks, that is much nicer; good pattern to know. |
||
result = series.rank() | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_rank_tie_methods(self): | ||
s = self.s | ||
|
Review comment: can you add a comment here about where this is used
Reply: Done