-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: nullable dtypes not preserved in Series.replace #44940
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
91ffff9
ddd71b6
83f3ca1
56c8b84
9b3891b
f3d9b40
569db46
2351cbb
b7679d3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
from pandas.core.arrays import IntervalArray | ||
|
||
|
||
class TestSeriesReplace: | ||
|
@@ -148,20 +149,21 @@ def test_replace_with_single_list(self): | |
tm.assert_series_equal(s, ser) | ||
|
||
def test_replace_mixed_types(self): | ||
s = pd.Series(np.arange(5), dtype="int64") | ||
ser = pd.Series(np.arange(5), dtype="int64") | ||
|
||
def check_replace(to_rep, val, expected): | ||
sc = s.copy() | ||
r = s.replace(to_rep, val) | ||
sc = ser.copy() | ||
result = ser.replace(to_rep, val) | ||
return_value = sc.replace(to_rep, val, inplace=True) | ||
assert return_value is None | ||
tm.assert_series_equal(expected, r) | ||
tm.assert_series_equal(expected, result) | ||
tm.assert_series_equal(expected, sc) | ||
|
||
# MUST upcast to float | ||
e = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) | ||
# 3.0 can still be held in our int64 series, so we do not upcast GH#44940 | ||
tr, v = [3], [3.0] | ||
check_replace(tr, v, e) | ||
check_replace(tr, v, ser) | ||
# Note this matches what we get with the scalars 3 and 3.0 | ||
check_replace(tr[0], v[0], ser) | ||
|
||
# MUST upcast to float | ||
e = pd.Series([0, 1, 2, 3.5, 4]) | ||
|
@@ -257,10 +259,10 @@ def test_replace2(self): | |
assert (ser[20:30] == -1).all() | ||
|
||
def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): | ||
# GH 32621 | ||
s = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) | ||
expected = pd.Series(["1", "2", np.nan]) | ||
result = s.replace({"one": "1", "two": "2"}) | ||
# GH 32621, GH#44940 | ||
ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) | ||
expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype) | ||
result = ser.replace({"one": "1", "two": "2"}) | ||
tm.assert_series_equal(expected, result) | ||
|
||
def test_replace_with_empty_dictlike(self): | ||
|
@@ -305,17 +307,18 @@ def test_replace_mixed_types_with_string(self): | |
"categorical, numeric", | ||
[ | ||
(pd.Categorical(["A"], categories=["A", "B"]), [1]), | ||
(pd.Categorical(("A",), categories=["A", "B"]), [1]), | ||
(pd.Categorical(("A", "B"), categories=["A", "B"]), [1, 2]), | ||
(pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), | ||
], | ||
) | ||
def test_replace_categorical(self, categorical, numeric): | ||
# GH 24971 | ||
# Do not check if dtypes are equal due to a known issue that | ||
# Categorical.replace sometimes coerces to object (GH 23305) | ||
s = pd.Series(categorical) | ||
result = s.replace({"A": 1, "B": 2}) | ||
expected = pd.Series(numeric) | ||
# GH 24971, GH#23305 | ||
ser = pd.Series(categorical) | ||
result = ser.replace({"A": 1, "B": 2}) | ||
expected = pd.Series(numeric).astype("category") | ||
Review comment: Umm, why is this a resulting categorical? Reply: Because that's what someone implemented for the Categorical.replace behavior. I'm on the fence about it, but for now I think we need to be consistent with it. Reply: Hmm, OK, let's open an issue about this. I don't think we should raise (or coerce to object) rather than return a new categorical (but maybe others disagree). Reply: Can you clarify? Based on your earlier comment I expected the opposite opinion, so I think the "don't" may be an unintentional double negative. |
||
if 2 not in expected.cat.categories: | ||
# i.e. categories should be [1, 2] even if there are no "B"s present | ||
# GH#44940 | ||
expected = expected.cat.add_categories(2) | ||
tm.assert_series_equal(expected, result) | ||
|
||
def test_replace_categorical_single(self): | ||
|
@@ -514,3 +517,90 @@ def test_pandas_replace_na(self): | |
result = ser.replace(regex_mapping, regex=True) | ||
exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") | ||
tm.assert_series_equal(result, exp) | ||
|
||
@pytest.mark.parametrize( | ||
"dtype, input_data, to_replace, expected_data", | ||
[ | ||
("bool", [True, False], {True: False}, [False, False]), | ||
("int64", [1, 2], {1: 10, 2: 20}, [10, 20]), | ||
("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]), | ||
("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), | ||
("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), | ||
("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]), | ||
( | ||
pd.IntervalDtype("int64"), | ||
IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]), | ||
{pd.Interval(1, 2): pd.Interval(10, 20)}, | ||
IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]), | ||
), | ||
( | ||
pd.IntervalDtype("float64"), | ||
IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]), | ||
{pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)}, | ||
IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]), | ||
), | ||
( | ||
pd.PeriodDtype("M"), | ||
[pd.Period("2020-05", freq="M")], | ||
{pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")}, | ||
[pd.Period("2020-06", freq="M")], | ||
), | ||
], | ||
) | ||
def test_replace_dtype(self, dtype, input_data, to_replace, expected_data): | ||
# GH#33484 | ||
ser = pd.Series(input_data, dtype=dtype) | ||
result = ser.replace(to_replace) | ||
expected = pd.Series(expected_data, dtype=dtype) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_replace_string_dtype(self): | ||
# GH#40732, GH#44940 | ||
ser = pd.Series(["one", "two", np.nan], dtype="string") | ||
res = ser.replace({"one": "1", "two": "2"}) | ||
expected = pd.Series(["1", "2", np.nan], dtype="string") | ||
tm.assert_series_equal(res, expected) | ||
|
||
# GH#31644 | ||
ser2 = pd.Series(["A", np.nan], dtype="string") | ||
res2 = ser2.replace("A", "B") | ||
expected2 = pd.Series(["B", np.nan], dtype="string") | ||
tm.assert_series_equal(res2, expected2) | ||
|
||
ser3 = pd.Series(["A", "B"], dtype="string") | ||
res3 = ser3.replace("A", pd.NA) | ||
expected3 = pd.Series([pd.NA, "B"], dtype="string") | ||
tm.assert_series_equal(res3, expected3) | ||
|
||
def test_replace_string_dtype_list_to_replace(self): | ||
# GH#41215, GH#44940 | ||
ser = pd.Series(["abc", "def"], dtype="string") | ||
res = ser.replace(["abc", "any other string"], "xyz") | ||
expected = pd.Series(["xyz", "def"], dtype="string") | ||
tm.assert_series_equal(res, expected) | ||
|
||
def test_replace_string_dtype_regex(self): | ||
# GH#31644 | ||
ser = pd.Series(["A", "B"], dtype="string") | ||
res = ser.replace(r".", "C", regex=True) | ||
expected = pd.Series(["C", "C"], dtype="string") | ||
tm.assert_series_equal(res, expected) | ||
|
||
def test_replace_nullable_numeric(self): | ||
# GH#40732, GH#44940 | ||
|
||
floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype()) | ||
assert floats.replace({1.0: 9}).dtype == floats.dtype | ||
assert floats.replace(1.0, 9).dtype == floats.dtype | ||
assert floats.replace({1.0: 9.0}).dtype == floats.dtype | ||
assert floats.replace(1.0, 9.0).dtype == floats.dtype | ||
|
||
res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0]) | ||
assert res.dtype == floats.dtype | ||
|
||
ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype()) | ||
assert ints.replace({1: 9}).dtype == ints.dtype | ||
assert ints.replace(1, 9).dtype == ints.dtype | ||
assert ints.replace({1: 9.0}).dtype == ints.dtype | ||
assert ints.replace(1, 9.0).dtype == ints.dtype | ||
# FIXME: ints.replace({1: 9.5}) raises bc of incorrect _can_hold_element |
Uh oh!
There was an error while loading. Please reload this page.