From a560efa67c2700e8d32da31fc6100495393b80f2 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 14 Jul 2022 18:33:51 -0400 Subject: [PATCH 1/2] BUG: DataFrame.corrwith and DataFrameGroupBy.cummin/cummax with numeric_only=True --- pandas/core/frame.py | 13 +++-- pandas/core/groupby/groupby.py | 12 +++- pandas/tests/groupby/test_function.py | 82 ++++++++++++++++++++++++++- 3 files changed, 99 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ead4ea744c647..0da21145a1a7a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10550,7 +10550,8 @@ def corrwith( else: return this.apply(lambda x: other.corr(x, method=method), axis=axis) - other = other._get_numeric_data() + if numeric_only_bool: + other = other._get_numeric_data() left, right = this.align(other, join="inner", copy=False) if axis == 1: @@ -10563,11 +10564,15 @@ def corrwith( right = right + left * 0 # demeaned data - ldem = left - left.mean() - rdem = right - right.mean() + ldem = left - left.mean(numeric_only=numeric_only_bool) + rdem = right - right.mean(numeric_only=numeric_only_bool) num = (ldem * rdem).sum() - dom = (left.count() - 1) * left.std() * right.std() + dom = ( + (left.count() - 1) + * left.std(numeric_only=numeric_only_bool) + * right.std(numeric_only=numeric_only_bool) + ) correl = num / dom diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 89e47af4cb614..09545aa5c3184 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3630,7 +3630,11 @@ def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT: skipna = kwargs.get("skipna", True) if axis != 0: f = lambda x: np.minimum.accumulate(x, axis) - return self._python_apply_general(f, self._selected_obj, is_transform=True) + numeric_only_bool = self._resolve_numeric_only("cummin", numeric_only, axis) + obj = self._selected_obj + if numeric_only_bool: + obj = obj._get_numeric_data() + return self._python_apply_general(f, 
obj, is_transform=True) return self._cython_transform( "cummin", numeric_only=numeric_only, skipna=skipna @@ -3650,7 +3654,11 @@ def cummax(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT: skipna = kwargs.get("skipna", True) if axis != 0: f = lambda x: np.maximum.accumulate(x, axis) - return self._python_apply_general(f, self._selected_obj, is_transform=True) + numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) + obj = self._selected_obj + if numeric_only_bool: + obj = obj._get_numeric_data() + return self._python_apply_general(f, obj, is_transform=True) return self._cython_transform( "cummax", numeric_only=numeric_only, skipna=skipna diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 7d6c5310942e2..ac25972da33f1 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -555,6 +555,83 @@ def test_idxmin_idxmax_axis1(): gb2.idxmax(axis=1) +@pytest.mark.parametrize("numeric_only", [True, False, None]) +def test_axis1_numeric_only(groupby_func, numeric_only): + if groupby_func in ("idxmax", "idxmin"): + pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") + if groupby_func in ("mad", "tshift"): + pytest.skip("mad and tshift are deprecated") + if groupby_func in ("corrwith", "skew"): + msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1" + pytest.skip(msg) + + df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df["E"] = "x" + groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] + gb = df.groupby(groups) + method = getattr(gb, groupby_func) + args = (0,) if groupby_func == "fillna" else () + kwargs = {"axis": 1} + if numeric_only is not None: + # when numeric_only is None we don't pass any argument + kwargs["numeric_only"] = numeric_only + + # Functions without numeric_only and axis args + no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift") + # Functions with axis args + 
has_axis = ( + "cumprod", + "cumsum", + "diff", + "pct_change", + "rank", + "shift", + "cummax", + "cummin", + "idxmin", + "idxmax", + "fillna", + ) + if numeric_only is not None and groupby_func in no_args: + try: + method(*args, **kwargs) + assert False, f"axis=1 succeeds for {groupby_func}" + except TypeError as err: + assert "got an unexpected keyword argument 'numeric_only'" in str(err) + elif groupby_func not in has_axis: + try: + method(*args, **kwargs) + assert False, f"axis=1 succeeds for {groupby_func}" + except TypeError as err: + assert "got an unexpected keyword argument 'axis'" in str(err) + # fillna and shift are successful even on object dtypes + elif (numeric_only is None or not numeric_only) and groupby_func not in ( + "fillna", + "shift", + ): + msgs = ( + # cummax, cummin, rank + "not supported between instances of", + # cumprod + "can't multiply sequence by non-int of type 'float'", + # cumsum, diff, pct_change + "unsupported operand type", + ) + with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"): + method(*args, **kwargs) + else: + result = method(*args, **kwargs) + + df_expected = df.drop(columns="E").T if numeric_only else df.T + expected = getattr(df_expected, groupby_func)(*args).T + if groupby_func == "shift" and not numeric_only: + # shift with axis=1 leaves the leftmost column as numeric + # but transposing for expected gives us object dtype + expected = expected.astype(float) + + tm.assert_equal(result, expected) + + def test_groupby_cumprod(): # GH 4095 df = DataFrame({"key": ["b"] * 10, "value": 2}) @@ -1321,7 +1398,7 @@ def test_deprecate_numeric_only( assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg - kernel in ("first", "last", "corrwith") + kernel in ("first", "last") or ( # kernels that work on any dtype and don't have numeric_only arg kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") @@ -1339,7 +1416,8 @@ def test_deprecate_numeric_only( "(not 
allowed for this dtype" "|must be a string or a number" "|cannot be performed against 'object' dtypes" - "|must be a string or a real number)" + "|must be a string or a real number" + "|unsupported operand type)" ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) From 4d702863be2f03cb0a906d11fa7a86144cb023a9 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 15 Jul 2022 16:43:56 -0400 Subject: [PATCH 2/2] test improvements --- pandas/tests/groupby/test_function.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ac25972da33f1..9c622e0bfb69e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -556,14 +556,14 @@ def test_idxmin_idxmax_axis1(): @pytest.mark.parametrize("numeric_only", [True, False, None]) -def test_axis1_numeric_only(groupby_func, numeric_only): +def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") if groupby_func in ("mad", "tshift"): pytest.skip("mad and tshift are deprecated") if groupby_func in ("corrwith", "skew"): msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1" - pytest.skip(msg) + request.node.add_marker(pytest.mark.xfail(reason=msg)) df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) df["E"] = "x" @@ -593,17 +593,15 @@ def test_axis1_numeric_only(groupby_func, numeric_only): "fillna", ) if numeric_only is not None and groupby_func in no_args: - try: + msg = "got an unexpected keyword argument 'numeric_only'" + with pytest.raises(TypeError, match=msg): method(*args, **kwargs) - assert False, f"axis=1 succeeds for {groupby_func}" - except TypeError as err: - assert "got an unexpected keyword argument 'numeric_only'" in str(err) elif groupby_func not in has_axis: - try: - method(*args, **kwargs) - 
assert False, f"axis=1 succeeds for {groupby_func}" - except TypeError as err: - assert "got an unexpected keyword argument 'axis'" in str(err) + msg = "got an unexpected keyword argument 'axis'" + warn = FutureWarning if groupby_func == "skew" and not numeric_only else None + with tm.assert_produces_warning(warn, match="Dropping of nuisance columns"): + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) # fillna and shift are successful even on object dtypes elif (numeric_only is None or not numeric_only) and groupby_func not in ( "fillna",