Skip to content

DEPR: DataFrameGroupBy.apply operating on the group keys #52477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 12, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ Other API changes

Deprecations
~~~~~~~~~~~~
- Deprecated :meth:`.DataFrameGroupBy.apply` operating on the grouping column(s) (:issue:`7155`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A small explanation about what to do to avoid the warning?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - done.

- Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`)
- Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`)
- Deprecated :meth:`.GroupBy.all` and :meth:`.GroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`)
Expand Down
17 changes: 17 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1487,6 +1487,23 @@ def f(g):
with option_context("mode.chained_assignment", None):
try:
result = self._python_apply_general(f, self._selected_obj)
if (
not isinstance(self.obj, Series)
and self._selection is None
and self._selected_obj.shape != self._obj_with_exclusions.shape
):
msg = (
f"{type(self).__name__}.apply operated on the grouping "
f"columns. This behavior is deprecated, and in a future "
f"version of pandas the grouping columns will be excluded "
f"from the operation. Subset the data to exclude the "
f"groupings and silence this warning."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Subset the data to exclude the groupings and silence this warning." -> "Subset the data to silence this warning."?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the benefit of removing the phrase "to exclude the groupings"?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My idea was that by keeping the grouping columns in the subsetting, the users are guaranteed to get the same result as before, but without the warning:

>>> df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1,  2, 3, 4]})
>>> g = df.groupby('A')
>>> g.apply(lambda x: x.sum())
   A  B
A
1  2  3
2  4  7
>>> g[df.columns].apply(lambda x: x.sum())
   A  B
A
1  2  3
2  4  7

Or they may actually not want to include the grouping column, in which case they can remove it from the subsetting:

>>> g[df.columns.drop("A")].apply(lambda x: x.sum())
   B
A
1  3
2  7

The idea is just that they may or may not want to remove the groupings in the subset, so "exclude the groupings" may not be what they want in all cases.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see - this makes sense, but I find it confusing to call this "subsetting". I'll see what I can do for the warning message here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated. What do you think?

)
warnings.warn(
message=msg,
category=FutureWarning,
stacklevel=find_stack_level(),
)
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
Expand Down
40 changes: 36 additions & 4 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@
Substitution,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._exceptions import (
find_stack_level,
rewrite_warning,
)

from pandas.core.dtypes.generic import (
ABCDataFrame,
Expand Down Expand Up @@ -420,6 +423,14 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):
obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
)

target_message = "DataFrameGroupBy.apply operated on the grouping columns"
new_message = (
"DataFrame.resample operated on the grouping columns. "
"This behavior is deprecated, and in a future version of "
"pandas the grouping columns will be excluded from the operation. "
"Subset the data to exclude the groupings and silence this warning."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Subset the data to exclude the groupings and silence this warning." -> "Subset the data to silence this warning."

)

try:
if callable(how):
# TODO: test_resample_apply_with_additional_args fails if we go
Expand All @@ -436,7 +447,12 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):
# a DataFrame column, but aggregate_item_by_item operates column-wise
# on Series, raising AttributeError or KeyError
# (depending on whether the column lookup uses getattr/__getitem__)
result = grouped.apply(how, *args, **kwargs)
with rewrite_warning(
target_message=target_message,
target_category=FutureWarning,
new_message=new_message,
):
result = grouped.apply(how, *args, **kwargs)

except ValueError as err:
if "Must produce aggregated value" in str(err):
Expand All @@ -448,7 +464,12 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):

# we have a non-reducing function
# try to evaluate
result = grouped.apply(how, *args, **kwargs)
with rewrite_warning(
target_message=target_message,
target_category=FutureWarning,
new_message=new_message,
):
result = grouped.apply(how, *args, **kwargs)

return self._wrap_result(result)

Expand Down Expand Up @@ -1344,7 +1365,18 @@ def func(x):

return x.apply(f, *args, **kwargs)

result = self._groupby.apply(func)
msg = (
"DataFrameGroupBy.resample operated on the grouping columns. "
"This behavior is deprecated, and in a future version of "
"pandas the grouping columns will be excluded from the operation. "
"Subset the data to exclude the groupings and silence this warning."
)
with rewrite_warning(
target_message="DataFrameGroupBy.apply operated on the grouping columns",
target_category=FutureWarning,
new_message=msg,
):
result = self._groupby.apply(func)
return self._wrap_result(result)

_upsample = _apply
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
Appender,
Substitution,
)
from pandas.util._exceptions import rewrite_warning

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -457,7 +458,19 @@ def _all_key():
return (margins_name,) + ("",) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
target_message = "DataFrameGroupBy.apply operated on the grouping columns"
new_message = (
"DataFrame.pivot_table operated on the grouping columns. "
"This behavior is deprecated, and in a future version of "
"pandas the grouping columns will be excluded from the operation. "
"Can the user do something here?"
)
with rewrite_warning(
target_message=target_message,
target_category=FutureWarning,
new_message=new_message,
):
margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,13 @@ def test_groupby_extension_transform(self, data_for_grouping):

def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
df.groupby("B", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("B", group_keys=False).apply(groupby_apply_op)
df.groupby("B", group_keys=False).A.apply(groupby_apply_op)
df.groupby("A", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("A", group_keys=False).apply(groupby_apply_op)
df.groupby("A", group_keys=False).B.apply(groupby_apply_op)

def test_groupby_apply_identity(self, data_for_grouping):
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,9 +298,13 @@ def test_groupby_extension_transform(self, data_for_grouping):

def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
df.groupby("B", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("B", group_keys=False).apply(groupby_apply_op)
df.groupby("B", group_keys=False).A.apply(groupby_apply_op)
df.groupby("A", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("A", group_keys=False).apply(groupby_apply_op)
df.groupby("A", group_keys=False).B.apply(groupby_apply_op)

def test_groupby_apply_identity(self, data_for_grouping):
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1577,7 +1577,9 @@ def test_unstack_bug(self):
}
)

result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)

unstacked = result.unstack()
restacked = unstacked.stack()
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,13 +496,17 @@ def test_agg_timezone_round_trip():
assert ts == grouped.first()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]

# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]


def test_sum_uint64_overflow():
Expand Down
Loading