diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 7dc0db35bf8fe..99426c55da29b 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -6,6 +6,8 @@ import collections from typing import List +from pandas._typing import final + from pandas.core.dtypes.common import is_list_like, is_scalar from pandas.core.base import PandasObject @@ -16,6 +18,7 @@ class ShallowMixin(PandasObject): _attributes: List[str] = [] + @final def _shallow_copy(self, obj, **kwargs): """ return a new object with the replacement attributes @@ -35,6 +38,7 @@ class GotItemMixin(PandasObject): _attributes: List[str] + @final def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8f1b99e929f59..23f0e178130be 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,6 +11,7 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import inspect +from textwrap import dedent import types from typing import ( Callable, @@ -44,6 +45,7 @@ class providing the base-class of operations. IndexLabel, Label, Scalar, + final, ) from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -447,6 +449,7 @@ class providing the base-class of operations. """ +@final class GroupByPlot(PandasObject): """ Class implementing the .plot attribute for groupby objects. @@ -570,9 +573,11 @@ def __init__( self.grouper = grouper self.exclusions = exclusions or set() + @final def __len__(self) -> int: return len(self.groups) + @final def __repr__(self) -> str: # TODO: Better repr for GroupBy object return object.__repr__(self) @@ -584,6 +589,7 @@ def _assure_grouper(self) -> None: """ pass + @final @property def groups(self) -> Dict[Hashable, np.ndarray]: """ @@ -592,11 +598,13 @@ def groups(self) -> Dict[Hashable, np.ndarray]: self._assure_grouper() return self.grouper.groups + @final @property def ngroups(self) -> int: self._assure_grouper() return self.grouper.ngroups + @final @property def indices(self): """ @@ -605,6 +613,7 @@ def indices(self): self._assure_grouper() return self.grouper.indices + @final def _get_indices(self, names): """ Safe get multiple indices, translate keys for @@ -655,12 +664,14 @@ def get_converter(s): return [self.indices.get(name, []) for name in names] + @final def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ return self._get_indices([name])[0] + @final @cache_readonly def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy @@ -672,6 +683,7 @@ def _selected_obj(self): else: return self.obj[self._selection] + @final def _reset_group_selection(self) -> None: """ Clear group based selection. @@ -684,6 +696,7 @@ def _reset_group_selection(self) -> None: self._group_selection = None self._reset_cache("_selected_obj") + @final def _set_group_selection(self) -> None: """ Create group based selection. @@ -709,6 +722,7 @@ def _set_group_selection(self) -> None: self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") + @final def _set_result_index_ordered( self, result: "OutputFrameOrSeries" ) -> "OutputFrameOrSeries": @@ -725,6 +739,7 @@ def _set_result_index_ordered( result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result + @final def _dir_additions(self) -> Set[str]: return self.obj._dir_additions() | self._apply_allowlist @@ -740,23 +755,25 @@ def __getattr__(self, attr: str): @Substitution( klass="GroupBy", - examples="""\ ->>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) ->>> df - A B -0 a 1 -1 b 2 -2 a 3 -3 b 4 - -To get the difference between each groups maximum and minimum value in one -pass, you can do - ->>> df.groupby('A').pipe(lambda x: x.max() - x.min()) - B -A -a 2 -b 2""", + examples=dedent( + """\ + >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) + >>> df + A B + 0 a 1 + 1 b 2 + 2 a 3 + 3 b 4 + + To get the difference between each groups maximum and minimum value in one + pass, you can do + + >>> df.groupby('A').pipe(lambda x: x.max() - x.min()) + B + A + a 2 + b 2""" + ), ) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): @@ -764,6 +781,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) + @final def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist @@ -801,6 +819,7 @@ def curried(x): wrapper.__name__ = name return wrapper + @final def get_group(self, name, obj=None): """ Construct DataFrame from group with provided name. @@ -887,6 +906,7 @@ def f(g): return result + @final def _python_apply_general( self, f: F, data: FrameOrSeriesUnion ) -> FrameOrSeriesUnion: @@ -917,6 +937,7 @@ def _iterate_slices(self) -> Iterable[Series]: def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) + @final def _cumcount_array(self, ascending: bool = True): """ Parameters @@ -949,6 +970,7 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) + @final def _cython_transform( self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs ): @@ -986,6 +1008,7 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + @final def _agg_general( self, numeric_only: bool = True, @@ -1059,6 +1082,7 @@ def _cython_agg_general( return self._wrap_aggregated_output(output, index=self.grouper.result_index) + @final def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): """ Perform groupby transform routine with the numba engine. @@ -1093,6 +1117,7 @@ def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) # evaluated the data sorted by group return result.take(np.argsort(sorted_index), axis=0) + @final def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): """ Perform groupby aggregation routine with the numba engine. @@ -1129,6 +1154,7 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) index = Index(group_keys, name=self.grouper.names[0]) return result, index + @final def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1171,6 +1197,7 @@ def _python_agg_general(self, func, *args, **kwargs): return self._wrap_aggregated_output(output, index=self.grouper.result_index) + @final def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -1232,6 +1259,7 @@ def reset_identity(values): return result + @final def _apply_filter(self, indices, dropna): if len(indices) == 0: indices = np.array([], dtype="int64") @@ -1321,6 +1349,7 @@ class GroupBy(BaseGroupBy[FrameOrSeries]): more """ + @final @property def _obj_1d_constructor(self) -> Type["Series"]: # GH28330 preserve subclassed Series/DataFrames @@ -1329,6 +1358,7 @@ def _obj_1d_constructor(self) -> Type["Series"]: assert isinstance(self.obj, Series) return self.obj._constructor + @final def _bool_agg(self, val_test, skipna): """ Shared func to call any / all Cython GroupBy implementations. @@ -1358,6 +1388,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: skipna=skipna, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def any(self, skipna: bool = True): @@ -1377,6 +1408,7 @@ def any(self, skipna: bool = True): """ return self._bool_agg("any", skipna) + @final @Substitution(name="groupby") @Appender(_common_see_also) def all(self, skipna: bool = True): @@ -1410,6 +1442,7 @@ def count(self): # defined here for API doc raise NotImplementedError + @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def mean(self, numeric_only: bool = True): @@ -1466,6 +1499,7 @@ def mean(self, numeric_only: bool = True): numeric_only=numeric_only, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def median(self, numeric_only=True): @@ -1491,6 +1525,7 @@ def median(self, numeric_only=True): numeric_only=numeric_only, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def std(self, ddof: int = 1): @@ -1520,6 +1555,7 @@ def std(self, ddof: int = 1): ddof=ddof, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def var(self, ddof: int = 1): @@ -1547,6 +1583,7 @@ def var(self, ddof: int = 1): with group_selection_context(self): return self._python_agg_general(func) + @final @Substitution(name="groupby") @Appender(_common_see_also) def sem(self, ddof: int = 1): @@ -1577,6 +1614,7 @@ def sem(self, ddof: int = 1): ) return result + @final @Substitution(name="groupby") @Appender(_common_see_also) def size(self) -> FrameOrSeriesUnion: @@ -1602,6 +1640,7 @@ def size(self) -> FrameOrSeriesUnion: return self._reindex_output(result, fill_value=0) + @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): @@ -1618,24 +1657,28 @@ def sum(self, numeric_only: bool = True, min_count: int = 0): return self._reindex_output(result, fill_value=0) + @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) def prod(self, numeric_only: bool = True, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) + @final @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) def min(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) + @final @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) def max(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) + @final @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): @@ -1660,6 +1703,7 @@ def first(x: Series): npfunc=first_compat, ) + @final @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): @@ -1684,6 +1728,7 @@ def last(x: Series): npfunc=last_compat, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def ohlc(self) -> DataFrame: @@ -1699,6 +1744,7 @@ def ohlc(self) -> DataFrame: """ return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc")) + @final @doc(DataFrame.describe) def describe(self, **kwargs): with group_selection_context(self): @@ -1707,6 +1753,7 @@ def describe(self, **kwargs): return result.T return result.unstack() + @final def resample(self, rule, *args, **kwargs): """ Provide resampling when using a TimeGrouper. @@ -1808,6 +1855,7 @@ def resample(self, rule, *args, **kwargs): return get_resampler_for_grouping(self, rule, *args, **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def rolling(self, *args, **kwargs): @@ -1818,6 +1866,7 @@ def rolling(self, *args, **kwargs): return RollingGroupby(self, *args, **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def expanding(self, *args, **kwargs): @@ -1829,6 +1878,7 @@ def expanding(self, *args, **kwargs): return ExpandingGroupby(self, *args, **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def ewm(self, *args, **kwargs): @@ -1839,6 +1889,7 @@ def ewm(self, *args, **kwargs): return ExponentialMovingWindowGroupby(self, *args, **kwargs) + @final def _fill(self, direction, limit=None): """ Shared function for `pad` and `backfill` to call Cython method. @@ -1877,6 +1928,7 @@ def _fill(self, direction, limit=None): dropna=self.dropna, ) + @final @Substitution(name="groupby") def pad(self, limit=None): """ @@ -1903,6 +1955,7 @@ def pad(self, limit=None): ffill = pad + @final @Substitution(name="groupby") def backfill(self, limit=None): """ @@ -1929,6 +1982,7 @@ def backfill(self, limit=None): bfill = backfill + @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame: @@ -2102,6 +2156,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra return result + @final def quantile(self, q=0.5, interpolation: str = "linear"): """ Return group values at the given quantile, a la numpy.percentile. @@ -2230,6 +2285,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: ) return result.take(indices, axis=self.axis) + @final @Substitution(name="groupby") def ngroup(self, ascending: bool = True): """ @@ -2297,6 +2353,7 @@ def ngroup(self, ascending: bool = True): result = self.ngroups - 1 - result return result + @final @Substitution(name="groupby") def cumcount(self, ascending: bool = True): """ @@ -2356,6 +2413,7 @@ def cumcount(self, ascending: bool = True): cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) + @final @Substitution(name="groupby") @Appender(_common_see_also) def rank( @@ -2405,6 +2463,7 @@ def rank( axis=axis, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cumprod(self, axis=0, *args, **kwargs): @@ -2421,6 +2480,7 @@ def cumprod(self, axis=0, *args, **kwargs): return self._cython_transform("cumprod", **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cumsum(self, axis=0, *args, **kwargs): @@ -2437,6 +2497,7 @@ def cumsum(self, axis=0, *args, **kwargs): return self._cython_transform("cumsum", **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cummin(self, axis=0, **kwargs): @@ -2452,6 +2513,7 @@ def cummin(self, axis=0, **kwargs): return self._cython_transform("cummin", numeric_only=False) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cummax(self, axis=0, **kwargs): @@ -2467,6 +2529,7 @@ def cummax(self, axis=0, **kwargs): return self._cython_transform("cummax", numeric_only=False) + @final def _get_cythonized_result( self, how: str, @@ -2625,6 +2688,7 @@ def _get_cythonized_result( else: return self._wrap_transformed_output(output) + @final @Substitution(name="groupby") def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ @@ -2668,6 +2732,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): periods=periods, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0): @@ -2697,6 +2762,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis) return (filled / shifted) - 1 + @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def head(self, n=5): @@ -2734,6 +2800,7 @@ def head(self, n=5): else: return self._selected_obj.iloc[:, mask] + @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def tail(self, n=5): @@ -2771,6 +2838,7 @@ def tail(self, n=5): else: return self._selected_obj.iloc[:, mask] + @final def _reindex_output( self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN ) -> OutputFrameOrSeries: @@ -2857,6 +2925,7 @@ def _reindex_output( return output.reset_index(drop=True) + @final def sample( self, n: Optional[int] = None, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8b79ea2242258..d814a7cee436e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._typing import FrameOrSeries, Label +from pandas._typing import FrameOrSeries, Label, final from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly @@ -289,6 +289,7 @@ def __init__( self._grouper = None self.dropna = dropna + @final @property def ax(self): return self.grouper @@ -320,6 +321,7 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj + @final def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper @@ -379,12 +381,14 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): self.grouper = ax return self.grouper + @final @property def groups(self): # pandas\core\groupby\grouper.py:382: error: Item "None" of # "Optional[Any]" has no attribute "groups" [union-attr] return self.grouper.groups # type: ignore[union-attr] + @final def __repr__(self) -> str: attrs_list = ( f"{attr_name}={repr(getattr(self, attr_name))}" @@ -396,6 +400,7 @@ def __repr__(self) -> str: return f"{cls_name}({attrs})" +@final class Grouping: """ Holds the grouping information for a single key diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d98c55755042e..c60a59916affc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -24,7 +24,7 @@ from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction -from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape +from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape, final from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -146,6 +146,7 @@ def get_iterator( for key, (i, group) in zip(keys, splitter): yield key, group.__finalize__(data, method="groupby") + @final def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": """ Returns @@ -166,6 +167,7 @@ def _get_grouper(self): """ return self.groupings[0].grouper + @final def _get_group_keys(self): if len(self.groupings) == 1: return self.levels[0] @@ -175,6 +177,7 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_list(comp_ids, ngroups, self.levels, self.codes) + @final def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) @@ -256,6 +259,7 @@ def levels(self) -> List[Index]: def names(self) -> List[Label]: return [ping.name for ping in self.groupings] + @final def size(self) -> Series: """ Compute group sizes. @@ -278,6 +282,7 @@ def groups(self) -> Dict[Hashable, np.ndarray]: to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby) + @final @cache_readonly def is_monotonic(self) -> bool: # return if my group orderings are monotonic @@ -291,6 +296,7 @@ def group_info(self): comp_ids = ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups + @final @cache_readonly def codes_info(self) -> np.ndarray: # return the codes of items in original grouped axis @@ -300,6 +306,7 @@ def codes_info(self) -> np.ndarray: codes = codes[sorter] return codes + @final def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: all_codes = self.codes if len(all_codes) > 1: @@ -309,6 +316,7 @@ def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index)) + @final @cache_readonly def ngroups(self) -> int: return len(self.result_index) @@ -330,6 +338,7 @@ def result_index(self) -> Index: levels=levels, codes=codes, verify_integrity=False, names=self.names ) + @final def get_group_levels(self) -> List[Index]: if not self.compressed and len(self.groupings) == 1: return [self.groupings[0].result_index] @@ -370,6 +379,7 @@ def get_group_levels(self) -> List[Index]: _cython_arity = {"ohlc": 4} # OHLC + @final def _is_builtin_func(self, arg): """ if we define a builtin function for this argument, return it, @@ -377,6 +387,7 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) + @final def _get_cython_function( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -413,6 +424,7 @@ def _get_cython_function( return func + @final def _get_cython_func_and_vals( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -447,6 +459,7 @@ def _get_cython_func_and_vals( raise return func, values + @final def _disallow_invalid_ops(self, values: ArrayLike, how: str): """ Check if we can do this operation with our cython functions. @@ -476,6 +489,7 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str): f"timedelta64 type does not support {how} operations" ) + @final def _ea_wrap_cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: @@ -512,6 +526,7 @@ def _ea_wrap_cython_operation( raise NotImplementedError(values.dtype) + @final def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> np.ndarray: @@ -625,6 +640,7 @@ def _cython_operation( return result + @final def _aggregate( self, result, counts, values, comp_ids, agg_func, min_count: int = -1 ): @@ -636,6 +652,7 @@ def _aggregate( return result + @final def _transform( self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs ): @@ -674,6 +691,7 @@ def agg_series(self, obj: Series, func: F): raise return self._aggregate_series_pure_python(obj, func) + @final def _aggregate_series_fast(self, obj: Series, func: F): # At this point we have already checked that # - obj.index is not a MultiIndex @@ -693,6 +711,7 @@ def _aggregate_series_fast(self, obj: Series, func: F): result, counts = grouper.get_result() return result, counts + @final def _aggregate_series_pure_python(self, obj: Series, func: F): group_index, _, ngroups = self.group_info