Skip to content

REF: remove _get_grouper, make Grouper.__init__ less stateful #51155

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 35 additions & 33 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,6 @@ class Grouping:

_codes: npt.NDArray[np.signedinteger] | None = None
_group_index: Index | None = None
_passed_categorical: bool
_all_grouper: Categorical | None
_orig_cats: Index | None
_index: Index
Expand All @@ -460,7 +459,7 @@ def __init__(
) -> None:
self.level = level
self._orig_grouper = grouper
self.grouping_vector = _convert_grouper(index, grouper)
grouping_vector = _convert_grouper(index, grouper)
self._all_grouper = None
self._orig_cats = None
self._index = index
Expand All @@ -471,8 +470,6 @@ def __init__(
self._dropna = dropna
self._uniques = uniques

self._passed_categorical = False

# we have a single grouper which may be a myriad of things,
# some of which are dependent on the passing in level

Expand All @@ -486,78 +483,83 @@ def __init__(
else:
index_level = index

if self.grouping_vector is None:
self.grouping_vector = index_level
if grouping_vector is None:
grouping_vector = index_level
else:
mapper = self.grouping_vector
self.grouping_vector = index_level.map(mapper)
mapper = grouping_vector
grouping_vector = index_level.map(mapper)

# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get codes
elif isinstance(self.grouping_vector, Grouper):
elif isinstance(grouping_vector, Grouper):
# get the new grouper; we already have disambiguated
# what key/level refer to exactly, don't need to
# check again as we have by this point converted these
# to an actual value (rather than a pd.Grouper)
assert self.obj is not None # for mypy
newgrouper, newobj = self.grouping_vector._get_grouper(
self.obj, validate=False
)
newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
self.obj = newobj

ng = newgrouper._get_grouper()
if isinstance(newgrouper, ops.BinGrouper):
# in this case we have `ng is newgrouper`
self.grouping_vector = ng
# TODO: can we unwrap this and get a tighter typing
# for self.grouping_vector?
grouping_vector = newgrouper
else:
# ops.BaseGrouper
# TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
# If that were to occur, would we be throwing out information?
# error: Cannot determine type of "grouping_vector" [has-type]
ng = newgrouper.groupings[0].grouping_vector # type: ignore[has-type]
# use Index instead of ndarray so we can recover the name
self.grouping_vector = Index(ng, name=newgrouper.result_index.name)
grouping_vector = Index(ng, name=newgrouper.result_index.name)

elif not isinstance(
self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
):
# no level passed
if getattr(self.grouping_vector, "ndim", 1) != 1:
t = self.name or str(type(self.grouping_vector))
if getattr(grouping_vector, "ndim", 1) != 1:
t = str(type(grouping_vector))
raise ValueError(f"Grouper for '{t}' not 1-dimensional")

self.grouping_vector = index.map(self.grouping_vector)
grouping_vector = index.map(grouping_vector)

if not (
hasattr(self.grouping_vector, "__len__")
and len(self.grouping_vector) == len(index)
hasattr(grouping_vector, "__len__")
and len(grouping_vector) == len(index)
):
grper = pprint_thing(self.grouping_vector)
grper = pprint_thing(grouping_vector)
errmsg = (
"Grouper result violates len(labels) == "
f"len(data)\nresult: {grper}"
)
self.grouping_vector = None # Try for sanity
raise AssertionError(errmsg)

if isinstance(self.grouping_vector, np.ndarray):
if self.grouping_vector.dtype.kind in ["m", "M"]:
if isinstance(grouping_vector, np.ndarray):
if grouping_vector.dtype.kind in ["m", "M"]:
# if we have a date/time-like grouper, make sure that we have
# Timestamps like
# TODO 2022-10-08 we only have one test that gets here and
# values are already in nanoseconds in that case.
self.grouping_vector = Series(self.grouping_vector).to_numpy()
elif is_categorical_dtype(self.grouping_vector):
grouping_vector = Series(grouping_vector).to_numpy()
elif is_categorical_dtype(grouping_vector):
# a passed Categorical
self._passed_categorical = True

self._orig_cats = self.grouping_vector.categories
self.grouping_vector, self._all_grouper = recode_for_groupby(
self.grouping_vector, sort, observed
self._orig_cats = grouping_vector.categories
grouping_vector, self._all_grouper = recode_for_groupby(
grouping_vector, sort, observed
)

self.grouping_vector = grouping_vector

def __repr__(self) -> str:
return f"Grouping({self.name})"

def __iter__(self) -> Iterator:
return iter(self.indices)

@cache_readonly
def _passed_categorical(self) -> bool:
return is_categorical_dtype(self.grouping_vector)

@cache_readonly
def name(self) -> Hashable:
ilevel = self._ilevel
Expand Down
18 changes: 0 additions & 18 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,15 +745,6 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter:
ids, _, ngroups = self.group_info
return get_splitter(data, ids, ngroups, axis=axis)

def _get_grouper(self):
"""
We are a grouper as part of another's groupings.

We have a specific method of grouping, so cannot
convert to a Index for our grouper.
"""
return self.groupings[0].grouping_vector

@final
@cache_readonly
def group_keys_seq(self):
Expand Down Expand Up @@ -1112,15 +1103,6 @@ def nkeys(self) -> int:
# still matches len(self.groupings), but we can hard-code
return 1

def _get_grouper(self):
"""
We are a grouper as part of another's groupings.

We have a specific method of grouping, so cannot
convert to a Index for our grouper.
"""
return self

def get_iterator(self, data: NDFrame, axis: AxisInt = 0):
"""
Groupby iterator
Expand Down