Skip to content

FIX-#3498 Fix groupby to use kwargs for getting the groups #3523

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,9 @@ def _compute_index_grouped(self, numerical=False):
--------
pandas.core.groupby.GroupBy.groups
"""
# We end up using pure pandas to compute group indices, so raising a warning
ErrorMessage.default_to_pandas("Group indices computation")

# Splitting level-by and column-by since we serialize them in a different ways
by = None
level = []
Expand All @@ -866,6 +869,8 @@ def _compute_index_grouped(self, numerical=False):
by = self._by

is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
# `dropna` param is the only one that matters for the group indices result
dropna = self._kwargs.get("dropna", True)

if hasattr(self._by, "columns") and is_multi_by:
by = list(self._by.columns)
Expand All @@ -875,7 +880,6 @@ def _compute_index_grouped(self, numerical=False):
# end up using pandas implementation. Add the warning so the user is
# aware.
ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
ErrorMessage.default_to_pandas("Groupby with multiple columns")
if isinstance(by, list) and all(
is_label(self._df, o, self._axis) for o in by
):
Expand All @@ -886,7 +890,7 @@ def _compute_index_grouped(self, numerical=False):
by = try_cast_to_pandas(by, squeeze=True)
pandas_df = self._df._to_pandas()
by = wrap_into_list(by, level)
groupby_obj = pandas_df.groupby(by=by)
groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
return groupby_obj.indices if numerical else groupby_obj.groups
else:
if isinstance(self._by, type(self._query_compiler)):
Expand All @@ -908,7 +912,13 @@ def _compute_index_grouped(self, numerical=False):
# Since we want positional indices of the groups, we want to group
# on a `RangeIndex`, not on the actual index labels
axis_labels = pandas.RangeIndex(len(axis_labels))
return axis_labels.groupby(by)
# `pandas.Index.groupby` doesn't take any parameters except `by`.
# Have to convert an Index to a Series to be able to process `dropna=False`:
if dropna:
return axis_labels.groupby(by)
else:
groupby_obj = axis_labels.to_series().groupby(by, dropna=dropna)
return groupby_obj.indices if numerical else groupby_obj.groups

def _wrap_aggregation(
self, qc_method, default_func, drop=True, numeric_only=True, **kwargs
Expand Down
71 changes: 70 additions & 1 deletion modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
generate_multiindex,
test_groupby_data,
dict_equals,
value_equals,
)
from modin.config import NPartitions

Expand All @@ -45,7 +46,7 @@ def modin_groupby_equals_pandas(modin_groupby, pandas_groupby):
)

for g1, g2 in itertools.zip_longest(modin_groupby, pandas_groupby):
assert g1[0] == g2[0]
value_equals(g1[0], g2[0])
Comment on lines -48 to +49
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed so that NaN values are compared correctly.

df_equals(g1[1], g2[1])


Expand Down Expand Up @@ -1350,6 +1351,74 @@ def test_groupby_multiindex(groupby_kwargs):
df_equals(md_grp.first(), pd_grp.first())


@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize(
    "groupby_kwargs",
    [
        pytest.param({"level": 1, "axis": 1}, id="level_idx_axis=1"),
        pytest.param({"level": 1}, id="level_idx"),
        pytest.param({"level": [1, "four"]}, id="level_idx+name"),
        pytest.param({"by": "four"}, id="level_name"),
        pytest.param({"by": ["one", "two"]}, id="level_name_multi_by"),
        pytest.param(
            {"by": ["item0", "one", "two"]},
            id="col_name+level_name",
        ),
        pytest.param({"by": ["item0"]}, id="col_name"),
        pytest.param(
            {"by": ["item0", "item1"]},
            id="col_name_multi_by",
        ),
    ],
)
def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna):
    """Check that groupby results agree with pandas when the ``dropna`` kwarg is passed."""
    modin_df = pd.DataFrame(test_data["float_nan_data"])
    pandas_df = pandas.DataFrame(test_data["float_nan_data"])

    # Label rows "item0", "item1", ... and give columns a 3-level MultiIndex
    # so grouping can target either labels or levels.
    row_labels = pandas.Index([f"item{i}" for i in range(len(pandas_df))])
    col_labels = pandas.MultiIndex.from_tuples(
        [(i // 4, i // 2, i) for i in range(len(modin_df.columns))],
        names=["four", "two", "one"],
    )
    modin_df.columns = col_labels
    modin_df.index = row_labels
    pandas_df.columns = col_labels
    pandas_df.index = row_labels

    # For axis=0 grouping, transpose so the MultiIndex ends up on the grouped axis.
    if groupby_kwargs.get("axis", 0) == 0:
        modin_df, pandas_df = modin_df.T, pandas_df.T

    md_grp = modin_df.groupby(**groupby_kwargs, dropna=dropna)
    pd_grp = pandas_df.groupby(**groupby_kwargs, dropna=dropna)
    modin_groupby_equals_pandas(md_grp, pd_grp)

    by_kwarg = groupby_kwargs.get("by", [])
    by_has_columns = any(col in modin_df.columns for col in by_kwarg)
    multi_by_with_nans = not dropna and len(by_kwarg) > 1

    # Disabled because of broken `dropna=False` for MapReduce implemented aggs:
    # https://github.com/modin-project/modin/issues/3817
    if not (multi_by_with_nans and by_has_columns):
        df_equals(md_grp.sum(), pd_grp.sum())
        df_equals(md_grp.size(), pd_grp.size())
    # Grouping on level works incorrectly in case of aggregation:
    # https://github.com/modin-project/modin/issues/2912
    # "BaseOnPython" tests are disabled because of the bug:
    # https://github.com/modin-project/modin/issues/3827
    if get_current_execution() != "BaseOnPython" and by_has_columns:
        df_equals(md_grp.quantile(), pd_grp.quantile())
    # Default-to-pandas tests are disabled for multi-column 'by' because of the bug:
    # https://github.com/modin-project/modin/issues/3827
    if not multi_by_with_nans:
        df_equals(md_grp.first(), pd_grp.first())
        df_equals(md_grp._default_to_pandas(lambda df: df.sum()), pd_grp.sum())


@pytest.mark.parametrize("groupby_axis", [0, 1])
@pytest.mark.parametrize("shift_axis", [0, 1])
def test_shift_freq(groupby_axis, shift_axis):
Expand Down
16 changes: 10 additions & 6 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1353,12 +1353,16 @@ def _make_default_file(filename=None, nrows=NROWS, ncols=2, force=True, **kwargs
return _make_default_file, filenames


def value_equals(obj1, obj2):
    """Check whether two scalar or list-like values are equal and raise an ``AssertionError`` if they aren't."""
    if is_list_like(obj1):
        # Element-wise comparison; NaNs in matching positions compare as equal.
        np.testing.assert_array_equal(obj1, obj2)
    else:
        # Scalars are considered equal if `==` holds or if both are NaN
        # (NaN != NaN under the `==` operator).
        assert (obj1 == obj2) or (np.isnan(obj1) and np.isnan(obj2))


def dict_equals(dict1, dict2):
    """Check whether two dictionaries are equal and raise an ``AssertionError`` if they aren't."""
    # Iterate both key sets in sorted order so mismatched lengths surface as
    # a failed comparison against None (via zip_longest) rather than being skipped.
    for key1, key2 in itertools.zip_longest(sorted(dict1), sorted(dict2)):
        # Both keys and values may contain NaNs, so delegate to ``value_equals``,
        # which treats two NaNs as equal for scalars and list-likes alike.
        value_equals(key1, key2)
        value_equals(dict1[key1], dict2[key2])
value_equals(dict1[key1], dict2[key2])