-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Cythonized GroupBy mad #20024
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cythonized GroupBy mad #20024
Changes from 6 commits
31f1799
962f324
0c10369
192253f
9d7f0ac
57152e6
f1a3860
5307ac3
5cab1eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
from pandas import (date_range, bdate_range, Timestamp, | ||
Index, MultiIndex, DataFrame, Series, | ||
concat, Panel, DatetimeIndex, read_csv) | ||
from pandas.core.base import DataError | ||
from pandas.core.dtypes.missing import isna | ||
from pandas.errors import UnsupportedFunctionCall, PerformanceWarning | ||
from pandas.util.testing import (assert_frame_equal, assert_index_equal, | ||
|
@@ -1300,17 +1301,6 @@ def test_non_cython_api(self): | |
g = df.groupby('A') | ||
gni = df.groupby('A', as_index=False) | ||
|
||
# mad | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test was failing with this change because it allowed for […]. There's also some code that tests […]. |
||
expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) | ||
expected.index.name = 'A' | ||
result = g.mad() | ||
assert_frame_equal(result, expected) | ||
|
||
expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], | ||
index=[0, 1]) | ||
result = gni.mad() | ||
assert_frame_equal(result, expected) | ||
|
||
# describe | ||
expected_index = pd.Index([1, 3], name='A') | ||
expected_col = pd.MultiIndex(levels=[['B'], | ||
|
@@ -2141,6 +2131,52 @@ def test_groupby_bool_aggs(self, agg_func, skipna, vals): | |
result = getattr(df.groupby('key'), agg_func)(skipna=skipna) | ||
assert_frame_equal(result, exp_df) | ||
|
||
@pytest.mark.parametrize("klass", [Series, DataFrame]) | ||
@pytest.mark.parametrize("test_mi", [True, False]) | ||
@pytest.mark.parametrize("dtype", ['int', 'float']) | ||
def test_groupby_mad(self, klass, test_mi, dtype): | ||
vals = np.array(range(10)).astype(dtype) | ||
df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) | ||
|
||
idx = pd.Index(['a', 'b'], name='key') | ||
exp = klass([1.2, 1.2], index=idx) | ||
grping = ['key'] | ||
|
||
if test_mi: | ||
df = df.append(df) # Double the size of the frame | ||
df['newcol'] = ['foo'] * 10 + ['bar'] * 10 | ||
grping.append('newcol') | ||
|
||
mi = pd.MultiIndex.from_product((exp.index.values, | ||
['bar', 'foo']), | ||
names=['key', 'newcol']) | ||
exp = exp.append(exp) | ||
exp.index = mi | ||
|
||
if klass is Series: | ||
exp.name = 'val' | ||
result = df.groupby(grping)['val'].mad() | ||
tm.assert_series_equal(result, exp) | ||
else: | ||
exp = exp.rename(columns={0: 'val'}) | ||
result = df.groupby(grping).mad() | ||
tm.assert_frame_equal(result, exp) | ||
|
||
@pytest.mark.parametrize("vals", [ | ||
['foo'] * 10, [True] * 10]) | ||
def test_groupby_mad_raises(self, vals): | ||
df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': vals}) | ||
|
||
with tm.assert_raises_regex(DataError, | ||
"No numeric types to aggregate"): | ||
df.groupby('key').mad() | ||
|
||
def test_groupby_mad_skipna(self): | ||
df = DataFrame({'key': ['a'] * 5 + ['b'] * 5, 'val': range(10)}) | ||
with tm.assert_raises_regex( | ||
NotImplementedError, "'skipna=False' not yet implemented"): | ||
df.groupby('key').mad(skipna=False) | ||
|
||
def test_dont_clobber_name_column(self): | ||
df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], | ||
'name': ['foo', 'bar', 'baz'] * 2}) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
|
||
AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', | ||
'mad', 'std', 'var', 'sem'] | ||
AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] | ||
AGG_FUNCTIONS_WITH_SKIPNA = ['skew'] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Speaking to why I did this - since […]. As always, glad to open an issue for that if you agree on the approach. |
||
|
||
df_whitelist = frozenset([ | ||
'last', | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As touched on in the comments, I ideally would not want this `try...except` and think instead that the `mean` application should be throwing the error. While `mean` does raise for `object` types, something like `pd.Series([True, False, True]).mean()` is entirely valid, and therefore this ends up throwing a TypeError in a subsequent operation.

I think `mean` should raise on boolean data and have that pass through. Will open a separate issue if you agree.