diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index e763700d08cf4..20f17a7f42472 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -52,7 +52,6 @@
 
 _apply_whitelist = frozenset(['last', 'first',
                               'mean', 'sum', 'min', 'max',
-                              'head', 'tail',
                               'cumsum', 'cumprod', 'cummin', 'cummax',
                               'resample',
                               'describe',
@@ -482,13 +481,19 @@ def picker(arr):
                 return np.nan
             return self.agg(picker)
 
-    def cumcount(self):
-        """Number each item in each group from 0 to the length of that group.
+    def cumcount(self, **kwargs):
+        """
+        Number each item in each group from 0 to the length of that group - 1.
 
         Essentially this is equivalent to
 
         >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))
 
+        Parameters
+        ----------
+        ascending : bool, default True
+            If False, number in reverse, from length of group - 1 to 0.
+
         Example
         -------
 
@@ -510,14 +515,111 @@ def cumcount(self):
         4    1
         5    3
         dtype: int64
+        >>> df.groupby('A').cumcount(ascending=False)
+        0    3
+        1    2
+        2    1
+        3    1
+        4    0
+        5    0
+        dtype: int64
 
         """
+        ascending = kwargs.pop('ascending', True)
+
         index = self.obj.index
-        cumcounts = np.zeros(len(index), dtype='int64')
-        for v in self.indices.values():
-            cumcounts[v] = np.arange(len(v), dtype='int64')
+        rng = np.arange(self.grouper._max_groupsize, dtype='int64')
+        cumcounts = self._cumcount_array(rng, ascending=ascending)
         return Series(cumcounts, index)
 
+    def head(self, n=5):
+        """
+        Returns first n rows of each group.
+
+        Essentially equivalent to ``.apply(lambda x: x.head(n))``
+
+        Example
+        -------
+
+        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
+        ...                columns=['A', 'B'])
+        >>> df.groupby('A', as_index=False).head(1)
+           A  B
+        0  1  2
+        2  5  6
+        >>> df.groupby('A').head(1)
+           A  B
+        A
+        1 0  1  2
+        5 2  5  6
+
+        """
+        rng = np.arange(self.grouper._max_groupsize, dtype='int64')
+        in_head = self._cumcount_array(rng) < n
+        head = self.obj[in_head]
+        if self.as_index:
+            head.index = self._index_with_as_index(in_head)
+        return head
+
+    def tail(self, n=5):
+        """
+        Returns last n rows of each group.
+
+        Essentially equivalent to ``.apply(lambda x: x.tail(n))``
+
+        Example
+        -------
+
+        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
+        ...                columns=['A', 'B'])
+        >>> df.groupby('A', as_index=False).tail(1)
+           A  B
+        1  1  4
+        2  5  6
+        >>> df.groupby('A').tail(1)
+           A  B
+        A
+        1 1  1  4
+        5 2  5  6
+
+        """
+        rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
+        in_tail = self._cumcount_array(rng, ascending=False) > -n
+        tail = self.obj[in_tail]
+        if self.as_index:
+            tail.index = self._index_with_as_index(in_tail)
+        return tail
+
+    def _cumcount_array(self, arr, **kwargs):
+        ascending = kwargs.pop('ascending', True)
+
+        len_index = len(self.obj.index)
+        cumcounts = np.zeros(len_index, dtype='int64')
+        if ascending:
+            for v in self.indices.values():
+                cumcounts[v] = arr[:len(v)]
+        else:
+            for v in self.indices.values():
+                cumcounts[v] = arr[len(v)-1::-1]
+        return cumcounts
+
+    def _index_with_as_index(self, b):
+        """
+        Take boolean mask of index to be returned from apply, if as_index=True
+
+        """
+        # TODO perf, it feels like this should already be somewhere...
+        from itertools import chain
+        original = self.obj.index
+        gp = self.grouper
+        levels = chain((gp.levels[i][gp.labels[i][b]]
+                        for i in range(len(gp.groupings))),
+                       (original.get_level_values(i)[b]
+                        for i in range(original.nlevels)))
+        new = MultiIndex.from_arrays(list(levels))
+        new.names = gp.names + original.names
+        return new
+
     def _try_cast(self, result, obj):
         """
         try to cast the result to our obj original type,
@@ -758,14 +860,28 @@ def names(self):
     def size(self):
         """
         Compute group sizes
+
         """
         # TODO: better impl
         labels, _, ngroups = self.group_info
-        bin_counts = Series(labels).value_counts()
+        bin_counts = algos.value_counts(labels, sort=False)
         bin_counts = bin_counts.reindex(np.arange(ngroups))
         bin_counts.index = self.result_index
         return bin_counts
 
+    @cache_readonly
+    def _max_groupsize(self):
+        '''
+        Compute size of largest group
+
+        '''
+        # For many items in each group this is much faster than
+        # self.size().max(), in worst case marginally slower
+        if self.indices:
+            return max(len(v) for v in self.indices.values())
+        else:
+            return 0
+
     @cache_readonly
     def groups(self):
         if len(self.groupings) == 1:
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 9df5541615cee..9c636168114c7 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -1203,24 +1203,64 @@ def test_groupby_as_index_apply(self):
         g_not_as = df.groupby('user_id', as_index=False)
 
         res_as = g_as.head(2).index
-        exp_as = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
+        exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)])
         assert_index_equal(res_as, exp_as)
 
         res_not_as = g_not_as.head(2).index
-        exp_not_as = Index([0, 2, 1, 4])
+        exp_not_as = Index([0, 1, 2, 4])
         assert_index_equal(res_not_as, exp_not_as)
 
-        res_as = g_as.apply(lambda x: x.head(2)).index
-        assert_index_equal(res_not_as, exp_not_as)
+        res_as_apply = g_as.apply(lambda x: x.head(2)).index
+        res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
 
-        res_not_as = g_not_as.apply(lambda x: x.head(2)).index
-        assert_index_equal(res_not_as, exp_not_as)
+        # apply doesn't maintain the original ordering
+        exp_not_as_apply = Index([0, 2, 1, 4])
+        exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
+
+        assert_index_equal(res_as_apply, exp_as_apply)
+        assert_index_equal(res_not_as_apply, exp_not_as_apply)
 
         ind = Index(list('abcde'))
         df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
         res = df.groupby(0, as_index=False).apply(lambda x: x).index
         assert_index_equal(res, ind)
 
+    def test_groupby_head_tail(self):
+        df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+        g_as = df.groupby('A', as_index=True)
+        g_not_as = df.groupby('A', as_index=False)
+
+        # as_index=False, much easier
+        assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
+        assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
+
+        empty_not_as = DataFrame(columns=df.columns)
+        assert_frame_equal(empty_not_as, g_not_as.head(0))
+        assert_frame_equal(empty_not_as, g_not_as.tail(0))
+        assert_frame_equal(empty_not_as, g_not_as.head(-1))
+        assert_frame_equal(empty_not_as, g_not_as.tail(-1))
+
+        assert_frame_equal(df, g_not_as.head(7))  # contains all
+        assert_frame_equal(df, g_not_as.tail(7))
+
+        # as_index=True, yuck
+        # prepend the A column as an index, in a roundabout way
+        df_as = df.copy()
+        df_as.index = df.set_index('A', append=True,
+                                    drop=False).index.swaplevel(0, 1)
+
+        assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
+        assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
+
+        empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
+        assert_frame_equal(empty_as, g_as.head(0))
+        assert_frame_equal(empty_as, g_as.tail(0))
+        assert_frame_equal(empty_as, g_as.head(-1))
+        assert_frame_equal(empty_as, g_as.tail(-1))
+
+        assert_frame_equal(df_as, g_as.head(7))  # contains all
+        assert_frame_equal(df_as, g_as.tail(7))
+
     def test_groupby_multiple_key(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,