From 5fa6a39fab9498015f767b592aee565a18d29b98 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 20 Apr 2014 00:57:10 +0900 Subject: [PATCH] ENH: pivot_table can now accept Grouper --- doc/source/release.rst | 1 + doc/source/reshaping.rst | 17 +++- doc/source/v0.14.0.txt | 20 +++++ pandas/tools/pivot.py | 15 ++-- pandas/tools/tests/test_pivot.py | 129 +++++++++++++++++++++++++++++-- 5 files changed, 169 insertions(+), 13 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index a888f03b9d8e7..c975143b0ef67 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -286,6 +286,7 @@ Improvements to existing features :func:`read_csv`/:func:`read_table` if no other C-unsupported options specified (:issue:`6607`) - ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`) +- ``pivot_table`` can now accept ``Grouper`` via the ``index`` and ``columns`` keywords (:issue:`6913`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index b872c8a60e34e..436055ffe37d1 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -264,19 +264,24 @@ It takes a number of arguments - ``data``: A DataFrame object - ``values``: a column or a list of columns to aggregate -- ``rows``: list of columns to group by on the table rows -- ``cols``: list of columns to group by on the table columns +- ``index``: a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table index. If an array is passed, it is used in the same manner as column values. +- ``columns``: a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table column. If an array is passed, it is used in the same manner as column values. - ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean`` Consider a data set like this: ..
ipython:: python + import datetime df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6, 'B' : ['A', 'B', 'C'] * 8, 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, 'D' : np.random.randn(24), - 'E' : np.random.randn(24)}) + 'E' : np.random.randn(24), + 'F' : [datetime.datetime(2013, i, 1) for i in range(1, 13)] + + [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) df We can produce pivot tables from this data very easily: @@ -296,6 +301,12 @@ hierarchy in the columns: pivot_table(df, index=['A', 'B'], columns=['C']) +Also, you can use ``Grouper`` for the ``index`` and ``columns`` keywords. For details on ``Grouper``, see :ref:`Grouping with a Grouper specification <groupby.specify>`. + +.. ipython:: python + + pivot_table(df, values='D', index=Grouper(freq='M', key='F'), columns='C') + You can render a nice output of the table omitting the missing values by calling ``to_string`` if you wish: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index e63728e22d23a..34480668df8c9 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -484,6 +484,26 @@ Enhancements - ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`) - :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of quantiles. +- ``pivot_table`` can now accept ``Grouper`` via the ``index`` and ``columns`` keywords (:issue:`6913`) + + ..
ipython:: python + + import datetime + df = DataFrame({ + 'Branch' : 'A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe'.split(), + 'Quantity': [1, 3, 5, 1, 8, 1], + 'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5), + datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0)], + 'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5), + datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0), + datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0)]}) + df + + pivot_table(df, index=Grouper(freq='M', key='Date'), + columns=Grouper(freq='M', key='PayDay'), + values='Quantity', aggfunc=np.sum) Performance ~~~~~~~~~~~ diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 6c4f55ae8a3b5..9132fea089fe7 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -4,6 +4,7 @@ from pandas import Series, DataFrame from pandas.core.index import MultiIndex +from pandas.core.groupby import Grouper from pandas.tools.merge import concat from pandas.tools.util import cartesian_product from pandas.compat import range, lrange, zip @@ -25,10 +26,12 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', ---------- data : DataFrame values : column to aggregate, optional - index : list of column names or arrays to group on - Keys to group on the x-axis of the pivot table - columns : list of column names or arrays to group on - Keys to group on the y-axis of the pivot table + index : a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table index. + If an array is passed, it is used in the same manner as column values. + columns : a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table column.
+ If an array is passed, it is used in the same manner as column values. aggfunc : function, default numpy.mean, or list of functions If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred @@ -98,6 +101,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if values_passed: to_filter = [] for x in keys + values: + if isinstance(x, Grouper): + x = x.key try: if x in data: to_filter.append(x) @@ -297,7 +302,7 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (np.isscalar(by) or isinstance(by, (np.ndarray, Series)) + elif (np.isscalar(by) or isinstance(by, (np.ndarray, Series, Grouper)) or hasattr(by, '__call__')): by = [by] else: diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 2255fdebc9fe3..6fe32b5b85080 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -1,12 +1,10 @@ import datetime -import unittest -import warnings import numpy as np from numpy.testing import assert_equal import pandas -from pandas import DataFrame, Series, Index, MultiIndex +from pandas import DataFrame, Series, Index, MultiIndex, Grouper from pandas.tools.merge import concat from pandas.tools.pivot import pivot_table, crosstab from pandas.compat import range, u, product @@ -288,8 +286,7 @@ def test_pivot_columns_lexsorted(self): iproduct = np.random.randint(0, len(products), n) items['Index'] = products['Index'][iproduct] items['Symbol'] = products['Symbol'][iproduct] - dr = pandas.date_range(datetime.date(2000, 1, 1), - datetime.date(2010, 12, 31)) + dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] items['Year'] = dates.year items['Month'] = dates.month @@ -333,6 +330,128 @@ def test_margins_no_values_two_row_two_cols(self): result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'],
aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) + def test_pivot_timegrouper(self): + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], + 'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), + datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), + datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date') + + expected = DataFrame(np.array([10, 18, 3]).reshape(1, 3), + index=[datetime.datetime(2013, 12, 31)], + columns='Carl Joe Mark'.split()) + expected.index.name = 'Date' + expected.columns.name = 'Buyer' + + result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer', + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result,expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result,expected.T) + + expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), + index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)], + columns='Carl Joe Mark'.split()) + expected.index.name = 'Date' + expected.columns.name = 'Buyer' + + result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer', + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + # passing the name + df = df.reset_index() + result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer', + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'), + values='Quantity', aggfunc=np.sum) + 
tm.assert_frame_equal(result, expected.T) + + self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'), + columns='Buyer', values='Quantity', aggfunc=np.sum)) + self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer', + columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum)) + + # passing the level + df = df.set_index('Date') + result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer', + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'), + columns='Buyer', values='Quantity', aggfunc=np.sum)) + self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer', + columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum)) + + # double grouper + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5), + datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)], + 'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5), + datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0), + datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0), + datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]}) + + result = pivot_table(df, index=Grouper(freq='M', key='Date'), + columns=Grouper(freq='M', key='PayDay'), + values='Quantity', aggfunc=np.sum) + expected = DataFrame(np.array([np.nan, 3, 
np.nan, np.nan, 6, np.nan, 1, 9, + np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4), + index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), + datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)], + columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), + datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)]) + expected.index.name = 'Date' + expected.columns.name = 'PayDay' + + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index=Grouper(freq='M', key='PayDay'), + columns=Grouper(freq='M', key='Date'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)), + (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)), + (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)), + (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)), + (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)), + (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),] + idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay']) + expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan, + 9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2), + index=idx, columns=['A', 'B']) + expected.columns.name = 'Branch' + + result = pivot_table(df, index=[Grouper(freq='M', key='Date'), + Grouper(freq='M', key='PayDay')], columns=['Branch'], + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'), + Grouper(freq='M', key='PayDay')], + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) class TestCrosstab(tm.TestCase):