Merge pull request #6913 from sinhrks/pivotg

jreback · jreback · commit 89502fc163ac · 2014-04-27T09:28:37.000-04:00
ENH: pivot_table can now accept Grouper
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -286,6 +286,7 @@ Improvements to existing features
   :func:`read_csv`/:func:`read_table` if no other C-unsupported options
   specified (:issue:`6607`)
 - ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`)
+- ``pivot_table`` can now accept a ``Grouper`` for the ``index`` and ``columns`` keywords (:issue:`6913`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
@@ -264,19 +264,24 @@ It takes a number of arguments
 
 - ``data``: A DataFrame object
 - ``values``: a column or a list of columns to aggregate
-- ``rows``: list of columns to group by on the table rows
-- ``cols``: list of columns to group by on the table columns
+- ``index``: a column, ``Grouper``, array of the same length as ``data``, or a list of them.
+  Keys to group by on the pivot table index. If an array is passed, it is used in the same manner as column values.
+- ``columns``: a column, ``Grouper``, array of the same length as ``data``, or a list of them.
+  Keys to group by on the pivot table columns. If an array is passed, it is used in the same manner as column values.
 - ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean``
 
 Consider a data set like this:
 
 .. ipython:: python
 
+   import datetime
    df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6,
                    'B' : ['A', 'B', 'C'] * 8,
                    'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
                    'D' : np.random.randn(24),
-                   'E' : np.random.randn(24)})
+                   'E' : np.random.randn(24),
+                   'F' : [datetime.datetime(2013, i, 1) for i in range(1, 13)] +
+                         [datetime.datetime(2013, i, 15) for i in range(1, 13)]})
    df
 
 We can produce pivot tables from this data very easily:
@@ -296,6 +301,12 @@ hierarchy in the columns:
 
    pivot_table(df, index=['A', 'B'], columns=['C'])
 
+Also, you can use a ``Grouper`` for the ``index`` and ``columns`` keywords. For details on ``Grouper``, see :ref:`Grouping with a Grouper specification <groupby.specify>`.
+
+.. ipython:: python
+
+   pivot_table(df, values='D', index=Grouper(freq='M', key='F'), columns='C')
+
 You can render a nice output of the table omitting the missing values by
 calling ``to_string`` if you wish:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -484,6 +484,26 @@ Enhancements
 - ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
 - :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of 
   quantiles.
+- ``pivot_table`` can now accept a ``Grouper`` for the ``index`` and ``columns`` keywords (:issue:`6913`)
+
+  .. ipython:: python
+
+    import datetime 
+    df = DataFrame({
+      'Branch' : 'A A A A A B'.split(),
+      'Buyer': 'Carl Mark Carl Carl Joe Joe'.split(),
+      'Quantity': [1, 3, 5, 1, 8, 1],
+      'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5),
+                datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0),
+                datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0)],
+      'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5),
+                  datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0),
+                  datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0)]})
+    df
+
+    pivot_table(df, index=Grouper(freq='M', key='Date'),
+                columns=Grouper(freq='M', key='PayDay'),
+                values='Quantity', aggfunc=np.sum)
 
 Performance
 ~~~~~~~~~~~
diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
@@ -4,6 +4,7 @@
 
 from pandas import Series, DataFrame
 from pandas.core.index import MultiIndex
+from pandas.core.groupby import Grouper
 from pandas.tools.merge import concat
 from pandas.tools.util import cartesian_product
 from pandas.compat import range, lrange, zip
@@ -25,10 +26,12 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
     ----------
     data : DataFrame
     values : column to aggregate, optional
-    index : list of column names or arrays to group on
-        Keys to group on the x-axis of the pivot table
-    columns : list of column names or arrays to group on
-        Keys to group on the y-axis of the pivot table
+    index : a column, Grouper, array of the same length as data, or a list of them.
+        Keys to group by on the pivot table index.
+        If an array is passed, it is used in the same manner as column values.
+    columns : a column, Grouper, array of the same length as data, or a list of them.
+        Keys to group by on the pivot table columns.
+        If an array is passed, it is used in the same manner as column values.
     aggfunc : function, default numpy.mean, or list of functions
         If list of functions passed, the resulting pivot table will have
         hierarchical columns whose top level are the function names (inferred
@@ -98,6 +101,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
     if values_passed:
         to_filter = []
         for x in keys + values:
+            if isinstance(x, Grouper):
+                x = x.key
             try:
                 if x in data:
                     to_filter.append(x)
@@ -297,7 +302,7 @@ def _all_key():
 def _convert_by(by):
     if by is None:
         by = []
-    elif (np.isscalar(by) or isinstance(by, (np.ndarray, Series))
+    elif (np.isscalar(by) or isinstance(by, (np.ndarray, Series, Grouper))
           or hasattr(by, '__call__')):
         by = [by]
     else:
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
@@ -1,12 +1,10 @@
 import datetime
-import unittest
-import warnings
 
 import numpy as np
 from numpy.testing import assert_equal
 
 import pandas
-from pandas import DataFrame, Series, Index, MultiIndex
+from pandas import DataFrame, Series, Index, MultiIndex, Grouper
 from pandas.tools.merge import concat
 from pandas.tools.pivot import pivot_table, crosstab
 from pandas.compat import range, u, product
@@ -288,8 +286,7 @@ def test_pivot_columns_lexsorted(self):
         iproduct = np.random.randint(0, len(products), n)
         items['Index'] = products['Index'][iproduct]
         items['Symbol'] = products['Symbol'][iproduct]
-        dr = pandas.date_range(datetime.date(2000, 1, 1),
-                               datetime.date(2010, 12, 31))
+        dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))
         dates = dr[np.random.randint(0, len(dr), n)]
         items['Year'] = dates.year
         items['Month'] = dates.month
@@ -333,6 +330,128 @@ def test_margins_no_values_two_row_two_cols(self):
         result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True)
         self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])
 
+    def test_pivot_timegrouper(self):
+        df = DataFrame({
+            'Branch' : 'A A A A A A A B'.split(),
+            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
+            'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1),
+                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
+                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
+                      datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date')
+
+        expected = DataFrame(np.array([10, 18, 3]).reshape(1, 3),
+                             index=[datetime.datetime(2013, 12, 31)], 
+                             columns='Carl Joe Mark'.split())
+        expected.index.name = 'Date'
+        expected.columns.name = 'Buyer'
+
+        result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer',
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result,expected)
+
+        result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), 
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result,expected.T)
+
+        expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
+                             index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)], 
+                             columns='Carl Joe Mark'.split())
+        expected.index.name = 'Date'
+        expected.columns.name = 'Buyer'
+
+        result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer',
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'),
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected.T)
+ 
+        # passing the name
+        df = df.reset_index()
+        result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer',
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'),
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected.T)
+
+        self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'),
+                          columns='Buyer', values='Quantity', aggfunc=np.sum))
+        self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer',
+                          columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum))
+
+        # passing the level
+        df = df.set_index('Date')
+        result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer',
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'),
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected.T)
+
+        self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'),
+                          columns='Buyer', values='Quantity', aggfunc=np.sum))
+        self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer',
+                          columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum))
+
+        # double grouper
+        df = DataFrame({
+            'Branch' : 'A A A A A A A B'.split(),
+            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
+            'Quantity': [1,3,5,1,8,1,9,3],
+            'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5),
+                      datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0),
+                      datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0),
+                      datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)],
+            'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5),
+                        datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0),
+                        datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0),
+                        datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]})
+
+        result = pivot_table(df, index=Grouper(freq='M', key='Date'),
+                             columns=Grouper(freq='M', key='PayDay'),
+                             values='Quantity', aggfunc=np.sum)
+        expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9,
+                                       np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4),
+                             index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
+                                    datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)], 
+                             columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
+                                    datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)])
+        expected.index.name = 'Date'
+        expected.columns.name = 'PayDay'        
+
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(df, index=Grouper(freq='M', key='PayDay'),
+                             columns=Grouper(freq='M', key='Date'),
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected.T)
+
+        tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)),
+                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)),
+                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)),
+                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)),
+                  (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)),
+                  (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),]
+        idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
+        expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan,
+                                       9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2),
+                             index=idx, columns=['A', 'B'])
+        expected.columns.name = 'Branch'   
+
+        result = pivot_table(df, index=[Grouper(freq='M', key='Date'),
+                             Grouper(freq='M', key='PayDay')], columns=['Branch'],
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'),
+                             Grouper(freq='M', key='PayDay')], 
+                             values='Quantity', aggfunc=np.sum)
+        tm.assert_frame_equal(result, expected.T)
 
 class TestCrosstab(tm.TestCase):