From 78a3edbf4a4b7f0d500dd7f848ac621f0a37ce4e Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Sun, 17 Mar 2013 21:10:44 -0400
Subject: [PATCH] ENH: New keywords ``iterator=boolean``, and
 ``chunksize=number_in_a_chunk`` are provided to support iteration on
 ``select`` and ``select_as_multiple`` (GH3076_)

---
 RELEASE.rst                      |  11 ++-
 doc/source/io.rst                |  17 +++++
 doc/source/v0.11.0.txt           |   8 ++-
 pandas/io/pytables.py            | 112 ++++++++++++++++++++++++++-----
 pandas/io/tests/test_pytables.py |  65 ++++++++++++++++++
 5 files changed, 192 insertions(+), 21 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 7e0187df9f61d..45e82d4ef83ce 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -63,6 +63,12 @@ pandas 0.11.0
   - Add ``axes`` property to ``Series`` for compatibility
   - Add ``xs`` function to ``Series`` for compatibility
   - Allow setitem in a frame where only mixed numerics are present (e.g. int and float), (GH3037_)
+  - ``HDFStore``
+
+    - Provide dotted attribute access to ``get`` from stores
+      (e.g. store.df == store['df'])
+    - New keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are
+      provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)
 
   - In ``HDFStore``, provide dotted attribute access to ``get`` from stores
     (e.g. ``store.df == store['df']``)
@@ -140,8 +146,6 @@ pandas 0.11.0
     - Fix weird PyTables error when using too many selectors in a where
       also correctly filter on any number of values in a Term expression
       (so not using numexpr filtering, but isin filtering)
-    - Provide dotted attribute access to ``get`` from stores
-      (e.g. store.df == store['df'])
     - Internally, change all variables to be private-like (now have leading
       underscore)
     - fixes for query parsing to correctly interpret boolean and != (GH2849_, GH2973_)
@@ -218,6 +222,7 @@ pandas 0.11.0
 .. _GH2819: https://github.com/pydata/pandas/issues/2819
 .. _GH2845: https://github.com/pydata/pandas/issues/2845
 .. _GH2867: https://github.com/pydata/pandas/issues/2867
+.. _GH2803: https://github.com/pydata/pandas/issues/2803
 .. _GH2807: https://github.com/pydata/pandas/issues/2807
 .. _GH2849: https://github.com/pydata/pandas/issues/2849
 .. _GH2850: https://github.com/pydata/pandas/issues/2850
@@ -238,7 +243,7 @@ pandas 0.11.0
 .. _GH3037: https://github.com/pydata/pandas/issues/3037
 .. _GH3041: https://github.com/pydata/pandas/issues/3041
 .. _GH3053: https://github.com/pydata/pandas/issues/3053
-.. _GH2803: https://github.com/pydata/pandas/issues/2803
+.. _GH3076: https://github.com/pydata/pandas/issues/3076
 
 
 pandas 0.10.1
diff --git a/doc/source/io.rst b/doc/source/io.rst
index c1b40c92529f4..c30b64d9ae07a 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1307,6 +1307,23 @@ you cannot change data columns (nor indexables) after the first
 append/put operation (Of course you can simply read in the data and
 create a new table!)
 
+Iterator
+~~~~~~~~
+
+Starting in 0.11, you can pass ``iterator=True`` or ``chunksize=number_in_a_chunk``
+to ``select`` and ``select_as_multiple`` to return an iterator on the results.
+By default, 50,000 rows are returned in each chunk.
+
+.. ipython:: python
+
+   for df in store.select('df', chunksize=3):
+      print df
+
+Note that the ``chunksize`` keyword applies to the **returned** rows. So if you
+are doing a query, then the resulting set is subdivided and returned by the
+iterator. Keep in mind that if you do not pass a ``where`` selection criterion,
+then the ``nrows`` of the table are considered.
+
 Advanced Queries
 ~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
index 487321b35ef99..328e14432e333 100644
--- a/doc/source/v0.11.0.txt
+++ b/doc/source/v0.11.0.txt
@@ -238,6 +238,9 @@ Enhancements
   - In ``HDFStore``, provide dotted attribute access to ``get`` from stores
     (e.g. ``store.df == store['df']``)
 
+  - In ``HDFStore``, new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are
+    provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_)
+
   - ``Squeeze`` to possibly remove length 1 dimensions from an object.
 
     .. ipython:: python
@@ -300,6 +303,7 @@ on GitHub for a complete list.
 .. _GH2806: https://github.com/pydata/pandas/issues/2806
 .. _GH2807: https://github.com/pydata/pandas/issues/2807
 .. _GH2918: https://github.com/pydata/pandas/issues/2918
-.. _GH3011: https://github.com/pydata/pandas/issues/3011
-.. _GH2979: https://github.com/pydata/pandas/issues/2979
 .. _GH2758: https://github.com/pydata/pandas/issues/2758
+.. _GH2979: https://github.com/pydata/pandas/issues/2979
+.. _GH3011: https://github.com/pydata/pandas/issues/3011
+.. _GH3076: https://github.com/pydata/pandas/issues/3076
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 6b3b36f231c1a..ca2e3b6e04f19 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -347,7 +347,7 @@ def get(self, key):
             raise KeyError('No object named %s in the file' % key)
         return self._read_group(group)
 
-    def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs):
+    def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, **kwargs):
         """
         Retrieve pandas object stored in file, optionally based on where
         criteria
@@ -362,16 +362,30 @@ def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs)
         start : integer (defaults to None), row number to start selection
         stop  : integer (defaults to None), row number to stop selection
         columns : a list of columns that if not None, will limit the return columns
+        iterator : boolean, return an iterator, default False
+        chunksize : nrows to include in iteration, return an iterator
 
         """
         group = self.get_node(key)
         if group is None:
             raise KeyError('No object named %s in the file' % key)
-        return self._read_group(group, where=where, start=start, stop=stop, columns=columns, **kwargs)
 
-    def select_as_coordinates(self, key, where=None, **kwargs):
+        # create the storer and axes
+        s = self._create_storer(group)
+        s.infer_axes()
+
+        # what we are actually going to do for a chunk
+        def func(_start, _stop):
+            return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs)
+
+        if iterator or chunksize is not None:
+            return TableIterator(func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize)
+
+        return TableIterator(func, nrows=s.nrows, start=start, stop=stop).get_values()
+
+    def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs):
         """
-        return the selection as a Coordinates. Note that start/stop/columns parematers are inapplicable here.
+        return the selection as a Coordinates.
 
         Parameters
         ----------
@@ -380,8 +394,10 @@ def select_as_coordinates(self, key, where=None, **kwargs):
         Optional Parameters
         -------------------
         where : list of Term (or convertable) objects, optional
+        start : integer (defaults to None), row number to start selection
+        stop  : integer (defaults to None), row number to stop selection
         """
-        return self.get_storer(key).read_coordinates(where = where, **kwargs)
+        return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs)
 
     def unique(self, key, column, **kwargs):
         """
@@ -400,7 +416,7 @@ def unique(self, key, column, **kwargs):
         """
         return self.get_storer(key).read_column(column = column, **kwargs)
 
-    def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kwargs):
+    def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, **kwargs):
         """ Retrieve pandas objects from multiple tables
 
         Parameters
@@ -408,6 +424,10 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
         keys : a list of the tables
         selector : the table to apply the where criteria (defaults to keys[0] if not supplied)
         columns : the columns I want back
+        start : integer (defaults to None), row number to start selection
+        stop  : integer (defaults to None), row number to stop selection
+        iterator : boolean, return an iterator, default False
+        chunksize : nrows to include in iteration, return an iterator
 
         Exceptions
         ----------
@@ -418,7 +438,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
         if isinstance(keys, (list, tuple)) and len(keys) == 1:
             keys = keys[0]
         if isinstance(keys, basestring):
-            return self.select(key=keys, where=where, columns=columns, **kwargs)
+            return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs)
 
         if not isinstance(keys, (list, tuple)):
             raise Exception("keys must be a list/tuple")
@@ -433,6 +453,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
         tbls = [ self.get_storer(k) for k in keys ]
 
         # validate rows
+        if tbls[0] is None:
+            raise Exception("no valid tables to select as multiple")
         nrows = tbls[0].nrows
         for t in tbls:
             if t.nrows != nrows:
@@ -441,16 +463,25 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw
                 raise Exception("object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname)
 
         # select coordinates from the selector table
-        c = self.select_as_coordinates(selector, where)
+        c = self.select_as_coordinates(selector, where, start=start, stop=stop)
+        nrows = len(c)
+
+        def func(_start, _stop):
+
+            # collect the returns objs
+            objs = [t.read(where=c[_start:_stop], columns=columns) for t in tbls]
+
+            # axis is the concatenation axis
+            axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
 
-        # collect the returns objs
-        objs = [t.read(where=c, columns=columns) for t in tbls]
+            # concat and return
+            return concat(objs, axis=axis, verify_integrity=True)
 
-        # axis is the concentation axes
-        axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
+        if iterator or chunksize is not None:
+            return TableIterator(func, nrows=nrows, start=start, stop=stop, chunksize=chunksize)
+
+        return TableIterator(func, nrows=nrows, start=start, stop=stop).get_values()
 
-        # concat and return
-        return concat(objs, axis=axis, verify_integrity=True)
 
     def put(self, key, value, table=None, append=False, **kwargs):
         """
@@ -807,6 +838,49 @@ def _read_group(self, group, **kwargs):
         s.infer_axes()
         return s.read(**kwargs)
 
+class TableIterator(object):
+    """ define the iteration interface on a table
+        
+        Parameters
+        ----------
+
+        func   : the function to get results
+        nrows : the rows to iterate on
+        start : the passed start value (default is None)
+        stop : the passed stop value (default is None)
+        chunksize : the passed chunking value (default is 50000)
+        kwargs : the passed kwargs
+        """
+
+    def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
+        self.func   = func
+        self.nrows = nrows
+        self.start = start or 0
+
+        if stop is None:
+            stop = self.nrows
+        self.stop  = min(self.nrows,stop)
+
+        if chunksize is None:
+            chunksize = 50000
+
+        self.chunksize = chunksize
+
+    def __iter__(self):
+        current = self.start
+        while current < self.stop:
+            stop = current + self.chunksize
+            v = self.func(current, stop)
+            current = stop
+
+            if v is None:
+                continue
+
+            yield v
+
+    def get_values(self):
+        return self.func(self.start, self.stop)
+        
 
 class IndexCol(object):
     """ an index column description class
@@ -2351,7 +2425,7 @@ def create_description(self, complib=None, complevel=None, fletcher32=False, exp
 
         return d
 
-    def read_coordinates(self, where=None, **kwargs):
+    def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
         """ select coordinates (row numbers) from a table; return the coordinates object """
 
         # validate the version
@@ -2362,7 +2436,7 @@ def read_coordinates(self, where=None, **kwargs):
             return False
 
         # create the selection
-        self.selection = Selection(self, where=where, **kwargs)
+        self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs)
         return Coordinates(self.selection.select_coords(), group=self.group, where=where)
 
     def read_column(self, column, **kwargs):
@@ -3132,6 +3206,12 @@ def __init__(self, values, group, where, **kwargs):
         self.group = group
         self.where = where
 
+    def __len__(self):
+        return len(self.values)
+
+    def __getitem__(self, key):
+        """ return a new coordinates object, sliced by the key """
+        return Coordinates(self.values[key], self.group, self.where)
 
 class Selection(object):
     """
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index c3a8990962ca1..8cf40a77d639f 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -1829,6 +1829,66 @@ def test_select_with_many_inputs(self):
             tm.assert_frame_equal(expected, result)
             self.assert_(len(result) == 100)
 
+    def test_select_iterator(self):
+ 
+        # single table
+        with ensure_clean(self.path) as store:
+
+            df = tm.makeTimeDataFrame(500)
+            store.remove('df')
+            store.append('df', df)
+
+            expected = store.select('df')
+
+            results = []
+            for s in store.select('df',iterator=True):
+                results.append(s)
+            result = concat(results)
+            tm.assert_frame_equal(expected, result)
+            results = []
+            for s in store.select('df',chunksize=100):
+                results.append(s)
+            result = concat(results)
+            tm.assert_frame_equal(expected, result)
+
+            results = []
+            for s in store.select('df',chunksize=150):
+                results.append(s)
+            result = concat(results)
+            tm.assert_frame_equal(expected, result)
+
+        # multiple
+
+        with ensure_clean(self.path) as store:
+
+            df1 = tm.makeTimeDataFrame(500)
+            store.append('df1',df1,data_columns=True)
+            df2 = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x)
+            df2['foo'] = 'bar'
+            store.append('df2',df2)
+
+            df = concat([df1, df2], axis=1)
+
+            # full selection
+            expected = store.select_as_multiple(
+                ['df1', 'df2'], selector='df1')
+            results = []
+            for s in store.select_as_multiple(
+                ['df1', 'df2'], selector='df1', chunksize=150):
+                results.append(s)
+            result = concat(results)
+            tm.assert_frame_equal(expected, result)
+            
+            # where selection
+            expected = store.select_as_multiple(
+                ['df1', 'df2'], where= Term('A>0'), selector='df1')
+            results = []
+            for s in store.select_as_multiple(
+                ['df1', 'df2'], where= Term('A>0'), selector='df1', chunksize=25):
+                results.append(s)
+            result = concat(results)
+            tm.assert_frame_equal(expected, result)
+
     def test_panel_select(self):
 
         wp = tm.makePanel()
@@ -2042,6 +2102,11 @@ def test_select_as_multiple(self):
         df2['foo'] = 'bar'
 
         with ensure_clean(self.path) as store:
+
+            # no tables stored
+            self.assertRaises(Exception, store.select_as_multiple,
+                              None, where=['A>0', 'B>0'], selector='df1')
+
             store.append('df1', df1, data_columns=['A', 'B'])
             store.append('df2', df2)