From 78a3edbf4a4b7f0d500dd7f848ac621f0a37ce4e Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Mar 2013 21:10:44 -0400 Subject: [PATCH] ENH: New keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_) --- RELEASE.rst | 11 ++- doc/source/io.rst | 17 +++++ doc/source/v0.11.0.txt | 8 ++- pandas/io/pytables.py | 112 ++++++++++++++++++++++++++----- pandas/io/tests/test_pytables.py | 65 ++++++++++++++++++ 5 files changed, 192 insertions(+), 21 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 7e0187df9f61d..45e82d4ef83ce 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -63,6 +63,12 @@ pandas 0.11.0 - Add ``axes`` property to ``Series`` for compatibility - Add ``xs`` function to ``Series`` for compatibility - Allow setitem in a frame where only mixed numerics are present (e.g. int and float), (GH3037_) + - ``HDFStore`` + + - Provide dotted attribute access to ``get`` from stores + (e.g. store.df == store['df']) + - New keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are + provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_) - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. ``store.df == store['df']``) @@ -140,8 +146,6 @@ pandas 0.11.0 - Fix weird PyTables error when using too many selectors in a where also correctly filter on any number of values in a Term expression (so not using numexpr filtering, but isin filtering) - - Provide dotted attribute access to ``get`` from stores - (e.g. store.df == store['df']) - Internally, change all variables to be private-like (now have leading underscore) - fixes for query parsing to correctly interpret boolean and != (GH2849_, GH2973_) @@ -218,6 +222,7 @@ pandas 0.11.0 .. _GH2819: https://github.com/pydata/pandas/issues/2819 .. _GH2845: https://github.com/pydata/pandas/issues/2845 .. _GH2867: https://github.com/pydata/pandas/issues/2867 +.. 
_GH2803: https://github.com/pydata/pandas/issues/2803 .. _GH2807: https://github.com/pydata/pandas/issues/2807 .. _GH2849: https://github.com/pydata/pandas/issues/2849 .. _GH2850: https://github.com/pydata/pandas/issues/2850 @@ -238,7 +243,7 @@ pandas 0.11.0 .. _GH3037: https://github.com/pydata/pandas/issues/3037 .. _GH3041: https://github.com/pydata/pandas/issues/3041 .. _GH3053: https://github.com/pydata/pandas/issues/3053 -.. _GH2803: https://github.com/pydata/pandas/issues/2803 +.. _GH3076: https://github.com/pydata/pandas/issues/3076 pandas 0.10.1 diff --git a/doc/source/io.rst b/doc/source/io.rst index c1b40c92529f4..c30b64d9ae07a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1307,6 +1307,23 @@ you cannot change data columns (nor indexables) after the first append/put operation (Of course you can simply read in the data and create a new table!) +Iterator +~~~~~~~~ + +Starting in 0.11, you can pass ``iterator=True`` or ``chunksize=number_in_a_chunk`` +to ``select`` and ``select_as_multiple`` to return an iterator on the results. +The default is 50,000 rows returned in a chunk. + +.. ipython:: python + + for df in store.select('df', chunksize=3): + print df + +Note that the ``chunksize`` keyword applies to the **returned** rows. So if you +are doing a query, then that set will be subdivided and returned in the +iterator. Keep in mind that if you do not pass a ``where`` selection criteria +then the ``nrows`` of the table are considered. + Advanced Queries ~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index 487321b35ef99..328e14432e333 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -238,6 +238,9 @@ Enhancements - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. 
``store.df == store['df']``) + - In ``HDFStore``, new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are + provided to support iteration on ``select`` and ``select_as_multiple`` (GH3076_) + - ``Squeeze`` to possibly remove length 1 dimensions from an object. .. ipython:: python @@ -300,6 +303,7 @@ on GitHub for a complete list. .. _GH2806: https://github.com/pydata/pandas/issues/2806 .. _GH2807: https://github.com/pydata/pandas/issues/2807 .. _GH2918: https://github.com/pydata/pandas/issues/2918 -.. _GH3011: https://github.com/pydata/pandas/issues/3011 -.. _GH2979: https://github.com/pydata/pandas/issues/2979 .. _GH2758: https://github.com/pydata/pandas/issues/2758 +.. _GH2979: https://github.com/pydata/pandas/issues/2979 +.. _GH3011: https://github.com/pydata/pandas/issues/3011 +.. _GH3076: https://github.com/pydata/pandas/issues/3076 diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6b3b36f231c1a..ca2e3b6e04f19 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -347,7 +347,7 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -362,16 +362,30 @@ def select(self, key, where=None, start=None, stop=None, columns=None, **kwargs) start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection columns : a list of columns that if not None, will limit the return columns + iterator : boolean, return an iterator, default False + chunksize : nrows to include in iteration, return an iterator """ group = self.get_node(key) if group is None: raise KeyError('No object named %s in the file' % key) - return self._read_group(group, 
where=where, start=start, stop=stop, columns=columns, **kwargs) - def select_as_coordinates(self, key, where=None, **kwargs): + # create the storer and axes + s = self._create_storer(group) + s.infer_axes() + + # what we are actually going to do for a chunk + def func(_start, _stop): + return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs) + + if iterator or chunksize is not None: + return TableIterator(func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize) + + return TableIterator(func, nrows=s.nrows, start=start, stop=stop).get_values() + + def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): """ - return the selection as a Coordinates. Note that start/stop/columns parematers are inapplicable here. + return the selection as a Coordinates. Parameters ---------- @@ -380,8 +394,10 @@ def select_as_coordinates(self, key, where=None, **kwargs): Optional Parameters ------------------- where : list of Term (or convertable) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection """ - return self.get_storer(key).read_coordinates(where = where, **kwargs) + return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) def unique(self, key, column, **kwargs): """ @@ -400,7 +416,7 @@ def unique(self, key, column, **kwargs): """ return self.get_storer(key).read_column(column = column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -408,6 +424,10 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw keys : a list of the tables selector : the table to apply the where criteria (defaults to keys[0] if not 
supplied) columns : the columns I want back + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + iterator : boolean, return an iterator, default False + chunksize : nrows to include in iteration, return an iterator Exceptions ---------- @@ -418,7 +438,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, basestring): - return self.select(key=keys, where=where, columns=columns, **kwargs) + return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs) if not isinstance(keys, (list, tuple)): raise Exception("keys must be a list/tuple") @@ -433,6 +453,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw tbls = [ self.get_storer(k) for k in keys ] # validate rows + if tbls[0] is None: + raise Exception("no valid tables to select as multiple") nrows = tbls[0].nrows for t in tbls: if t.nrows != nrows: @@ -441,16 +463,25 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, **kw raise Exception("object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) # select coordinates from the selector table - c = self.select_as_coordinates(selector, where) + c = self.select_as_coordinates(selector, where, start=start, stop=stop) + nrows = len(c) + + def func(_start, _stop): + + # collect the returns objs + objs = [t.read(where=c[_start:_stop], columns=columns) for t in tbls] + + # axis is the concentation axes + axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] - # collect the returns objs - objs = [t.read(where=c, columns=columns) for t in tbls] + # concat and return + return concat(objs, axis=axis, verify_integrity=True) - # axis is the concentation axes - axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] 
class TableIterator(object):
    """ define the iteration interface on a table

        Parameters
        ----------

        func : the function to get results; it is called as
               ``func(start, stop)`` with a half-open row range
        nrows : the rows to iterate on
        start : the passed start value (default is None, meaning row 0)
        stop : the passed stop value (default is None, meaning ``nrows``);
               it is never allowed to exceed ``nrows``
        chunksize : the passed chunking value (default is 50000)

        Raises
        ------
        ValueError : if chunksize is not a positive number
    """

    def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
        self.func = func
        self.nrows = nrows
        self.start = start or 0

        if stop is None:
            stop = self.nrows
        self.stop = min(self.nrows, stop)

        if chunksize is None:
            chunksize = 50000
        elif chunksize <= 0:
            # a non-positive step would never advance the cursor in __iter__
            raise ValueError("chunksize must be a positive number of rows, "
                             "got [%s]" % chunksize)

        self.chunksize = chunksize

    def __iter__(self):
        """ yield the result of func for each successive chunk of rows """
        current = self.start
        while current < self.stop:
            # clamp the chunk end so that the final chunk does not run past
            # the requested stop row
            stop = min(current + self.chunksize, self.stop)
            v = self.func(current, stop)
            current = stop

            # a chunk that produced nothing is skipped rather than yielded
            if v is None:
                continue

            yield v

    def get_values(self):
        """ return the result of func over the whole [start, stop) range """
        return self.func(self.start, self.stop)
def test_select_iterator(self):
    """ chunked / iterator selection must reassemble to the plain selection """

    def gather(chunks):
        # drain the iterator and glue the pieces back into a single object
        return concat([piece for piece in chunks])

    # single table
    with ensure_clean(self.path) as store:

        frame = tm.makeTimeDataFrame(500)
        store.remove('df')
        store.append('df', frame)

        expected = store.select('df')

        # default chunking via iterator=True
        result = gather(store.select('df', iterator=True))
        tm.assert_frame_equal(expected, result)

        # a chunksize that divides the table evenly
        result = gather(store.select('df', chunksize=100))
        tm.assert_frame_equal(expected, result)

        # a chunksize that leaves a short final chunk
        result = gather(store.select('df', chunksize=150))
        tm.assert_frame_equal(expected, result)

    # multiple
    with ensure_clean(self.path) as store:

        left = tm.makeTimeDataFrame(500)
        store.append('df1', left, data_columns=True)
        right = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x)
        right['foo'] = 'bar'
        store.append('df2', right)

        # built as in the original test; it is not asserted against
        df = concat([left, right], axis=1)

        # full selection
        expected = store.select_as_multiple(
            ['df1', 'df2'], selector='df1')
        result = gather(store.select_as_multiple(
            ['df1', 'df2'], selector='df1', chunksize=150))
        tm.assert_frame_equal(expected, result)

        # where selection
        expected = store.select_as_multiple(
            ['df1', 'df2'], where=Term('A>0'), selector='df1')
        result = gather(store.select_as_multiple(
            ['df1', 'df2'], where=Term('A>0'), selector='df1', chunksize=25))
        tm.assert_frame_equal(expected, result)