
Commit dc84742

Merge pull request #3537 from jreback/hdf_optimize
PERF: HDFStore table writing performance improvements
2 parents 8b8b714 + 21bce6c · commit dc84742

File tree

3 files changed: 78 additions, 54 deletions


RELEASE.rst

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ pandas 0.11.1
   - will warn with a FrequencyWarning if you are attempting to append
     an index with a different frequency than the existing
   - support datelike columns with a timezone as data_columns (GH2852_)
+  - table writing performance improvements.

 **API Changes**
pandas/io/pytables.py

Lines changed: 18 additions & 7 deletions
@@ -913,7 +913,7 @@ def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
         self.stop = min(self.nrows,stop)

         if chunksize is None:
-            chunksize = 50000
+            chunksize = 100000

         self.chunksize = chunksize

@@ -2232,6 +2232,10 @@ def table(self):
         """ return the table group (this is my storable) """
         return self.storable

+    @property
+    def dtype(self):
+        return self.table.dtype
+
     @property
     def description(self):
         return self.table.description

@@ -2848,7 +2852,7 @@ class AppendableTable(LegacyTable):
     table_type = 'appendable'

     def write(self, obj, axes=None, append=False, complib=None,
-              complevel=None, fletcher32=None, min_itemsize=None, chunksize=50000,
+              complevel=None, fletcher32=None, min_itemsize=None, chunksize=None,
               expectedrows=None, **kwargs):

         if not append and self.is_exists:

@@ -2905,18 +2909,26 @@ def write_data(self, chunksize):
             [a.is_searchable for a in self.values_axes]).astype('u1')
         values = [a.take_data() for a in self.values_axes]

+        # transpose the values so first dimension is last
+        values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ]
+
         # write the chunks
+        if chunksize is None:
+            chunksize = 100000
+
         rows = self.nrows_expected
         chunks = int(rows / chunksize) + 1
         for i in xrange(chunks):
             start_i = i * chunksize
             end_i = min((i + 1) * chunksize, rows)
+            if start_i >= end_i:
+                break

             self.write_data_chunk(
                 indexes=[a[start_i:end_i] for a in indexes],
                 mask=mask[start_i:end_i],
                 search=search,
-                values=[v[:, start_i:end_i] for v in values])
+                values=[v[start_i:end_i] for v in values])

     def write_data_chunk(self, indexes, mask, search, values):

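The transpose added in write_data is what allows the simpler slicing below it: rolling the first axis to the end turns each values block from (items, rows, ...) into (rows, ..., items) layout, so a chunk of rows becomes a plain leading-axis slice and one row's values become a single indexing step. A minimal NumPy sketch of the roll, assuming a 2-D block (array name and sizes are made up):

    import numpy as np

    # A values block as take_data() hands it over: (items, rows), e.g. 3 x 5.
    v = np.arange(15.).reshape(3, 5)

    order = np.roll(np.arange(v.ndim), v.ndim - 1)  # ndim=2: (0, 1) -> (1, 0)
    vt = v.transpose(order)                         # shape (5, 3): rows first

    chunk = vt[0:2]  # a row chunk is now vt[start:end], not v[:, start:end]
    row = vt[1]      # one row's item values, previously v[:, 1]

The new start_i >= end_i guard covers the case where the row count is an exact multiple of chunksize: int(rows / chunksize) + 1 then schedules one empty trailing chunk (e.g. 200000 rows at chunksize 100000 gives 3 chunks), and the guard breaks out instead of writing a zero-length slice.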
@@ -2929,7 +2941,7 @@ def write_data_chunk(self, indexes, mask, search, values):
         try:
             func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
             args = list(indexes)
-            args.extend([mask, search, values])
+            args.extend([self.dtype, mask, search, values])
             rows = func(*args)
         except (Exception), detail:
             raise Exception("cannot create row-data -> %s" % str(detail))

@@ -2939,9 +2951,8 @@ def write_data_chunk(self, indexes, mask, search, values):
             self.table.append(rows)
             self.table.flush()
         except (Exception), detail:
-            raise Exception(
-                "tables cannot write this data -> %s" % str(detail))
-
+            raise Exception("tables cannot write this data -> %s" % str(detail))
+
     def delete(self, where=None, **kwargs):

         # delete all rows (and return the nrows)
pandas/lib.pyx

Lines changed: 59 additions & 47 deletions
@@ -837,61 +837,70 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_2d(ndarray indexer0,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=1] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """

     cdef:
-        int i, b, n_indexer0, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, l, b, n_indexer0, n_blocks, tup_size
+        ndarray result
+        tuple tup
+        object v

     n_indexer0 = indexer0.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+1
-    l = []

-    for i from 0 <= i < n_indexer0:
+    result = np.empty(n_indexer0,dtype=dtype)
+    l = 0
+    for i in range(n_indexer0):

         if not mask[i]:

             tup = PyTuple_New(tup_size)
-            val = indexer0[i]
-            PyTuple_SET_ITEM(tup, 0, val)
-            Py_INCREF(val)

-            for b from 0 <= b < n_blocks:
+            v = indexer0[i]
+            PyTuple_SET_ITEM(tup, 0, v)
+            Py_INCREF(v)
+
+            for b in range(n_blocks):

-                v = values[b][:, i]
+                v = values[b][i]
                 if searchable[b]:
                     v = v[0]
+
                 PyTuple_SET_ITEM(tup, b+1, v)
                 Py_INCREF(v)

-            l.append(tup)
+            result[l] = tup
+            l += 1

-    return l
+    return result[0:l]

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=2] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """

     cdef:
-        int i, j, b, n_indexer0, n_indexer1, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result

     n_indexer0 = indexer0.shape[0]
     n_indexer1 = indexer1.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+2
-    l = []
+    result = np.empty(n_indexer0*n_indexer1,dtype=dtype)
+    l = 0
     for i from 0 <= i < n_indexer0:

         for j from 0 <= j < n_indexer1:

@@ -900,45 +909,49 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,

                 tup = PyTuple_New(tup_size)

-                val = indexer0[i]
-                PyTuple_SET_ITEM(tup, 0, val)
-                Py_INCREF(val)
-
-                val = indexer1[j]
-                PyTuple_SET_ITEM(tup, 1, val)
-                Py_INCREF(val)
+                v = indexer0[i]
+                PyTuple_SET_ITEM(tup, 0, v)
+                Py_INCREF(v)
+                v = indexer1[j]
+                PyTuple_SET_ITEM(tup, 1, v)
+                Py_INCREF(v)

                 for b from 0 <= b < n_blocks:

-                    v = values[b][:, i, j]
+                    v = values[b][i, j]
                     if searchable[b]:
                         v = v[0]
+
                     PyTuple_SET_ITEM(tup, b+2, v)
                     Py_INCREF(v)

-                l.append(tup)
+                result[l] = tup
+                l += 1

-    return l
+    return result[0:l]

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=3] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """

     cdef:
-        int i, j, k, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result

     n_indexer0 = indexer0.shape[0]
     n_indexer1 = indexer1.shape[0]
     n_indexer2 = indexer2.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+3
-    l = []
+    result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype)
+    l = 0
     for i from 0 <= i < n_indexer0:

         for j from 0 <= j < n_indexer1:

@@ -949,29 +962,28 @@ def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,

                     tup = PyTuple_New(tup_size)

-                    val = indexer0[i]
-                    PyTuple_SET_ITEM(tup, 0, val)
-                    Py_INCREF(val)
-
-                    val = indexer1[j]
-                    PyTuple_SET_ITEM(tup, 1, val)
-                    Py_INCREF(val)
-
-                    val = indexer2[k]
-                    PyTuple_SET_ITEM(tup, 2, val)
-                    Py_INCREF(val)
+                    v = indexer0[i]
+                    PyTuple_SET_ITEM(tup, 0, v)
+                    Py_INCREF(v)
+                    v = indexer1[j]
+                    PyTuple_SET_ITEM(tup, 1, v)
+                    Py_INCREF(v)
+                    v = indexer2[k]
+                    PyTuple_SET_ITEM(tup, 2, v)
+                    Py_INCREF(v)

                     for b from 0 <= b < n_blocks:

-                        v = values[b][:, i, j, k]
+                        v = values[b][i, j, k]
                         if searchable[b]:
                             v = v[0]
                         PyTuple_SET_ITEM(tup, b+3, v)
                         Py_INCREF(v)

-                    l.append(tup)
+                    result[l] = tup
+                    l += 1

-    return l
+    return result[0:l]

 #-------------------------------------------------------------------------------
 # Groupby-related functions

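All three kernels follow the same pattern: instead of appending tuples to a Python list that PyTables must convert on Table.append, they pre-allocate a record array with the table's own dtype (now passed in from write_data_chunk), fill it row by row while skipping masked rows, and return the trimmed slice. A pure-Python sketch of that pattern, using a hypothetical two-field dtype (the names 'index' and 'values_block_0' are illustrative):

    import numpy as np

    dtype = np.dtype([('index', 'i8'), ('values_block_0', 'f8', (2,))])

    indexer0 = np.arange(5, dtype='i8')
    values = np.random.randn(5, 2)                 # transposed: rows come first
    mask = np.array([0, 0, 1, 0, 0], dtype='u1')   # 1 = drop this row

    result = np.empty(len(indexer0), dtype=dtype)  # pre-allocated, not a list
    l = 0
    for i in range(len(indexer0)):
        if not mask[i]:
            result[l] = (indexer0[i], values[i])   # one tuple per table row
            l += 1

    rows = result[:l]   # trim the unused tail left by masked rows

Returning a correctly-typed ndarray lets self.table.append(rows) ingest the chunk directly, avoiding the per-row conversion the old list-of-tuples return forced on PyTables.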