
Commit dc84742

Merge pull request #3537 from jreback/hdf_optimize
PERF: HDFStore table writing performance improvements
2 parents 8b8b714 + 21bce6c · commit dc84742

File tree

3 files changed: 78 additions, 54 deletions


RELEASE.rst

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ pandas 0.11.1
   - will warn with a FrequencyWarning if you are attempting to append
     an index with a different frequency than the existing
   - support datelike columns with a timezone as data_columns (GH2852_)
+  - table writing performance improvements.

 **API Changes**
pandas/io/pytables.py

Lines changed: 18 additions & 7 deletions
@@ -913,7 +913,7 @@ def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
         self.stop = min(self.nrows,stop)

         if chunksize is None:
-            chunksize = 50000
+            chunksize = 100000

         self.chunksize = chunksize

@@ -2232,6 +2232,10 @@ def table(self):
         """ return the table group (this is my storable) """
         return self.storable

+    @property
+    def dtype(self):
+        return self.table.dtype
+
     @property
     def description(self):
         return self.table.description

@@ -2848,7 +2852,7 @@ class AppendableTable(LegacyTable):
     table_type = 'appendable'

     def write(self, obj, axes=None, append=False, complib=None,
-              complevel=None, fletcher32=None, min_itemsize=None, chunksize=50000,
+              complevel=None, fletcher32=None, min_itemsize=None, chunksize=None,
               expectedrows=None, **kwargs):

         if not append and self.is_exists:

@@ -2905,18 +2909,26 @@ def write_data(self, chunksize):
             [a.is_searchable for a in self.values_axes]).astype('u1')
         values = [a.take_data() for a in self.values_axes]

+        # transpose the values so first dimension is last
+        values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ]
+
         # write the chunks
+        if chunksize is None:
+            chunksize = 100000
+
         rows = self.nrows_expected
         chunks = int(rows / chunksize) + 1
         for i in xrange(chunks):
             start_i = i * chunksize
             end_i = min((i + 1) * chunksize, rows)
+            if start_i >= end_i:
+                break

             self.write_data_chunk(
                 indexes=[a[start_i:end_i] for a in indexes],
                 mask=mask[start_i:end_i],
                 search=search,
-                values=[v[:, start_i:end_i] for v in values])
+                values=[v[start_i:end_i] for v in values])

     def write_data_chunk(self, indexes, mask, search, values):

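The transpose added in write_data is what allows the simpler slicing below it: rolling the first axis to the end turns each values block from (items, rows, ...) into (rows, ..., items) layout, so a chunk of rows becomes a plain leading-axis slice and one row's values become a single indexing step. A minimal NumPy sketch of the roll, assuming a 2-D block (array name and sizes are made up):

    import numpy as np

    # A values block as take_data() hands it over: (items, rows), e.g. 3 x 5.
    v = np.arange(15.).reshape(3, 5)

    order = np.roll(np.arange(v.ndim), v.ndim - 1)  # ndim=2: (0, 1) -> (1, 0)
    vt = v.transpose(order)                         # shape (5, 3): rows first

    chunk = vt[0:2]  # a row chunk is now vt[start:end], not v[:, start:end]
    row = vt[1]      # one row's item values, previously v[:, 1]

The new start_i >= end_i guard covers the case where the row count is an exact multiple of chunksize: int(rows / chunksize) + 1 then schedules one empty trailing chunk (e.g. 200000 rows at chunksize 100000 gives 3 chunks), and the guard breaks out instead of writing a zero-length slice.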
@@ -2929,7 +2941,7 @@ def write_data_chunk(self, indexes, mask, search, values):
         try:
             func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
             args = list(indexes)
-            args.extend([mask, search, values])
+            args.extend([self.dtype, mask, search, values])
             rows = func(*args)
         except (Exception), detail:
             raise Exception("cannot create row-data -> %s" % str(detail))

@@ -2939,9 +2951,8 @@ def write_data_chunk(self, indexes, mask, search, values):
             self.table.append(rows)
             self.table.flush()
         except (Exception), detail:
-            raise Exception(
-                "tables cannot write this data -> %s" % str(detail))
-
+            raise Exception("tables cannot write this data -> %s" % str(detail))
+
     def delete(self, where=None, **kwargs):

         # delete all rows (and return the nrows)
pandas/lib.pyx

Lines changed: 59 additions & 47 deletions
@@ -837,61 +837,70 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_2d(ndarray indexer0,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=1] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """

     cdef:
-        int i, b, n_indexer0, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, l, b, n_indexer0, n_blocks, tup_size
+        ndarray result
+        tuple tup
+        object v

     n_indexer0 = indexer0.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+1
-    l = []

-    for i from 0 <= i < n_indexer0:
+    result = np.empty(n_indexer0,dtype=dtype)
+    l = 0
+    for i in range(n_indexer0):

         if not mask[i]:

             tup = PyTuple_New(tup_size)
-            val = indexer0[i]
-            PyTuple_SET_ITEM(tup, 0, val)
-            Py_INCREF(val)

-            for b from 0 <= b < n_blocks:
+            v = indexer0[i]
+            PyTuple_SET_ITEM(tup, 0, v)
+            Py_INCREF(v)
+
+            for b in range(n_blocks):

-                v = values[b][:, i]
+                v = values[b][i]
                 if searchable[b]:
                     v = v[0]
+
                 PyTuple_SET_ITEM(tup, b+1, v)
                 Py_INCREF(v)

-            l.append(tup)
+            result[l] = tup
+            l += 1

-    return l
+    return result[0:l]

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=2] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """

     cdef:
-        int i, j, b, n_indexer0, n_indexer1, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result

     n_indexer0 = indexer0.shape[0]
     n_indexer1 = indexer1.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+2
-    l = []
+    result = np.empty(n_indexer0*n_indexer1,dtype=dtype)
+    l = 0
     for i from 0 <= i < n_indexer0:

         for j from 0 <= j < n_indexer1:

@@ -900,45 +909,49 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,

                 tup = PyTuple_New(tup_size)

-                val = indexer0[i]
-                PyTuple_SET_ITEM(tup, 0, val)
-                Py_INCREF(val)
-
-                val = indexer1[j]
-                PyTuple_SET_ITEM(tup, 1, val)
-                Py_INCREF(val)
+                v = indexer0[i]
+                PyTuple_SET_ITEM(tup, 0, v)
+                Py_INCREF(v)
+                v = indexer1[j]
+                PyTuple_SET_ITEM(tup, 1, v)
+                Py_INCREF(v)

                 for b from 0 <= b < n_blocks:

-                    v = values[b][:, i, j]
+                    v = values[b][i, j]
                     if searchable[b]:
                         v = v[0]
+
                     PyTuple_SET_ITEM(tup, b+2, v)
                     Py_INCREF(v)

-                l.append(tup)
+                result[l] = tup
+                l += 1

-    return l
+    return result[0:l]

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=3] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """

     cdef:
-        int i, j, k, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result

     n_indexer0 = indexer0.shape[0]
     n_indexer1 = indexer1.shape[0]
     n_indexer2 = indexer2.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+3
-    l = []
+    result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype)
+    l = 0
     for i from 0 <= i < n_indexer0:

         for j from 0 <= j < n_indexer1:

@@ -949,29 +962,28 @@ def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,

                     tup = PyTuple_New(tup_size)

-                    val = indexer0[i]
-                    PyTuple_SET_ITEM(tup, 0, val)
-                    Py_INCREF(val)
-
-                    val = indexer1[j]
-                    PyTuple_SET_ITEM(tup, 1, val)
-                    Py_INCREF(val)
-
-                    val = indexer2[k]
-                    PyTuple_SET_ITEM(tup, 2, val)
-                    Py_INCREF(val)
+                    v = indexer0[i]
+                    PyTuple_SET_ITEM(tup, 0, v)
+                    Py_INCREF(v)
+                    v = indexer1[j]
+                    PyTuple_SET_ITEM(tup, 1, v)
+                    Py_INCREF(v)
+                    v = indexer2[k]
+                    PyTuple_SET_ITEM(tup, 2, v)
+                    Py_INCREF(v)

                     for b from 0 <= b < n_blocks:

-                        v = values[b][:, i, j, k]
+                        v = values[b][i, j, k]
                         if searchable[b]:
                             v = v[0]
                         PyTuple_SET_ITEM(tup, b+3, v)
                         Py_INCREF(v)

-                    l.append(tup)
+                    result[l] = tup
+                    l += 1

-    return l
+    return result[0:l]

 #-------------------------------------------------------------------------------
 # Groupby-related functions

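All three kernels follow the same pattern: instead of appending tuples to a Python list that PyTables must convert on Table.append, they pre-allocate a record array with the table's own dtype (now passed in from write_data_chunk), fill it row by row while skipping masked rows, and return the trimmed slice. A pure-Python sketch of that pattern, using a hypothetical two-field dtype (the names 'index' and 'values_block_0' are illustrative):

    import numpy as np

    dtype = np.dtype([('index', 'i8'), ('values_block_0', 'f8', (2,))])

    indexer0 = np.arange(5, dtype='i8')
    values = np.random.randn(5, 2)                 # transposed: rows come first
    mask = np.array([0, 0, 1, 0, 0], dtype='u1')   # 1 = drop this row

    result = np.empty(len(indexer0), dtype=dtype)  # pre-allocated, not a list
    l = 0
    for i in range(len(indexer0)):
        if not mask[i]:
            result[l] = (indexer0[i], values[i])   # one tuple per table row
            l += 1

    rows = result[:l]   # trim the unused tail left by masked rows

Returning a correctly-typed ndarray lets self.table.append(rows) ingest the chunk directly, avoiding the per-row conversion the old list-of-tuples return forced on PyTables.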