diff --git a/RELEASE.rst b/RELEASE.rst
index 2eb7980458f8e..51fdd527afdfa 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -47,6 +47,7 @@ pandas 0.11.0

  **Improvements to existing features**

+  - Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
  - added ``blocks`` attribute to DataFrames, to return a dict of dtypes to
    homogeneously dtyped DataFrames
  - added keyword ``convert_numeric`` to ``convert_objects()`` to try to
@@ -62,6 +63,8 @@ pandas 0.11.0
    strings that can be parsed with datetime.strptime
  - Add ``axes`` property to ``Series`` for compatibility
  - Add ``xs`` function to ``Series`` for compatibility
+  - Add ``chunksize`` parameter to ``to_csv`` to allow writing in chunks
+    to enable constant memory usage

 **API Changes**

@@ -183,6 +186,7 @@ pandas 0.11.0
 .. _GH3012: https://github.com/pydata/pandas/issues/3012
 .. _GH3029: https://github.com/pydata/pandas/issues/3029
 .. _GH3041: https://github.com/pydata/pandas/issues/3041
+.. _GH3059: https://github.com/pydata/pandas/issues/3059

 pandas 0.10.1

diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
index 60ec7de5c4d8e..09289bab5a0f4 100644
--- a/doc/source/v0.11.0.txt
+++ b/doc/source/v0.11.0.txt
@@ -229,6 +229,8 @@ API changes
 Enhancements
 ~~~~~~~~~~~~

+  - Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
+
  - Numexpr is now a :ref:`Recommended Dependencies `,
    to accelerate certain types of numerical and boolean operations
@@ -331,3 +333,4 @@ on GitHub for a complete list.
 .. _GH2806: https://github.com/pydata/pandas/issues/2806
 .. _GH2807: https://github.com/pydata/pandas/issues/2807
 .. _GH2918: https://github.com/pydata/pandas/issues/2918
+.. _GH3059: https://github.com/pydata/pandas/issues/3059
diff --git a/pandas/core/common.py b/pandas/core/common.py
index a3e8c09839891..207ed2edac4bc 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -101,7 +101,6 @@ def _isnull_old(obj):

 _isnull = _isnull_new

-
 def _use_inf_as_null(key):
     '''Option change callback for null/inf behaviour
     Choose which replacement for numpy.isnan / -numpy.isfinite is used.
@@ -1594,6 +1593,26 @@ def _check_as_is(x):
         # empty queue
         self.queue.truncate(0)

+    def writerows(self, rows):
+        def _check_as_is(x):
+            return (self.quoting == csv.QUOTE_NONNUMERIC and
+                    is_number(x)) or isinstance(x, str)
+
+        for i, row in enumerate(rows):
+            rows[i] = [x if _check_as_is(x)
+                       else pprint_thing(x).encode('utf-8') for x in row]
+
+        self.writer.writerows([[s for s in row] for row in rows])
+        # Fetch UTF-8 output from the queue ...
+        data = self.queue.getvalue()
+        data = data.decode("utf-8")
+        # ... and reencode it into the target encoding
+        data = self.encoder.encode(data)
+        # write to the target stream
+        self.stream.write(data)
+        # empty queue
+        self.queue.truncate(0)
+

 _NS_DTYPE = np.dtype('M8[ns]')

diff --git a/pandas/core/format.py b/pandas/core/format.py
index 003b1fefd01f7..ef14c830e1c37 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -9,7 +9,7 @@
 from io import StringIO

 from pandas.core.common import adjoin, isnull, notnull
-from pandas.core.index import MultiIndex, _ensure_index
+from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.util import py3compat
 from pandas.core.config import get_option, set_option, reset_option
 import pandas.core.common as com
@@ -18,6 +18,7 @@
 import numpy as np

 import itertools
+import csv

 from pandas.tseries.period import PeriodIndex

@@ -763,6 +764,260 @@ def grouper(x):
     return result


+class CSVFormatter(object):
+
+    def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
+                 cols=None, header=True, index=True, index_label=None,
+                 mode='w', nanRep=None, encoding=None, quoting=None,
+                 line_terminator='\n', chunksize=None, legacy=False):
+        self.legacy = legacy  # remove for 0.12
+        self.obj = obj
+        self.path_or_buf = path_or_buf
+        self.sep = sep
+        self.na_rep = na_rep
+        self.float_format = float_format
+
+        self.header = header
+        self.index = index
+        self.index_label = index_label
+        self.mode = mode
+        self.encoding = encoding
+
+        if quoting is None:
+            quoting = csv.QUOTE_MINIMAL
+        self.quoting = quoting
+
+        self.line_terminator = line_terminator
+
+        if cols is None:
+            cols = obj.columns
+
+        if isinstance(cols, Index):
+            cols = cols.to_native_types(na_rep=na_rep, float_format=float_format)
+        else:
+            cols = list(cols)
+        self.cols = cols
+
+        # preallocate data 2d list
+        self.blocks = self.obj._data.blocks
+        ncols = sum(len(b.items) for b in self.blocks)
+        self.data = [None] * ncols
+
+        # fail early if we have duplicate columns
+        if len(set(self.cols)) != len(self.cols):
+            raise Exception("duplicate columns are not permitted in to_csv")
+
+        self.colname_map = dict((k, i) for i, k in enumerate(obj.columns))
+
+        if chunksize is None:
+            chunksize = (100000 / (len(self.cols) or 1)) or 1
+        self.chunksize = chunksize
+
+        self.data_index = obj.index
+        if isinstance(obj.index, PeriodIndex):
+            self.data_index = obj.index.to_timestamp()
+
+        self.nlevels = getattr(self.data_index, 'nlevels', 1)
+        if not index:
+            self.nlevels = 0
+
+    # legacy to be removed in 0.12
+    def _helper_csv(self, writer, na_rep=None, cols=None,
+                    header=True, index=True,
+                    index_label=None, float_format=None):
+        if cols is None:
+            cols = self.obj.columns
+
+        series = {}
+        for k, v in self.obj._series.iteritems():
+            series[k] = v.values
+
+        has_aliases = isinstance(header, (tuple, list, np.ndarray))
+        if has_aliases or header:
+            if index:
+                # should write something for index label
+                if index_label is not False:
+                    if index_label is None:
+                        if isinstance(self.obj.index, MultiIndex):
+                            index_label = []
+                            for i, name in enumerate(self.obj.index.names):
+                                if name is None:
+                                    name = ''
+                                index_label.append(name)
+                        else:
+                            index_label = self.obj.index.name
+                            if index_label is None:
+                                index_label = ['']
+                            else:
+                                index_label = [index_label]
+                    elif not isinstance(index_label, (list, tuple, np.ndarray)):
+                        # given a string for a DF with Index
+                        index_label = [index_label]
+
+                    encoded_labels = list(index_label)
+                else:
+                    encoded_labels = []
+
+                if has_aliases:
+                    if len(header) != len(cols):
+                        raise ValueError(('Writing %d cols but got %d aliases'
+                                          % (len(cols), len(header))))
+                    else:
+                        write_cols = header
+                else:
+                    write_cols = cols
+                encoded_cols = list(write_cols)
+
+                writer.writerow(encoded_labels + encoded_cols)
+            else:
+                encoded_cols = list(cols)
+                writer.writerow(encoded_cols)
+
+        data_index = self.obj.index
+        if isinstance(self.obj.index, PeriodIndex):
+            data_index = self.obj.index.to_timestamp()
+
+        nlevels = getattr(data_index, 'nlevels', 1)
+        for j, idx in enumerate(data_index):
+            row_fields = []
+            if index:
+                if nlevels == 1:
+                    row_fields = [idx]
+                else:  # handle MultiIndex
+                    row_fields = list(idx)
+            for i, col in enumerate(cols):
+                val = series[col][j]
+                if lib.checknull(val):
+                    val = na_rep
+
+                if float_format is not None and com.is_float(val):
+                    val = float_format % val
+                elif isinstance(val, np.datetime64):
+                    val = lib.Timestamp(val)._repr_base
+
+                row_fields.append(val)
+
+            writer.writerow(row_fields)
+
+    def save(self):
+        # create the writer & save
+        if hasattr(self.path_or_buf, 'read'):
+            f = self.path_or_buf
+            close = False
+        else:
+            f = com._get_handle(self.path_or_buf, self.mode,
+                                encoding=self.encoding)
+            close = True
+
+        try:
+            if self.encoding is not None:
+                self.writer = com.UnicodeWriter(f, lineterminator=self.line_terminator,
+                                                delimiter=self.sep,
+                                                encoding=self.encoding,
+                                                quoting=self.quoting)
+            else:
+                self.writer = csv.writer(f, lineterminator=self.line_terminator,
+                                         delimiter=self.sep,
+                                         quoting=self.quoting)
+
+            if self.legacy:
+                # to be removed in 0.12
+                self._helper_csv(self.writer, na_rep=self.na_rep,
+                                 float_format=self.float_format,
+                                 cols=self.cols, header=self.header,
+                                 index=self.index,
+                                 index_label=self.index_label)
+            else:
+                self._save()
+
+        finally:
+            if close:
+                f.close()
+
+    def _save_header(self):
+        writer = self.writer
+        obj = self.obj
+        index_label = self.index_label
+        cols = self.cols
+        header = self.header
+
+        has_aliases = isinstance(header, (tuple, list, np.ndarray))
+        if has_aliases or self.header:
+            if self.index:
+                # should write something for index label
+                if index_label is not False:
+                    if index_label is None:
+                        if isinstance(obj.index, MultiIndex):
+                            index_label = []
+                            for i, name in enumerate(obj.index.names):
+                                if name is None:
+                                    name = ''
+                                index_label.append(name)
+                        else:
+                            index_label = obj.index.name
+                            if index_label is None:
+                                index_label = ['']
+                            else:
+                                index_label = [index_label]
+                    elif not isinstance(index_label, (list, tuple, np.ndarray)):
+                        # given a string for a DF with Index
+                        index_label = [index_label]
+
+                    encoded_labels = list(index_label)
+                else:
+                    encoded_labels = []
+
+                if has_aliases:
+                    if len(header) != len(cols):
+                        raise ValueError(('Writing %d cols but got %d aliases'
+                                          % (len(cols), len(header))))
+                    else:
+                        write_cols = header
+                else:
+                    write_cols = cols
+                encoded_cols = list(write_cols)
+
+                writer.writerow(encoded_labels + encoded_cols)
+            else:
+                encoded_cols = list(cols)
+                writer.writerow(encoded_cols)
+
+    def _save(self):
+        self._save_header()
+
+        nrows = len(self.data_index)
+
+        # write in chunksize bites
+        chunksize = self.chunksize
+        chunks = int(nrows / chunksize) + 1
+
+        for i in xrange(chunks):
+            start_i = i * chunksize
+            end_i = min((i + 1) * chunksize, nrows)
+            if start_i >= end_i:
+                break
+
+            self._save_chunk(start_i, end_i)
+
+    def _save_chunk(self, start_i, end_i):
+        colname_map = self.colname_map
+        data_index = self.data_index
+
+        # create the data for a chunk
+        slicer = slice(start_i, end_i)
+        for i in range(len(self.blocks)):
+            b = self.blocks[i]
+            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                  float_format=self.float_format)
+            for j, k in enumerate(b.items):
+                # self.data is a preallocated list
+                self.data[colname_map[k]] = d[j]
+
+        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                        float_format=self.float_format)
+
+        lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
+
 # from collections import namedtuple
 # ExcelCell = namedtuple("ExcelCell",
 #                        'row, col, val, style, mergestart, mergeend')

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ee586a2101f62..7cfb9ec03ba83 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -14,7 +14,6 @@
 from itertools import izip
 from StringIO import StringIO

-import csv
 import operator
 import sys
@@ -1289,87 +1288,10 @@ def to_panel(self):

     to_wide = deprecate('to_wide', to_panel)

-    def _helper_csv(self, writer, na_rep=None, cols=None,
-                    header=True, index=True,
-                    index_label=None, float_format=None):
-        if cols is None:
-            cols = self.columns
-
-        series = {}
-        for k, v in self._series.iteritems():
-            series[k] = v.values
-
-        has_aliases = isinstance(header, (tuple, list, np.ndarray))
-        if has_aliases or header:
-            if index:
-                # should write something for index label
-                if index_label is not False:
-                    if index_label is None:
-                        if isinstance(self.index, MultiIndex):
-                            index_label = []
-                            for i, name in enumerate(self.index.names):
-                                if name is None:
-                                    name = ''
-                                index_label.append(name)
-                        else:
-                            index_label = self.index.name
-                            if index_label is None:
-                                index_label = ['']
-                            else:
-                                index_label = [index_label]
-                    elif not isinstance(index_label, (list, tuple, np.ndarray)):
-                        # given a string for a DF with Index
-                        index_label = [index_label]
-
-                    encoded_labels = list(index_label)
-                else:
-                    encoded_labels = []
-
-                if has_aliases:
-                    if len(header) != len(cols):
-                        raise ValueError(('Writing %d cols but got %d aliases'
-                                          % (len(cols), len(header))))
-                    else:
-                        write_cols = header
-                else:
-                    write_cols = cols
-                encoded_cols = list(write_cols)
-
-                writer.writerow(encoded_labels + encoded_cols)
-            else:
-                encoded_cols = list(cols)
-                writer.writerow(encoded_cols)
-
-        data_index = self.index
-        if isinstance(self.index, PeriodIndex):
-            data_index = self.index.to_timestamp()
-
-        nlevels = getattr(data_index, 'nlevels', 1)
-        for j, idx in enumerate(data_index):
-            row_fields = []
-            if index:
-                if nlevels == 1:
-                    row_fields = [idx]
-                else:  # handle MultiIndex
-                    row_fields = list(idx)
-            for i, col in enumerate(cols):
-                val = series[col][j]
-                if lib.checknull(val):
-                    val = na_rep
-
-                if float_format is not None and com.is_float(val):
-                    val = float_format % val
-                elif isinstance(val, np.datetime64):
-                    val = lib.Timestamp(val)._repr_base
-
-                row_fields.append(val)
-
-            writer.writerow(row_fields)
-
     def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                cols=None, header=True, index=True, index_label=None,
                mode='w', nanRep=None, encoding=None, quoting=None,
-               line_terminator='\n'):
+               line_terminator='\n', chunksize=None, **kwds):
         """
         Write DataFrame to a comma-separated values (csv) file
@@ -1406,6 +1328,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
             file
         quoting : optional constant from csv module
             defaults to csv.QUOTE_MINIMAL
+        chunksize : rows to write at a time
         """
         if nanRep is not None:  # pragma: no cover
             import warnings
@@ -1413,32 +1336,17 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                           FutureWarning)
             na_rep = nanRep

-        if hasattr(path_or_buf, 'read'):
-            f = path_or_buf
-            close = False
-        else:
-            f = com._get_handle(path_or_buf, mode, encoding=encoding)
-            close = True
-
-        if quoting is None:
-            quoting = csv.QUOTE_MINIMAL
-
-        try:
-            if encoding is not None:
-                csvout = com.UnicodeWriter(f, lineterminator=line_terminator,
-                                           delimiter=sep, encoding=encoding,
-                                           quoting=quoting)
-            else:
-                csvout = csv.writer(f, lineterminator=line_terminator,
-                                    delimiter=sep, quoting=quoting)
-            self._helper_csv(csvout, na_rep=na_rep,
-                             float_format=float_format, cols=cols,
-                             header=header, index=index,
-                             index_label=index_label)
-
-        finally:
-            if close:
-                f.close()
+        formatter = fmt.CSVFormatter(self, path_or_buf,
+                                     line_terminator=line_terminator,
+                                     sep=sep, encoding=encoding,
+                                     quoting=quoting, na_rep=na_rep,
+                                     float_format=float_format, cols=cols,
+                                     header=header, index=index,
+                                     index_label=index_label,
+                                     chunksize=chunksize,
+                                     legacy=kwds.get("legacy", False))
+        formatter.save()

     def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
                  float_format=None, cols=None, header=True, index=True,
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 0f9776e202c00..8b42f2146a7cf 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -441,16 +441,7 @@ def format(self, name=False, formatter=None, na_rep='NaN'):
             return header + list(self.map(formatter))

         if self.is_all_dates:
-            zero_time = time(0, 0)
-            result = []
-            for dt in self:
-                if isnull(dt):
-                    result.append(u'NaT')
-                else:
-                    if dt.time() != zero_time or dt.tzinfo is not None:
-                        return header + [u'%s' % x for x in self]
-                    result.append(u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day))
-            return header + result
+            return header + _date_formatter(self)

         values = self.values

@@ -472,6 +463,20 @@ def format(self, name=False, formatter=None, na_rep='NaN'):
         result = _trim_front(format_array(values, None, justify='left'))
         return header + result

+    def to_native_types(self, slicer=None, na_rep='', float_format=None):
+        values = self
+        if slicer is not None:
+            values = values[slicer]
+        mask = isnull(values)
+        values = np.array(values, dtype=object)
+
+        if self.is_all_dates:
+            return _date_formatter(self)
+        else:
+            values[mask] = na_rep
+
+        return values.tolist()
+
     def equals(self, other):
         """
         Determines if two Index objects contain the same elements.
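A note for readers tracing the new write path: every `to_native_types` hook this patch introduces (on `Index` above, and on the blocks in `internals.py` below) applies the same conversion pattern — slice the values, upcast to an object array, write `na_rep` over the null mask, then optionally apply `float_format` to the remaining entries. A minimal standalone sketch of that pattern, with a self-inequality NaN test standing in for `pandas.core.common.isnull`:

    import numpy as np

    def to_native_types(values, slicer=None, na_rep='', float_format=None):
        # slice first, then render: nulls become na_rep, the rest objects
        if slicer is not None:
            values = values[slicer]
        values = np.array(values, dtype=object)
        mask = np.array([v != v for v in values], dtype=bool)  # NaN != NaN
        values[mask] = na_rep
        if float_format:
            imask = ~mask
            values[imask] = np.array([float_format % v for v in values[imask]])
        return values.tolist()

    print(to_native_types(np.array([1.5, np.nan, 3.0]), na_rep='NA', float_format='%.2f'))
    # -> ['1.50', 'NA', '3.00']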
@@ -1481,6 +1486,12 @@ def __repr__(self):
     def __len__(self):
         return len(self.labels[0])

+    def to_native_types(self, slicer=None, na_rep='', float_format=None):
+        ix = self
+        if slicer:
+            ix = self[slicer]
+        return ix.tolist()
+
     @property
     def _constructor(self):
         return MultiIndex.from_tuples
@@ -2578,6 +2589,22 @@ def _wrap_joined_index(self, joined, other):

 # For utility purposes

+def _date_formatter(obj, na_rep=u'NaT'):
+    data = list(obj)
+
+    # tz formatter or time formatter
+    zero_time = time(0, 0)
+    for d in data:
+        if d.time() != zero_time or d.tzinfo is not None:
+            return [u'%s' % x for x in data]
+
+    values = np.array(data, dtype=object)
+    mask = isnull(obj.values)
+    values[mask] = na_rep
+
+    imask = -mask
+    values[imask] = np.array([u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)
+                              for dt in values[imask]])
+    return values.tolist()
+

 def _sparsify(label_list, start=0):
     pivoted = zip(*label_list)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 2a41bbffa3b83..3467b72541481 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -4,13 +4,14 @@
 from numpy import nan
 import numpy as np

-from pandas.core.common import _possibly_downcast_to_dtype
+from pandas.core.common import isnull, _possibly_downcast_to_dtype
 from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
 from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
 import pandas.core.common as com
 import pandas.lib as lib
 import pandas.tslib as tslib
+from pandas.tslib import Timestamp

 from pandas.util import py3compat
@@ -259,6 +260,17 @@ def _try_cast_result(self, result):
             we may have roundtripped thru object in the mean-time """
         return result

+    def to_native_types(self, slicer=None, na_rep='', **kwargs):
+        """ convert to our native types format, slicing if desired """
+
+        values = self.values
+        if slicer is not None:
+            values = values[:, slicer]
+        values = np.array(values, dtype=object)
+        mask = isnull(values)
+        values[mask] = na_rep
+        return values.tolist()
+
     def replace(self, to_replace, value, inplace=False):
         new_values = self.values if inplace else self.values.copy()
         if self._can_hold_element(value):
@@ -577,6 +589,20 @@ def _try_cast(self, element):
         except:  # pragma: no cover
             return element

+    def to_native_types(self, slicer=None, na_rep='', float_format=None, **kwargs):
+        """ convert to our native types format, slicing if desired """
+
+        values = self.values
+        if slicer is not None:
+            values = values[:, slicer]
+        values = np.array(values, dtype=object)
+        mask = isnull(values)
+        values[mask] = na_rep
+        if float_format:
+            imask = (-mask).ravel()
+            values.flat[imask] = np.array([float_format % val
+                                           for val in values.ravel()[imask]])
+        return values.tolist()
+
     def should_store(self, value):
         # when inserting a column should not coerce integers to floats
         # unnecessarily
@@ -701,6 +727,25 @@ def _try_cast(self, element):
         except:
             return element

+    def to_native_types(self, slicer=None, na_rep=None, **kwargs):
+        """ convert to our native types format, slicing if desired """
+
+        values = self.values
+        if slicer is not None:
+            values = values[:, slicer]
+        mask = isnull(values)
+
+        rvalues = np.empty(self.shape, dtype=object)
+        if na_rep is None:
+            na_rep = 'NaT'
+        rvalues[mask] = na_rep
+        imask = (-mask).ravel()
+        if self.dtype == 'datetime64[ns]':
+            rvalues.flat[imask] = np.array([Timestamp(val)._repr_base
+                                            for val in values.ravel()[imask]],
+                                           dtype=object)
+        elif self.dtype == 'timedelta64[ns]':
+            rvalues.flat[imask] = np.array([lib.repr_timedelta64(val)
+                                            for val in values.ravel()[imask]],
+                                           dtype=object)
+        return rvalues.tolist()
+
     def should_store(self, value):
         return issubclass(value.dtype.type, np.datetime64)
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 1fd579553f094..e12b524dda736 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -784,6 +784,54 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje
     return arr

+@cython.boundscheck(False)
+@cython.wraparound(False)
+def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer):
+
+    cdef int N, j, i, ncols
+    cdef list rows
+    cdef object val
+
+    # In crude testing, N>100 yields little marginal improvement
+    N = 100
+
+    # pre-allocate rows
+    ncols = len(cols)
+    rows = [[None] * (nlevels + ncols) for x in range(N)]
+
+    j = -1
+    if nlevels == 1:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            row[0] = data_index[j]
+            for i in range(ncols):
+                row[1 + i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+    elif nlevels > 1:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            row[:nlevels] = list(data_index[j])
+            for i in range(ncols):
+                row[nlevels + i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+    else:
+        for j in range(len(data_index)):
+            row = rows[j % N]
+            for i in range(ncols):
+                row[i] = data[i][j]
+
+            if j >= N - 1 and j % N == N - 1:
+                writer.writerows(rows)
+
+    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
+        writer.writerows(rows[:((j + 1) % N)])
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_2d(ndarray indexer0,
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 1c30dfd1abced..7051c193dffd4 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -4450,6 +4450,115 @@ def test_to_csv_from_csv(self):

         os.remove(path)

+    def test_to_csv_moar(self):
+        from pandas.util.testing import makeCustomDataframe as mkdf
+        path = '__tmp_to_csv_dupe_cols__'
+
+        def _do_test(df, path, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None):
+            try:
+                df.to_csv(path, encoding='utf8')
+                recons = DataFrame.from_csv(path)
+            except:
+                os.remove(path)
+                raise
+            else:
+                def _to_uni(x):
+                    if not isinstance(x, unicode):
+                        return x.decode('utf8')
+                    return x
+                if rnlvl:
+                    delta_lvl = [recons.icol(i).values for i in range(rnlvl - 1)]
+                    ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
+                    recons.index = ix
+                    recons = recons.iloc[:, rnlvl - 1:]
+
+                if cnlvl:
+                    def stuple_to_tuple(x):
+                        import re
+                        x = x.split(",")
+                        x = map(lambda x: re.sub("[\'\"\s\(\)]", "", x), x)
+                        return x
+
+                    cols = MultiIndex.from_tuples(map(stuple_to_tuple, recons.columns))
+                    recons.columns = cols
+
+                type_map = dict(i='i', f='f', s='O', u='O', dt='O')
+                if r_dtype:
+                    if r_dtype == 'u':  # unicode
+                        r_dtype = 'O'
+                        recons.index = np.array(map(_to_uni, recons.index),
+                                                dtype=r_dtype)
+                        df.index = np.array(map(_to_uni, df.index), dtype=r_dtype)
+                    elif r_dtype == 'dt':  # datetime
+                        r_dtype = 'O'
+                        recons.index = np.array(map(Timestamp, recons.index),
+                                                dtype=r_dtype)
+                        df.index = np.array(map(Timestamp, df.index), dtype=r_dtype)
+                    else:
+                        r_dtype = type_map.get(r_dtype)
+                        recons.index = np.array(recons.index, dtype=r_dtype)
+                        df.index = np.array(df.index, dtype=r_dtype)
+                if c_dtype:
+                    if c_dtype == 'u':
+                        c_dtype = 'O'
+                        recons.columns = np.array(map(_to_uni, recons.columns),
+                                                  dtype=c_dtype)
+                        df.columns = np.array(map(_to_uni, df.columns),
+                                              dtype=c_dtype)
+                    elif c_dtype == 'dt':
+                        c_dtype = 'O'
+                        recons.columns = np.array(map(Timestamp, recons.columns),
+                                                  dtype=c_dtype)
+                        df.columns = np.array(map(Timestamp, df.columns),
+                                              dtype=c_dtype)
+                    else:
+                        c_dtype = type_map.get(c_dtype)
+                        recons.columns = np.array(recons.columns, dtype=c_dtype)
+                        df.columns = np.array(df.columns, dtype=c_dtype)
+
+                assert_frame_equal(df, recons, check_names=False)
+
+        N = 100
+
+        for ncols in [1, 10, 30]:
+            base = int((100000 / ncols or 1) or 1)
+            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
+                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
+                          base - 1, base, base + 1]:
+                print(nrows, ncols)
+                _do_test(mkdf(nrows, ncols), path)
+
+        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
+            df = mkdf(nrows, 10)
+            cols = list(df.columns)
+            cols[:1] = ["dupe", "dupe"]
+            cols[-1:] = ["dupe", "dupe"]
+            ix = list(df.index)
+            ix[:2] = ["rdupe", "rdupe"]
+            ix[-2:] = ["rdupe", "rdupe"]
+            print(nrows)
+
+            df.index = ix
+            _do_test(df, path)
+
+        for r_idx_type in ['i', 'f', 's', 'u', 'dt']:
+            for c_idx_type in ['i', 'f', 's', 'u', 'dt']:
+                print(r_idx_type, c_idx_type)
+                _do_test(mkdf(100, 1, r_idx_type=r_idx_type,
+                              c_idx_type=c_idx_type), path, r_idx_type, c_idx_type)
+                _do_test(mkdf(100, 2, r_idx_type=r_idx_type,
+                              c_idx_type=c_idx_type), path, r_idx_type, c_idx_type)
+
+        _do_test(DataFrame(index=range(10)), path)
+        _do_test(mkdf(50001, 2, r_idx_nlevels=2), path, rnlvl=2)
+        for ncols in [2, 10, 30]:
+            base = int(100000 / ncols)
+            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
+                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
+                          base - 1, base, base + 1]:
+                print(nrows, ncols)
+                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), path, rnlvl=2)
+                _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), path, cnlvl=2)
+                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2),
+                         path, rnlvl=2, cnlvl=2)
+
     def test_to_csv_from_csv_w_some_infs(self):
         path = '__%s__' % tm.rands(10)
@@ -4562,6 +4671,69 @@ def test_to_csv_withcommas(self):

         os.remove(path)

+    def test_to_csv_mixed(self):
+        filename = '__tmp_to_csv_mixed__.csv'
+
+        def create_cols(name):
+            return ["%s%03d" % (name, i) for i in xrange(5)]
+
+        df_float = DataFrame(np.random.randn(100, 5), dtype='float64',
+                             columns=create_cols('float'))
+        df_int = DataFrame(np.random.randn(100, 5), dtype='int64',
+                           columns=create_cols('int'))
+        df_bool = DataFrame(True, index=df_float.index, columns=create_cols('bool'))
+        df_object = DataFrame('foo', index=df_float.index, columns=create_cols('object'))
+        df_dt = DataFrame(Timestamp('20010101'), index=df_float.index,
+                          columns=create_cols('date'))
+
+        # add in some nans
+        df_float.ix[30:50, 1:3] = np.nan
+
+        #### this is a bug in read_csv right now ####
+        #df_dt.ix[30:50,1:3] = np.nan
+
+        df = pan.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
+
+        # dtype
+        dtypes = dict()
+        for n, dtype in [('float', np.float64), ('int', np.int64),
+                         ('bool', np.bool), ('object', np.object)]:
+            for c in create_cols(n):
+                dtypes[c] = dtype
+
+        df.to_csv(filename)
+
+        rs = pan.read_csv(filename, index_col=0, dtype=dtypes,
+                          parse_dates=create_cols('date'))
+        assert_frame_equal(rs, df)
+        os.remove(filename)
+
+    def test_to_csv_dups_cols(self):
+        filename = '__tmp_to_csv_dup_cols__.csv'
+
+        df = DataFrame(np.random.randn(1000, 30),
+                       columns=range(15) + range(15), dtype='float64')
+        self.assertRaises(Exception, df.to_csv, filename)
+
+        df_float = DataFrame(np.random.randn(1000, 30), dtype='float64')
+        df_int = DataFrame(np.random.randn(1000, 30), dtype='int64')
+        df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
+        df_object = DataFrame('foo', index=df_float.index, columns=df_float.columns)
+        df_dt = DataFrame(Timestamp('20010101'), index=df_float.index,
+                          columns=df_float.columns)
+        df = pan.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
+
+        #### this raises because we have duplicate column names across dtypes ####
+        self.assertRaises(Exception, df.to_csv, filename)
+
+    def test_to_csv_chunking(self):
+        filename = '__tmp_to_csv_chunking__.csv'
+
+        aa = DataFrame({'A': range(100000)})
+        aa['B'] = aa.A + 1.0
+        aa['C'] = aa.A + 2.0
+        aa['D'] = aa.A + 3.0
+
+        for chunksize in [10000, 50000, 100000]:
+            aa.to_csv(filename, chunksize=chunksize)
+            rs = pan.read_csv(filename, index_col=0)
+            assert_frame_equal(rs, aa)
+
+        os.remove(filename)
+
     def test_to_csv_bug(self):
         path = '__tmp_to_csv_bug__.csv'
         f1 = StringIO('a,1.0\nb,2.0')
diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py
index ba386bd0e9649..dc335a4f994d5 100644
--- a/vb_suite/io_bench.py
+++ b/vb_suite/io_bench.py
@@ -44,17 +44,34 @@
 """
 frame_to_csv = Benchmark("df.to_csv('__test__.csv')", setup,
                          start_date=datetime(2011, 1, 1))

+#----------------------------------
+
+setup = common_setup + """
+df = DataFrame({'A': range(100000)})
+df['B'] = df.A + 1.0
+df['C'] = df.A + 2.0
+df['D'] = df.A + 3.0
+"""
+frame_to_csv2 = Benchmark("df.to_csv('__test__.csv')", setup,
+                          start_date=datetime(2011, 1, 1))

 #----------------------------------
 setup = common_setup + """
 from pandas import concat, Timestamp

-df_float = DataFrame(np.random.randn(1000, 30),dtype='float64')
-df_int = DataFrame(np.random.randn(1000, 30),dtype='int64')
-df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns)
-df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns)
-df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns)
+def create_cols(name):
+    return ["%s%03d" % (name, i) for i in xrange(5)]
+
+df_float = DataFrame(np.random.randn(10000, 5), dtype='float64', columns=create_cols('float'))
+df_int = DataFrame(np.random.randn(10000, 5), dtype='int64', columns=create_cols('int'))
+df_bool = DataFrame(True, index=df_float.index, columns=create_cols('bool'))
+df_object = DataFrame('foo', index=df_float.index, columns=create_cols('object'))
+df_dt = DataFrame(Timestamp('20010101'), index=df_float.index, columns=create_cols('date'))
+
+# add in some nans
+df_float.ix[30:500, 1:3] = np.nan
+
 df = concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1)
+
 """
 frame_to_csv_mixed = Benchmark("df.to_csv('__test__.csv')", setup,
                                start_date=datetime(2012, 6, 1))
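Two closing sketches may help readers follow the new write path end to end. First, the chunking arithmetic in `CSVFormatter._save`: it computes a ceiling-style chunk count, clamps the final slice to `nrows`, and skips empty trailing chunks, which is what keeps memory usage roughly constant. A self-contained rendering of that logic (the name `iter_chunks` is ours, for illustration only):

    def iter_chunks(nrows, chunksize):
        # mirrors CSVFormatter._save: int(nrows / chunksize) + 1 candidate
        # chunks, end clamped to nrows, empty trailing chunks skipped
        chunks = int(nrows / chunksize) + 1
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i >= end_i:
                break
            yield start_i, end_i

    assert list(iter_chunks(10, 4)) == [(0, 4), (4, 8), (8, 10)]
    assert list(iter_chunks(8, 4)) == [(0, 4), (4, 8)]

From user code the knob is simply `df.to_csv('out.csv', chunksize=50000)`; peak memory then scales with the chunk size rather than with the length of the frame.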
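Second, the speedup in the Cython `lib.write_csv_rows` comes from recycling a pool of pre-allocated row buffers and flushing them with a single `writerows` call per batch, instead of one `writerow` call per row. A pure-Python sketch of the single-level-index case (illustrative only; the real function also handles MultiIndex rows and the index-less `nlevels == 0` case):

    import csv
    import io

    def write_csv_rows(data, data_index, cols, writer, N=100):
        # data is a list of per-column value sequences; the N row buffers
        # are allocated once and overwritten in place on each pass
        ncols = len(cols)
        rows = [[None] * (1 + ncols) for _ in range(N)]
        j = -1
        for j in range(len(data_index)):
            row = rows[j % N]
            row[0] = data_index[j]
            for i in range(ncols):
                row[1 + i] = data[i][j]
            if j % N == N - 1:           # pool full: flush all N rows at once
                writer.writerows(rows)
        if j >= 0 and j % N != N - 1:    # flush the final partial batch
            writer.writerows(rows[:(j + 1) % N])

    buf = io.StringIO()
    write_csv_rows([['x', 'y', 'z']], [0, 1, 2], ['col1'], csv.writer(buf), N=2)
    print(buf.getvalue())  # "0,x\r\n1,y\r\n2,z\r\n"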