From 09f9542cf707bf974da09bec5119114c2ed84e6b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 3 Dec 2017 22:59:09 -0800 Subject: [PATCH 1/3] CLN: ASV hdfstore benchmark --- asv_bench/benchmarks/hdfstore_bench.py | 68 ++++++++++++++------------ 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index 5aa8f76917797..fa4bc8c662478 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -1,31 +1,40 @@ -from .pandas_vb_common import * import os +import numpy as np +from pandas import DataFrame, Panel, date_range, HDFStore +import pandas.util.testing as tm -class HDF5(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000),}, - index=self.index) +from .pandas_vb_common import setup # noqa - self.df_mixed = DataFrame( - {'float1': randn(25000), 'float2': randn(25000), - 'string1': (['foo'] * 25000), - 'bool1': ([True] * 25000), - 'int1': np.random.randint(0, 250000, size=25000),}, - index=self.index) - self.df_wide = DataFrame(np.random.randn(25000, 100)) +class HDF5(object): - self.df2 = DataFrame({'float1': randn(25000), 'float2': randn(25000)}, - index=date_range('1/1/2000', periods=25000)) - self.df_wide2 = DataFrame(np.random.randn(25000, 100), - index=date_range('1/1/2000', periods=25000)) + goal_time = 0.2 - self.df_dc = DataFrame(np.random.randn(10000, 10), - columns=[('C%03d' % i) for i in range(10)]) + def setup(self): + N = 25000 + index = tm.makeStringIndex(N) + self.df = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=index) + self.df_mixed = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N), + 'string1': ['foo'] * N, + 'bool1': [True] * N, + 'int1': np.random.randint(0, N, size=N)}, + index=index) + self.df_wide = DataFrame(np.random.randn(N, 100)) + self.start_wide = self.df_wide.index[10000] + self.stop_wide = self.df_wide.index[15000] + self.df2 = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=date_range('1/1/2000', periods=N)) + self.start = self.df2.index[10000] + self.stop = self.df2.index[15000] + self.df_wide2 = DataFrame(np.random.randn(N, 100), + index=date_range('1/1/2000', periods=N)) + self.df_dc = DataFrame(np.random.randn(N, 10), + columns=['C%03d' % i for i in range(10)]) self.f = '__test__.h5' self.remove(self.f) @@ -82,14 +91,12 @@ def time_write_store_table_dc(self): self.store.append('table_dc_write', self.df_dc, data_columns=True) def time_query_store_table_wide(self): - start = self.df_wide2.index[10000] - stop = self.df_wide2.index[15000] - self.store.select('table_wide', where="index > start and index < stop") + self.store.select('table_wide', where="index > self.start_wide and " + "index < self.stop_wide") def time_query_store_table(self): - start = self.df2.index[10000] - stop = self.df2.index[15000] - self.store.select('table', where="index > start and index < stop") + self.store.select('table', where="index > self.start and " + "index < self.stop") def time_store_repr(self): repr(self.store) @@ -102,14 +109,15 @@ def time_store_info(self): class HDF5Panel(object): + goal_time = 0.2 def setup(self): self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), - items=[('Item%03d' % i) for i in range(20)], + self.p = Panel(np.random.randn(20, 1000, 25), + items=['Item%03d' % i for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), - minor_axis=[('E%03d' % i) for i in range(25)]) + minor_axis=['E%03d' % i for i in range(25)]) self.remove(self.f) self.store = HDFStore(self.f) self.store.append('p1', self.p) From 6f58ad432ee60d14341118abddc8fd78ed36e14b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 4 Dec 2017 21:49:29 -0800 Subject: [PATCH 2/3] small clean --- asv_bench/benchmarks/hdfstore_bench.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index fa4bc8c662478..7f87d3c00eeec 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -37,7 +37,6 @@ def setup(self): columns=['C%03d' % i for i in range(10)]) self.f = '__test__.h5' - self.remove(self.f) self.store = HDFStore(self.f) self.store.put('fixed', self.df) @@ -49,13 +48,7 @@ def setup(self): def teardown(self): self.store.close() - self.remove(self.f) - - def remove(self, f): - try: - os.remove(f) - except: - pass + os.remove(self.f) def time_read_store(self): self.store.get('fixed') @@ -118,19 +111,12 @@ def setup(self): items=['Item%03d' % i for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=['E%03d' % i for i in range(25)]) - self.remove(self.f) self.store = HDFStore(self.f) self.store.append('p1', self.p) def teardown(self): self.store.close() - self.remove(self.f) - - def remove(self, f): - try: - os.remove(f) - except: - pass + os.remove(self.f) def time_read_store_table_panel(self): self.store.select('p1') From db447949617dee266ac7668c3d78544b875de664 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 5 Dec 2017 21:59:58 -0800 Subject: [PATCH 3/3] Add IO base class --- asv_bench/benchmarks/hdfstore_bench.py | 12 ++++----- asv_bench/benchmarks/io_bench.py | 34 +++++++----------------- asv_bench/benchmarks/pandas_vb_common.py | 20 ++++++++++++++ 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index 7f87d3c00eeec..d7b3be25a18b9 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -1,13 +1,11 @@ -import os - import numpy as np from pandas import DataFrame, Panel, date_range, HDFStore import pandas.util.testing as tm -from .pandas_vb_common import setup # noqa +from .pandas_vb_common import BaseIO, setup # noqa -class HDF5(object): +class HDF5(BaseIO): goal_time = 0.2 @@ -48,7 +46,7 @@ def setup(self): def teardown(self): self.store.close() - os.remove(self.f) + self.remove(self.f) def time_read_store(self): self.store.get('fixed') @@ -101,7 +99,7 @@ def time_store_info(self): self.store.info() -class HDF5Panel(object): +class HDF5Panel(BaseIO): goal_time = 0.2 @@ -116,7 +114,7 @@ def setup(self): def teardown(self): self.store.close() - os.remove(self.f) + self.remove(self.f) def time_read_store_table_panel(self): self.store.select('p1') diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index c718b13912e73..e8112cc41f032 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -8,23 +8,7 @@ import timeit -class _BenchTeardown(object): - """ - base class for teardown method implementation - """ - fname = None - - def remove(self, f): - try: - os.remove(f) - except: - pass - - def teardown(self): - self.remove(self.fname) - - -class frame_to_csv(_BenchTeardown): +class frame_to_csv(BaseIO): goal_time = 0.2 fname = '__test__.csv' @@ -35,7 +19,7 @@ def time_frame_to_csv(self): self.df.to_csv(self.fname) -class frame_to_csv2(_BenchTeardown): +class frame_to_csv2(BaseIO): goal_time = 0.2 fname = '__test__.csv' @@ -49,7 +33,7 @@ def time_frame_to_csv2(self): self.df.to_csv(self.fname) -class frame_to_csv_date_formatting(_BenchTeardown): +class frame_to_csv_date_formatting(BaseIO): goal_time = 0.2 fname = '__test__.csv' @@ -61,7 +45,7 @@ def time_frame_to_csv_date_formatting(self): self.data.to_csv(self.fname, date_format='%Y%m%d') -class frame_to_csv_mixed(_BenchTeardown): +class frame_to_csv_mixed(BaseIO): goal_time = 0.2 fname = '__test__.csv' @@ -114,7 +98,7 @@ def time_read_csv_infer_datetime_format_ymd(self): read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) -class read_csv_skiprows(_BenchTeardown): +class read_csv_skiprows(BaseIO): goal_time = 0.2 fname = '__test__.csv' @@ -127,7 +111,7 @@ def time_read_csv_skiprows(self): read_csv(self.fname, skiprows=10000) -class read_csv_standard(_BenchTeardown): +class read_csv_standard(BaseIO): goal_time = 0.2 fname = '__test__.csv' @@ -174,7 +158,7 @@ def time_read_uint64_na_values(self): read_csv(StringIO(self.data1), header=None, na_values=self.na_values) -class write_csv_standard(_BenchTeardown): +class write_csv_standard(BaseIO): goal_time = 0.2 fname = '__test__.csv' @@ -218,14 +202,14 @@ def time_read_nrows(self, compression, engine): compression=compression, engine=engine) -class read_json_lines(_BenchTeardown): +class read_json_lines(BaseIO): goal_time = 0.2 fname = "__test__.json" def setup(self): self.N = 100000 self.C = 5 - self.df = DataFrame({('float{0}'.format(i), randn(self.N)) for i in range(self.C)}) + self.df = DataFrame({'float{0}'.format(i): randn(self.N) for i in range(self.C)}) self.df.to_json(self.fname,orient="records",lines=True) def time_read_json_lines(self): diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 62eb826418030..74517f184ae6f 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -1,3 +1,4 @@ +import os from pandas import * import pandas as pd from numpy.random import randn @@ -19,6 +20,25 @@ def setup(*args, **kwargs): np.random.seed(1234) + +class BaseIO(object): + """ + Base class for IO benchmarks + """ + fname = None + + def remove(self, f): + """Remove created files""" + try: + os.remove(f) + except: + # On Windows, attempting to remove a file that is in use + # causes an exception to be raised + pass + + def teardown(self): + self.remove(self.fname) + # try em until it works! for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']: try: