From 4ecb51e8e39c257ea87b0ca7b68ed5df368d2f6f Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Sat, 30 Jan 2021 23:10:04 +0100 Subject: [PATCH 1/2] ASV: add benchmarks for concatenating and appending of CategoricalIndex (GH38149) --- asv_bench/benchmarks/categoricals.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index f3b005b704014..fa8cfcd760661 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -299,8 +299,13 @@ class Indexing: def setup(self): N = 10 ** 5 self.index = pd.CategoricalIndex(range(N), range(N)) + self.index_non_overlapping = pd.CategoricalIndex(range(N + 1), range(N + 1)) self.series = pd.Series(range(N), index=self.index).sort_index() self.category = self.index[500] + self.df = pd.DataFrame(range(N), columns=["a"], index=self.index) + self.df_non_overlapping = pd.DataFrame( + range(N + 1), columns=["a"], index=self.index_non_overlapping + ) def time_get_loc(self): self.index.get_loc(self.category) @@ -326,6 +331,18 @@ def time_reindex_missing(self): def time_sort_values(self): self.index.sort_values(ascending=False) + def time_append_index(self): + self.index.append(self.index) + + def time_append_non_overlapping_index(self): + self.index.append(self.index_non_overlapping) + + def time_concat_with_index(self): + pd.concat([self.df, self.df]) + + def time_concat_with_non_overlapping_index(self): + pd.concat([self.df, self.df_non_overlapping]) + class SearchSorted: def setup(self): From 1a2330427beefd19d665bbe661be7434192ea69b Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Fri, 5 Feb 2021 20:16:14 +0100 Subject: [PATCH 2/2] Move tests and rename variables --- asv_bench/benchmarks/categoricals.py | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index fa8cfcd760661..f4a6ed5f26c89 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -118,12 +118,29 @@ def setup(self): self.a = pd.Categorical(list("aabbcd") * N) self.b = pd.Categorical(list("bbcdjk") * N) + self.idx_a = pd.CategoricalIndex(range(N), range(N)) + self.idx_b = pd.CategoricalIndex(range(N + 1), range(N + 1)) + self.df_a = pd.DataFrame(range(N), columns=["a"], index=self.idx_a) + self.df_b = pd.DataFrame(range(N + 1), columns=["a"], index=self.idx_b) + def time_concat(self): pd.concat([self.s, self.s]) def time_union(self): union_categoricals([self.a, self.b]) + def time_append_overlapping_index(self): + self.idx_a.append(self.idx_a) + + def time_append_non_overlapping_index(self): + self.idx_a.append(self.idx_b) + + def time_concat_overlapping_index(self): + pd.concat([self.df_a, self.df_a]) + + def time_concat_non_overlapping_index(self): + pd.concat([self.df_a, self.df_b]) + class ValueCounts: @@ -299,13 +316,8 @@ class Indexing: def setup(self): N = 10 ** 5 self.index = pd.CategoricalIndex(range(N), range(N)) - self.index_non_overlapping = pd.CategoricalIndex(range(N + 1), range(N + 1)) self.series = pd.Series(range(N), index=self.index).sort_index() self.category = self.index[500] - self.df = pd.DataFrame(range(N), columns=["a"], index=self.index) - self.df_non_overlapping = pd.DataFrame( - range(N + 1), columns=["a"], index=self.index_non_overlapping - ) def time_get_loc(self): self.index.get_loc(self.category) @@ -331,18 +343,6 @@ def time_reindex_missing(self): def time_sort_values(self): self.index.sort_values(ascending=False) - def time_append_index(self): - self.index.append(self.index) - - def time_append_non_overlapping_index(self): - self.index.append(self.index_non_overlapping) - - def time_concat_with_index(self): - pd.concat([self.df, self.df]) - - def time_concat_with_non_overlapping_index(self): - pd.concat([self.df, self.df_non_overlapping]) - class SearchSorted: def setup(self):