Commit 411e97d

build: Use Pandas 2.0 forward compatible API (#582)
1 parent 71d418c commit 411e97d

File tree

6 files changed: 47 additions, 20 deletions

6 files changed

+47
-20
lines changed

python/tests/integration/arcticdb/version_store/test_update_with_date_range.py

Lines changed: 8 additions & 1 deletion

@@ -53,7 +53,14 @@ def __init__(self, wrapped: pd.DataFrame, *, with_timezone_attr: bool, timezone_

     def __getitem__(self, item):
         if isinstance(item, slice):
-            open_ended = slice(item.start + timedelta(microseconds=1), item.stop - timedelta(microseconds=1), item.step)
+            # Comparing datetimes with timezone to datetimes without timezone was deprecated in Pandas 1.2.0
+            # (see https://github.com/pandas-dev/pandas/pull/36148/) and is no longer supported in Pandas 2.0
+            # (see https://github.com/pandas-dev/pandas/pull/49492/).
+            # We explicitly remove the timezone from the start and stop of the slice to be able to use the
+            # index of the wrapped DataFrame.
+            start_wo_tz = item.start.replace(tzinfo=None) + timedelta(microseconds=1)
+            stop_wo_tz = item.stop.replace(tzinfo=None) - timedelta(microseconds=1)
+            open_ended = slice(start_wo_tz, stop_wo_tz, item.step)
         return CustomTimeseries(
             self.wrapped[open_ended],
             with_timezone_attr=self.with_timezone_attr,
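
For context, a minimal sketch of the incompatibility the comment above describes, using a hypothetical toy DataFrame rather than repository code: under Pandas 2.0, slicing a tz-naive DatetimeIndex with tz-aware bounds raises a TypeError, so the bounds must be made tz-naive first.

import pandas as pd
from datetime import timedelta, timezone

# Toy frame with a tz-naive DatetimeIndex (hypothetical, not repository code).
df = pd.DataFrame({"x": range(3)}, index=pd.date_range("2023-01-01", periods=3))

start = pd.Timestamp("2023-01-01", tz=timezone.utc)
stop = pd.Timestamp("2023-01-03", tz=timezone.utc)

# df[start:stop] raises TypeError under Pandas 2.0: tz-aware bounds cannot be
# compared with the tz-naive index. Dropping tzinfo restores the old behaviour.
start_wo_tz = start.replace(tzinfo=None) + timedelta(microseconds=1)
stop_wo_tz = stop.replace(tzinfo=None) - timedelta(microseconds=1)
print(df[start_wo_tz:stop_wo_tz])  # keeps only rows strictly between the bounds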

python/tests/unit/arcticdb/test_column_stats.py

Lines changed: 30 additions & 10 deletions

@@ -19,7 +19,11 @@ def generate_symbol(lib, sym):
     lib.write(sym, df0)
     lib.append(sym, df1)
     expected_column_stats = lib.read_index(sym)
-    expected_column_stats.drop(expected_column_stats.columns.difference(["start_index", "end_index"]), 1, inplace=True)
+    expected_column_stats.drop(
+        expected_column_stats.columns.difference(["start_index", "end_index"]),
+        axis=1,
+        inplace=True,
+    )
     expected_column_stats = expected_column_stats.iloc[[0, 1]]
     expected_column_stats["v1.0_MIN(col_1)"] = [df0["col_1"].min(), df1["col_1"].min()]
     expected_column_stats["v1.0_MAX(col_1)"] = [df0["col_1"].max(), df1["col_1"].max()]

@@ -41,7 +45,7 @@ def test_column_stats_basic_flow(lmdb_version_store_tiny_segment):
     expected_column_stats = generate_symbol(lib, sym)
     expected_column_stats.drop(
         expected_column_stats.columns.difference(["start_index", "end_index", "v1.0_MIN(col_1)", "v1.0_MAX(col_1)"]),
-        1,
+        axis=1,
         inplace=True,
     )

@@ -74,7 +78,11 @@ def test_column_stats_infinity(lmdb_version_store_tiny_segment):
     lib.append(sym, df1)
     lib.append(sym, df2)
     expected_column_stats = lib.read_index(sym)
-    expected_column_stats.drop(expected_column_stats.columns.difference(["start_index", "end_index"]), 1, inplace=True)
+    expected_column_stats.drop(
+        expected_column_stats.columns.difference(["start_index", "end_index"]),
+        axis=1,
+        inplace=True,
+    )
     expected_column_stats = expected_column_stats.iloc[[0, 1, 2]]
     expected_column_stats["v1.0_MIN(col_1)"] = [df0["col_1"].min(), df1["col_1"].min(), df2["col_1"].min()]
     expected_column_stats["v1.0_MAX(col_1)"] = [df0["col_1"].max(), df1["col_1"].max(), df2["col_1"].max()]

@@ -94,7 +102,7 @@ def test_column_stats_as_of(lmdb_version_store_tiny_segment):
     expected_column_stats = expected_column_stats.iloc[[0]]
     expected_column_stats.drop(
         expected_column_stats.columns.difference(["start_index", "end_index", "v1.0_MIN(col_1)", "v1.0_MAX(col_1)"]),
-        1,
+        axis=1,
         inplace=True,
     )

@@ -150,7 +158,7 @@ def test_column_stats_multiple_indexes_different_columns(lmdb_version_store_tiny

     expected_column_stats.drop(
         expected_column_stats.columns.difference(["start_index", "end_index", "v1.0_MIN(col_1)", "v1.0_MAX(col_1)"]),
-        1,
+        axis=1,
         inplace=True,
     )
     column_stats = lib.read_column_stats(sym)

@@ -251,7 +259,7 @@ def test_column_stats_multiple_creates(lmdb_version_store_tiny_segment):
     expected_column_stats = base_expected_column_stats.copy()
     expected_column_stats.drop(
         expected_column_stats.columns.difference(["start_index", "end_index", "v1.0_MIN(col_1)", "v1.0_MAX(col_1)"]),
-        1,
+        axis=1,
         inplace=True,
     )
     column_stats = lib.read_column_stats(sym)

@@ -287,10 +295,14 @@ def test_column_stats_duplicated_primary_index(lmdb_version_store_tiny_segment):
     lib = lmdb_version_store_tiny_segment
     sym = "test_column_stats_duplicated_primary_index"

-    total_df = df0.append(df1)
+    total_df = pd.concat((df0, df1))
     lib.write(sym, total_df)
     expected_column_stats = lib.read_index(sym)
-    expected_column_stats.drop(expected_column_stats.columns.difference(["start_index", "end_index"]), 1, inplace=True)
+    expected_column_stats.drop(
+        expected_column_stats.columns.difference(["start_index", "end_index"]),
+        axis=1,
+        inplace=True,
+    )
     expected_column_stats = expected_column_stats.iloc[[0, 1]]
     expected_column_stats["v1.0_MIN(col_1)"] = [df0["col_1"].min(), df1["col_1"].min()]
     expected_column_stats["v1.0_MAX(col_1)"] = [df0["col_1"].max(), df1["col_1"].max()]

@@ -324,7 +336,11 @@ def test_column_stats_dynamic_schema_missing_data(lmdb_version_store_tiny_segmen
     df = lib.read(sym).data

     expected_column_stats = lib.read_index(sym)
-    expected_column_stats.drop(expected_column_stats.columns.difference(["start_index", "end_index"]), 1, inplace=True)
+    expected_column_stats.drop(
+        expected_column_stats.columns.difference(["start_index", "end_index"]),
+        axis=1,
+        inplace=True,
+    )
     expected_column_stats = expected_column_stats.iloc[[0, 1, 2, 3, 4]]
     expected_column_stats["v1.0_MIN(col_1)"] = [
         df0["col_1"].min(),

@@ -395,7 +411,11 @@ def test_column_stats_dynamic_schema_types_changing(lmdb_version_store_tiny_segm
     lib.append(sym, df1)

     expected_column_stats = lib.read_index(sym)
-    expected_column_stats.drop(expected_column_stats.columns.difference(["start_index", "end_index"]), 1, inplace=True)
+    expected_column_stats.drop(
+        expected_column_stats.columns.difference(["start_index", "end_index"]),
+        axis=1,
+        inplace=True,
+    )
     expected_column_stats = expected_column_stats.iloc[[0, 1]]
     expected_column_stats["v1.0_MIN(int_widening)"] = [df0["int_widening"].min(), df1["int_widening"].min()]
     expected_column_stats["v1.0_MAX(int_widening)"] = [df0["int_widening"].max(), df1["int_widening"].max()]

python/tests/unit/arcticdb/version_store/test_aggregation_dynamic.py

Lines changed: 1 addition & 1 deletion

@@ -328,7 +328,7 @@ def test_aggregation_grouping_column_missing_from_row_group(lmdb_version_store_d
         {"to_sum": [3, 4]},
         index=np.arange(2, 4),
     )
-    expected = df0.append(df1).groupby("grouping_column").agg({"to_sum": "sum"})
+    expected = pd.concat((df0, df1)).groupby("grouping_column").agg({"to_sum": "sum"})

     symbol = "test_aggregation_grouping_column_missing_from_row_group"
     lib.write(symbol, df0)
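
This append-to-concat substitution is the same in every file that follows. A minimal sketch with hypothetical toy frames (not repository code): DataFrame.append was deprecated in Pandas 1.4 and removed in 2.0, and pd.concat is the forward compatible replacement.

import pandas as pd

df0 = pd.DataFrame({"to_sum": [1, 2]})
df1 = pd.DataFrame({"to_sum": [3, 4]})

# total = df0.append(df1)        # AttributeError under Pandas 2.0
total = pd.concat((df0, df1))    # row-wise concatenation, indexes kept as-is
print(total["to_sum"].tolist())  # [1, 2, 3, 4]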

python/tests/unit/arcticdb/version_store/test_empty_writes.py

Lines changed: 3 additions & 3 deletions

@@ -23,15 +23,15 @@ def test_write_no_rows(lmdb_version_store, sym):
     assert_frame_equal(lmdb_version_store.read(sym).data, df)

     df2 = pd.DataFrame([[1.3, 6, "test"]], columns=column_names, index=[pd.Timestamp(0)])
-    df2 = df.append(df2)
+    df2 = pd.concat((df, df2))
     # coercing not needed
     lmdb_version_store.append(sym, df2, dynamic_strings=True)
     assert_frame_equal(lmdb_version_store.read(sym).data, df2)

     df3 = pd.DataFrame(
         [[3.3, 8, None], [2.3, 10, "test2"]], columns=column_names, index=[pd.Timestamp(1), pd.Timestamp(2)]
     )
-    df2 = df2.append(df3)
+    df2 = pd.concat((df2, df3))
     # coercing not needed
     lmdb_version_store.append(sym, df3, dynamic_strings=True)
     assert_frame_equal(lmdb_version_store.read(sym).data, df2)

@@ -100,7 +100,7 @@ def test_write_no_rows_and_columns(lmdb_version_store_dynamic_schema, sym):
         columns=column_names + ["d"],
         index=[pd.Timestamp(3), pd.Timestamp(4)],
     )
-    df5 = df2.append(df4)
+    df5 = pd.concat((df2, df4))
     lmdb_version_store_dynamic_schema.append(sym, df4, dynamic_strings=True)
     assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df5)

python/tests/unit/arcticdb/version_store/test_parallel.py

Lines changed: 2 additions & 2 deletions

@@ -108,7 +108,7 @@ def test_sort_merge_write(lmdb_version_store):
         new_df = pd.DataFrame(data=vals, index=index)

         dataframes.append(new_df)
-        df = df.append(new_df)
+        df = pd.concat((df, new_df))
         dt = dt + datetime.timedelta(days=1)

     random.shuffle(dataframes)

@@ -139,7 +139,7 @@ def test_sort_merge_append(lmdb_version_store_dynamic_schema):
         vals = {c: random_floats(num_rows_per_day) for c in cols}
         new_df = pd.DataFrame(data=vals, index=index)
         dataframes.append(new_df)
-        df = df.append(new_df)
+        df = pd.concat((df, new_df))
         dt = dt + datetime.timedelta(days=1)

     half_way = len(dataframes) / 2

python/tests/unit/arcticdb/version_store/test_projection_dynamic.py

Lines changed: 3 additions & 3 deletions

@@ -55,18 +55,18 @@ def test_project_column_types_changing_and_missing(lmdb_version_store_dynamic_sc
     # uint8
     df = pd.DataFrame({"col_to_project": np.arange(2, dtype=np.uint8), "data_col": [2, 3]}, index=np.arange(2, 4))
     lib.append(symbol, df)
-    expected = expected.append(df)
+    expected = pd.concat((expected, df))
     # Missing
     df = pd.DataFrame({"data_col": [4, 5]}, index=np.arange(4, 6))
     lib.append(symbol, df)
-    expected = expected.append(df)
+    expected = pd.concat((expected, df))
     # int16
     df = pd.DataFrame(
         {"col_to_project": np.arange(200, 202, dtype=np.int16), "data_col": [6, 7]}, index=np.arange(6, 8)
     )
     lib.append(symbol, df)

-    expected = expected.append(df)
+    expected = pd.concat((expected, df))
     expected["projected_col"] = expected["col_to_project"] * 2
     q = QueryBuilder()
     q = q.apply("projected_col", q["col_to_project"] * 2)
