Skip to content

Commit 2d9df04

Browse files
authored
Merge pull request #1497 from cmu-delphi/ndefries/gs-deprecated-pandas-fns
Return google symptoms to using current pandas version, with mitigations
2 parents 58a57df + 1872b81 commit 2d9df04

File tree

5 files changed

+35 additions, -8 deletions

google_symptoms/delphi_google_symptoms/geo.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ def generate_transition_matrix(geo_res):
3737
if geo_res == "hrr":
3838
map_df["population"] = map_df["population"] * map_df["weight"]
3939

40-
aggregated_pop = map_df.groupby(geo_res).sum().reset_index()
40+
aggregated_pop = map_df.groupby(geo_res).sum(numeric_only=True).reset_index()
4141
map_df = map_df.merge(
4242
aggregated_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"]
4343
)
@@ -79,8 +79,11 @@ def geo_map(df, geo_res, namescols = None):
7979
return df
8080

8181
map_df = generate_transition_matrix(geo_res)
82-
converted_df = pd.DataFrame(columns = df.columns)
83-
for _date in df["timestamp"].unique():
82+
83+
dates_list = df["timestamp"].unique()
84+
dfs_list = [pd.DataFrame()] * len(dates_list)
85+
86+
for i, _date in enumerate(dates_list):
8487
val_lists = df[df["timestamp"] == _date].merge(
8588
map_df["geo_id"], how="right"
8689
)[namescols].fillna(0)
@@ -92,5 +95,8 @@ def geo_map(df, geo_res, namescols = None):
9295
newdf["geo_id"] = list(map_df.keys())[1:]
9396
mask = (newdf == 0)
9497
newdf[mask] = np.nan
95-
converted_df = converted_df.append(newdf)
96-
return converted_df
98+
dfs_list[i] = newdf
99+
100+
# Reindex to make sure output has same columns as input df. Filled with
101+
# NaN values if column doesn't already exist.
102+
return pd.concat(dfs_list).reindex(df.columns, axis=1)

google_symptoms/delphi_google_symptoms/pull.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,7 @@ def preprocess(df, level):
8282
index_df = pd.MultiIndex.from_product(
8383
[geo_list, date_list], names=['geo_id', 'date']
8484
)
85+
df.date = pd.to_datetime(df.date)
8586
df = df.set_index(
8687
["geo_id", "date"]
8788
).reindex(
@@ -296,7 +297,7 @@ def pull_gs_data(credentials, export_start_date, export_end_date, num_export_day
296297
df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
297298
"geo_id", axis=1)
298299
df_dc_county["geo_id"] = DC_FIPS
299-
dfs["county"] = dfs["county"].append(df_dc_county)
300+
dfs["county"] = pd.concat([dfs["county"], df_dc_county])
300301
except KeyError:
301302
pass
302303

google_symptoms/setup.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,15 @@
44
required = [
55
"mock",
66
"numpy",
7-
"pandas==1.3.5",
7+
"pandas",
88
"pydocstyle",
99
"pytest",
1010
"pytest-cov",
1111
"pylint==2.8.3",
1212
"delphi-utils",
1313
"freezegun",
14-
"pandas-gbq"
14+
"pandas-gbq",
15+
"db-dtypes"
1516
]
1617

1718
setup(

google_symptoms/tests/test_geo.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ def test_hrr(self):
4747
).drop("weight", axis="columns")
4848
hrr_pop = fips2hrr.groupby("hrr"
4949
).sum(
50+
numeric_only=True
5051
).reset_index(
5152
).rename(columns={"population": "hrr_pop"})
5253
df_plus = df.merge(fips2hrr, left_on="geo_id", right_on="fips", how="left"
@@ -59,6 +60,7 @@ def test_hrr(self):
5960
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
6061
).groupby("hrr"
6162
).sum(
63+
numeric_only=True
6264
).drop(
6365
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
6466
axis="columns"
@@ -91,6 +93,7 @@ def test_msa(self):
9193
fips2msa = gmpr.add_population_column(gmpr.get_crosswalk("fips", "msa"), "fips")
9294
msa_pop = fips2msa.groupby("msa"
9395
).sum(
96+
numeric_only=True
9497
).reset_index(
9598
).rename(columns={"population": "msa_pop"})
9699
df_plus = df.merge(fips2msa, left_on="geo_id", right_on="fips", how="left"
@@ -103,6 +106,7 @@ def test_msa(self):
103106
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
104107
).groupby("msa"
105108
).sum(
109+
numeric_only=True
106110
).drop(
107111
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
108112
axis="columns"
@@ -136,6 +140,7 @@ def test_hhs(self):
136140
state2hhs = gmpr.add_geocode(state2hhs, "state_code", "hhs")
137141
hhs_pop = state2hhs.groupby("hhs"
138142
).sum(
143+
numeric_only=True
139144
).reset_index(
140145
).rename(columns={"population": "hhs_pop"})
141146
df_plus = df.merge(state2hhs, left_on="geo_id", right_on="state_id", how="left"
@@ -148,6 +153,7 @@ def test_hhs(self):
148153
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
149154
).groupby("hhs"
150155
).sum(
156+
numeric_only=True
151157
).drop(
152158
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
153159
axis="columns"
@@ -181,6 +187,7 @@ def test_nation(self):
181187
state2nation = gmpr.add_geocode(state2nation, "state_code", "nation")
182188
nation_pop = state2nation.groupby("nation"
183189
).sum(
190+
numeric_only=True
184191
).reset_index(
185192
).rename(columns={"population": "nation_pop"})
186193
df_plus = df.merge(state2nation, left_on="geo_id", right_on="state_id", how="left"
@@ -193,6 +200,7 @@ def test_nation(self):
193200
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
194201
).groupby("nation"
195202
).sum(
203+
numeric_only=True
196204
).drop(
197205
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
198206
axis="columns"

google_symptoms/tests/test_pull.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import pytest
22
import mock
3+
import db_dtypes
34
from freezegun import freeze_time
45
from datetime import date, datetime
56
import pandas as pd
@@ -90,6 +91,16 @@ def test_invalid_fips(self):
9091
with pytest.raises(AssertionError):
9192
preprocess(df, "county")
9293

94+
def test_no_rows_nulled(self):
95+
"""
96+
Check that rows are not mysteriously nulled out. See
97+
https://github.com/cmu-delphi/covidcast-indicators/pull/1496 for motivating issue.
98+
"""
99+
# Cast date field to `dbdate` to match dataframe dtypes as provided by the BigQuery fetch.
100+
df = pd.read_csv(good_input["state"]).astype({"date": "dbdate"})
101+
out = preprocess(df, "state")
102+
assert df.shape[0] == out[~out.Cough.isna()].shape[0]
103+
93104

94105
class TestPullHelperFuncs:
95106
@freeze_time("2021-01-05")

0 commit comments

Comments
 (0)