
Commit 195e530

feat: reading JSON data as the pyarrow JSON type when available
Parent: c3f51a2

8 files changed: 69 additions & 32 deletions

bigframes/core/array_value.py

Lines changed: 2 additions & 2 deletions
@@ -108,8 +108,8 @@ def from_table(
         raise ValueError("must set at most one of 'offests', 'primary_key'")
     if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
         msg = bfe.format_message(
-            "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
-            "is a preview feature and subject to change."
+            "JSON column interpretation as a PyArrow JSON extention type is a preview "
+            "feature and subject to change."
         )
         warnings.warn(msg, bfe.PreviewWarning)
     # define data source only for needed columns, this makes row-hashing cheaper
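
The warning category here is bigframes.exceptions.PreviewWarning, so callers who accept the preview behavior can silence it with the standard warnings machinery. A minimal sketch (the table name is hypothetical):

import warnings

import bigframes.exceptions as bfe
import bigframes.pandas as bpd

with warnings.catch_warnings():
    # Opt in to the preview JSON handling without the warning noise.
    warnings.simplefilter("ignore", category=bfe.PreviewWarning)
    df = bpd.read_gbq("my_dataset.table_with_json_col")  # hypothetical table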

bigframes/core/compile/ibis_types.py

Lines changed: 1 addition & 2 deletions
@@ -24,7 +24,6 @@
     dtype as python_type_to_ibis_type,
 )
 import bigframes_vendored.ibis.expr.types as ibis_types
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
 import pandas as pd
@@ -75,7 +74,7 @@
         IBIS_GEO_TYPE,
         gpd.array.GeometryDtype(),
     ),
-    (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
+    (ibis_dtypes.json, bigframes.dtypes.JSON_DTYPE),
 )

 BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {

bigframes/core/utils.py

Lines changed: 15 additions & 6 deletions
@@ -224,6 +224,15 @@ def timedelta_to_micros(
     raise TypeError(f"Unrecognized input type: {type(timedelta)}")


+def _is_timedelat64_dtype(dtype: dtypes.Dtype) -> bool:
+    try:
+        return pdtypes.is_timedelta64_dtype(dtype)
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
+        return False
+
+
 def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     """
     Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored.
@@ -234,11 +243,11 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     updated_columns = []

     for col in dataframe.columns:
-        if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
+        if _is_timedelat64_dtype(dataframe[col].dtype):
             dataframe[col] = dataframe[col].apply(timedelta_to_micros)
             updated_columns.append(col)

-    if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
+    if _is_timedelat64_dtype(dataframe.index.dtype):
         dataframe.index = dataframe.index.map(timedelta_to_micros)
         updated_columns.append(dataframe.index.name)

@@ -249,15 +258,15 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
     """
     Searches recursively for JSON array type within a PyArrow DataType.
     """
-    if arrow_type == dtypes.JSON_ARROW_TYPE:
-        return True
     if pa.types.is_list(arrow_type):
         return _search_for_nested_json_type(arrow_type.value_type)
     if pa.types.is_struct(arrow_type):
         for i in range(arrow_type.num_fields):
             if _search_for_nested_json_type(arrow_type.field(i).type):
                 return True
         return False
+    if dtypes.is_json_arrow_type(arrow_type):
+        return True
     return False


@@ -272,7 +281,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:

     for col in dataframe.columns:
         column_type = dataframe[col].dtype
-        if column_type == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(column_type):
             dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
             updated_columns.append(col)
         elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
@@ -283,7 +292,7 @@
             f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
         )

-    if dataframe.index.dtype == dtypes.JSON_DTYPE:
+    if dtypes.is_json_type(dataframe.index.dtype):
         dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
         updated_columns.append(dataframe.index.name)
     elif isinstance(
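
For context on the workaround: on pandas versions affected by pandas-dev/pandas#60958, dtype introspection helpers raise NotImplementedError when handed the PyArrow JSON extension dtype instead of returning False. A minimal reproduction sketch, assuming a PyArrow build that ships pa.json_:

import pandas as pd
import pandas.api.types as pdtypes
import pyarrow as pa

json_dtype = pd.ArrowDtype(pa.json_(pa.string()))
try:
    # On affected pandas versions this raises rather than returning False.
    print(pdtypes.is_timedelta64_dtype(json_dtype))
except NotImplementedError:
    print("treat as 'not a timedelta'")  # the helper above returns False here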

bigframes/dtypes.py

Lines changed: 15 additions & 4 deletions
@@ -62,8 +62,9 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
-JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+JSON_ARROW_TYPE = (
+    pa.json_(pa.string()) if hasattr(pa, "JsonType") else db_dtypes.JSONArrowType()
+)
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
     pa.struct(
@@ -169,7 +170,7 @@ class SimpleDtypeInfo:
     ),
     SimpleDtypeInfo(
         dtype=JSON_DTYPE,
-        arrow_dtype=db_dtypes.JSONArrowType(),
+        arrow_dtype=JSON_ARROW_TYPE,
         type_kind=("JSON",),
         orderable=False,
         clusterable=False,
@@ -330,8 +331,18 @@ def is_struct_like(type_: ExpressionType) -> bool:
     )


+def is_json_arrow_type(type_: pa.DataType) -> bool:
+    return (hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType)) or (
+        not hasattr(pa, "JsonType") and isinstance(type_, db_dtypes.JSONArrowType)
+    )
+
+
+def is_json_type(type_: ExpressionType) -> bool:
+    return isinstance(type_, pd.ArrowDtype) and is_json_arrow_type(type_.pyarrow_dtype)
+
+
 def is_json_like(type_: ExpressionType) -> bool:
-    return type_ == JSON_DTYPE or type_ == STRING_DTYPE  # Including JSON string
+    return is_json_type(type_) or type_ == STRING_DTYPE  # Including JSON string


 def is_json_encoding_type(type_: ExpressionType) -> bool:
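
The hasattr probe is the version gate: PyArrow releases that ship the canonical JSON extension type expose pa.JsonType (constructed via pa.json_), while older releases fall back to the db_dtypes extension type. A sketch of the same check in isolation:

import pyarrow as pa

if hasattr(pa, "JsonType"):
    # Canonical JSON extension type; storage is plain utf8 strings.
    json_arrow_type = pa.json_(pa.string())
else:
    import db_dtypes  # type: ignore

    # Legacy extension type used before PyArrow shipped one.
    json_arrow_type = db_dtypes.JSONArrowType()

Either branch produces a type that is_json_arrow_type above accepts, so downstream code stays version-agnostic.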

bigframes/session/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -813,7 +813,12 @@ def _read_pandas_inline(
 ) -> Optional[dataframe.DataFrame]:
     import bigframes.dataframe as dataframe

-    if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
+    try:
+        if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
+            return None
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
         return None

     try:
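
The same pandas issue can also surface from DataFrame.memory_usage(deep=True) on frames holding the PyArrow JSON dtype, so the try/except conservatively routes such frames to the non-inline upload path. A sketch, assuming an affected pandas version:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame(
    {"j": ['{"a": 1}']}, dtype=pd.ArrowDtype(pa.json_(pa.string()))
)
try:
    print(df.memory_usage(deep=True).sum())
except NotImplementedError:
    print("size unknown; skip inlining")  # mirrors the fallback above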

tests/system/small/test_dataframe_io.py

Lines changed: 21 additions & 13 deletions
@@ -14,7 +14,6 @@

 from typing import Tuple

-import db_dtypes  # type:ignore
 import google.api_core.exceptions
 import pandas as pd
 import pandas.testing
@@ -307,10 +306,10 @@ def test_load_json_w_json_string_items(session):
         )
     ),
     """
-    df = session.read_gbq(sql, index_col="id")
-
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)

+    assert dtypes.is_json_type(df.dtypes["json_col"])
     assert df["json_col"][0] == '{"boolean":true}'
     assert df["json_col"][1] == '{"int":100}'
     assert df["json_col"][2] == '{"float":0.98}'
@@ -325,17 +324,24 @@

 def test_load_json_to_pandas_has_correct_result(session):
     df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
-    result = df.to_pandas()
+    assert dtypes.is_json_type(df.dtypes["json_col"])
+
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    result = df.to_pandas(allow_large_results=True)

     # These JSON strings are compatible with BigQuery's JSON storage,
     pd_df = pd.DataFrame(
         {"json_col": ['{"bar":true,"foo":10}']},
-        dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()),
+        dtype=dtypes.JSON_DTYPE,
     )
     pd_df.index = pd_df.index.astype("Int64")
-    pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes)
-    pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"])
+    assert dtypes.is_json_type(pd_df.dtypes["json_col"])
+
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_series_equal(
+        result["json_col"], pd_df["json_col"], check_exact=False
+    )


 def test_load_json_in_struct(session):
@@ -363,13 +369,14 @@ def test_load_json_in_struct(session):
         )
     ), 7),
     """
-    df = session.read_gbq(sql, index_col="id")
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)

     assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype)
     assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType)

     data = df["struct_col"].struct.field("data")
-    assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert dtypes.is_json_type(data.dtype)

     assert data[0] == '{"boolean":true}'
     assert data[1] == '{"int":100}'
@@ -400,14 +407,15 @@ def test_load_json_in_array(session):
             )
         ] AS array_col,
     """
-    df = session.read_gbq(sql, index_col="id")
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)

     assert isinstance(df.dtypes["array_col"], pd.ArrowDtype)
     assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType)

     data = df["array_col"].list
     assert data.len()[0] == 7
-    assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert dtypes.is_json_type(data[0].dtype)

     assert data[0][0] == '{"boolean":true}'
     assert data[1][0] == '{"int":100}'
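
Taken together, these tests sketch the intended user-facing flow. A condensed usage example (query text taken from the test above; until b/401630655 is resolved, materializing JSON results locally requires allow_large_results=True):

import bigframes.dtypes as dtypes
import bigframes.pandas as bpd

df = bpd.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
assert dtypes.is_json_type(df.dtypes["json_col"])

local = df.to_pandas(allow_large_results=True)
print(local["json_col"][0])  # '{"bar":true,"foo":10}'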

tests/system/small/test_series.py

Lines changed: 1 addition & 2 deletions
@@ -17,7 +17,6 @@
 import re
 import tempfile

-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import numpy
 from packaging.version import Version
@@ -384,9 +383,9 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):

 def test_get_column_w_json(json_df, json_pandas_df):
     series = json_df["json_col"]
+    assert dtypes.is_json_type(series.dtype)
     # Until b/401630655 is resolved, json not compatible with allow_large_results=False
     series_pandas = series.to_pandas(allow_large_results=True)
-    assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
     assert series_pandas.shape[0] == json_pandas_df.shape[0]

tests/system/small/test_session.py

Lines changed: 8 additions & 2 deletions
@@ -784,7 +784,11 @@ def test_read_pandas_json_dataframes(session, write_engine):

     if write_engine == "bigquery_streaming":
         expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64")
-    pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_frame_equal(
+        actual_result, expected_df, check_index_type=False, check_exact=False
+    )


 @pytest.mark.parametrize(
@@ -804,8 +808,10 @@ def test_read_pandas_json_series(session, write_engine):
     actual_result = session.read_pandas(
         expected_series, write_engine=write_engine
     ).to_pandas(allow_large_results=True)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_series_equal(
-        actual_result, expected_series, check_index_type=False
+        actual_result, expected_series, check_index_type=False, check_exact=False
     )