
Commit 195e530

feat: reading JSON data as the pyarrow JSON type when available
Parent: c3f51a2

8 files changed: 69 additions & 32 deletions

bigframes/core/array_value.py

Lines changed: 2 additions & 2 deletions
@@ -108,8 +108,8 @@ def from_table(
         raise ValueError("must set at most one of 'offests', 'primary_key'")
     if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
         msg = bfe.format_message(
-            "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
-            "is a preview feature and subject to change."
+            "JSON column interpretation as a PyArrow JSON extention type is a preview "
+            "feature and subject to change."
         )
         warnings.warn(msg, bfe.PreviewWarning)
     # define data source only for needed columns, this makes row-hashing cheaper
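
The warning category here is bigframes.exceptions.PreviewWarning, so callers who accept the preview behavior can silence it with the standard warnings machinery. A minimal sketch (the table name is hypothetical):

import warnings

import bigframes.exceptions as bfe
import bigframes.pandas as bpd

with warnings.catch_warnings():
    # Opt in to the preview JSON handling without the warning noise.
    warnings.simplefilter("ignore", category=bfe.PreviewWarning)
    df = bpd.read_gbq("my_dataset.table_with_json_col")  # hypothetical table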

bigframes/core/compile/ibis_types.py

Lines changed: 1 addition & 2 deletions
@@ -24,7 +24,6 @@
     dtype as python_type_to_ibis_type,
 )
 import bigframes_vendored.ibis.expr.types as ibis_types
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
 import pandas as pd
@@ -75,7 +74,7 @@
         IBIS_GEO_TYPE,
         gpd.array.GeometryDtype(),
     ),
-    (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
+    (ibis_dtypes.json, bigframes.dtypes.JSON_DTYPE),
 )

 BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {

bigframes/core/utils.py

Lines changed: 15 additions & 6 deletions
@@ -224,6 +224,15 @@ def timedelta_to_micros(
     raise TypeError(f"Unrecognized input type: {type(timedelta)}")


+def _is_timedelat64_dtype(dtype: dtypes.Dtype) -> bool:
+    try:
+        return pdtypes.is_timedelta64_dtype(dtype)
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
+        return False
+
+
 def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     """
     Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored.
@@ -234,11 +243,11 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     updated_columns = []

     for col in dataframe.columns:
-        if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
+        if _is_timedelat64_dtype(dataframe[col].dtype):
             dataframe[col] = dataframe[col].apply(timedelta_to_micros)
             updated_columns.append(col)

-    if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
+    if _is_timedelat64_dtype(dataframe.index.dtype):
         dataframe.index = dataframe.index.map(timedelta_to_micros)
         updated_columns.append(dataframe.index.name)

@@ -249,15 +258,15 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
     """
     Searches recursively for JSON array type within a PyArrow DataType.
     """
-    if arrow_type == dtypes.JSON_ARROW_TYPE:
-        return True
     if pa.types.is_list(arrow_type):
         return _search_for_nested_json_type(arrow_type.value_type)
     if pa.types.is_struct(arrow_type):
         for i in range(arrow_type.num_fields):
             if _search_for_nested_json_type(arrow_type.field(i).type):
                 return True
         return False
+    if dtypes.is_json_arrow_type(arrow_type):
+        return True
     return False


@@ -272,7 +281,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:

     for col in dataframe.columns:
         column_type = dataframe[col].dtype
-        if column_type == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(column_type):
             dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
             updated_columns.append(col)
         elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
@@ -283,7 +292,7 @@
             f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
         )

-    if dataframe.index.dtype == dtypes.JSON_DTYPE:
+    if dtypes.is_json_type(dataframe.index.dtype):
         dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
         updated_columns.append(dataframe.index.name)
     elif isinstance(
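
For context on the workaround: on pandas versions affected by pandas-dev/pandas#60958, dtype introspection helpers raise NotImplementedError when handed the PyArrow JSON extension dtype instead of returning False. A minimal reproduction sketch, assuming a PyArrow build that ships pa.json_:

import pandas as pd
import pandas.api.types as pdtypes
import pyarrow as pa

json_dtype = pd.ArrowDtype(pa.json_(pa.string()))
try:
    # On affected pandas versions this raises rather than returning False.
    print(pdtypes.is_timedelta64_dtype(json_dtype))
except NotImplementedError:
    print("treat as 'not a timedelta'")  # the helper above returns False here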

bigframes/dtypes.py

Lines changed: 15 additions & 4 deletions
@@ -62,8 +62,9 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
-JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+JSON_ARROW_TYPE = (
+    pa.json_(pa.string()) if hasattr(pa, "JsonType") else db_dtypes.JSONArrowType()
+)
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
     pa.struct(
@@ -169,7 +170,7 @@ class SimpleDtypeInfo:
     ),
     SimpleDtypeInfo(
         dtype=JSON_DTYPE,
-        arrow_dtype=db_dtypes.JSONArrowType(),
+        arrow_dtype=JSON_ARROW_TYPE,
         type_kind=("JSON",),
         orderable=False,
         clusterable=False,
@@ -330,8 +331,18 @@ def is_struct_like(type_: ExpressionType) -> bool:
     )


+def is_json_arrow_type(type_: pa.DataType) -> bool:
+    return (hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType)) or (
+        not hasattr(pa, "JsonType") and isinstance(type_, db_dtypes.JSONArrowType)
+    )
+
+
+def is_json_type(type_: ExpressionType) -> bool:
+    return isinstance(type_, pd.ArrowDtype) and is_json_arrow_type(type_.pyarrow_dtype)
+
+
 def is_json_like(type_: ExpressionType) -> bool:
-    return type_ == JSON_DTYPE or type_ == STRING_DTYPE  # Including JSON string
+    return is_json_type(type_) or type_ == STRING_DTYPE  # Including JSON string


 def is_json_encoding_type(type_: ExpressionType) -> bool:
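
The hasattr probe is the version gate: PyArrow releases that ship the canonical JSON extension type expose pa.JsonType (constructed via pa.json_), while older releases fall back to the db_dtypes extension type. A sketch of the same check in isolation:

import pyarrow as pa

if hasattr(pa, "JsonType"):
    # Canonical JSON extension type; storage is plain utf8 strings.
    json_arrow_type = pa.json_(pa.string())
else:
    import db_dtypes  # type: ignore

    # Legacy extension type used before PyArrow shipped one.
    json_arrow_type = db_dtypes.JSONArrowType()

Either branch produces a type that is_json_arrow_type above accepts, so downstream code stays version-agnostic.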

bigframes/session/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -813,7 +813,12 @@ def _read_pandas_inline(
 ) -> Optional[dataframe.DataFrame]:
     import bigframes.dataframe as dataframe

-    if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
+    try:
+        if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
+            return None
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
         return None

     try:
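
The same pandas issue can also surface from DataFrame.memory_usage(deep=True) on frames holding the PyArrow JSON dtype, so the try/except conservatively routes such frames to the non-inline upload path. A sketch, assuming an affected pandas version:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame(
    {"j": ['{"a": 1}']}, dtype=pd.ArrowDtype(pa.json_(pa.string()))
)
try:
    print(df.memory_usage(deep=True).sum())
except NotImplementedError:
    print("size unknown; skip inlining")  # mirrors the fallback above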

tests/system/small/test_dataframe_io.py

Lines changed: 21 additions & 13 deletions
@@ -14,7 +14,6 @@

 from typing import Tuple

-import db_dtypes  # type:ignore
 import google.api_core.exceptions
 import pandas as pd
 import pandas.testing
@@ -307,10 +306,10 @@ def test_load_json_w_json_string_items(session):
         )
     ),
     """
-    df = session.read_gbq(sql, index_col="id")
-
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)

+    assert dtypes.is_json_type(df.dtypes["json_col"])
     assert df["json_col"][0] == '{"boolean":true}'
     assert df["json_col"][1] == '{"int":100}'
     assert df["json_col"][2] == '{"float":0.98}'
@@ -325,17 +324,24 @@

 def test_load_json_to_pandas_has_correct_result(session):
     df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
-    result = df.to_pandas()
+    assert dtypes.is_json_type(df.dtypes["json_col"])
+
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    result = df.to_pandas(allow_large_results=True)

     # These JSON strings are compatible with BigQuery's JSON storage,
     pd_df = pd.DataFrame(
         {"json_col": ['{"bar":true,"foo":10}']},
-        dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()),
+        dtype=dtypes.JSON_DTYPE,
     )
     pd_df.index = pd_df.index.astype("Int64")
-    pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes)
-    pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"])
+    assert dtypes.is_json_type(pd_df.dtypes["json_col"])
+
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_series_equal(
+        result["json_col"], pd_df["json_col"], check_exact=False
+    )


 def test_load_json_in_struct(session):
@@ -363,13 +369,14 @@ def test_load_json_in_struct(session):
         )
     ), 7),
     """
-    df = session.read_gbq(sql, index_col="id")
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)

     assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype)
     assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType)

     data = df["struct_col"].struct.field("data")
-    assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert dtypes.is_json_type(data.dtype)

     assert data[0] == '{"boolean":true}'
     assert data[1] == '{"int":100}'
@@ -400,14 +407,15 @@ def test_load_json_in_array(session):
             )
         ] AS array_col,
     """
-    df = session.read_gbq(sql, index_col="id")
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)

     assert isinstance(df.dtypes["array_col"], pd.ArrowDtype)
     assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType)

     data = df["array_col"].list
     assert data.len()[0] == 7
-    assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert dtypes.is_json_type(data[0].dtype)

     assert data[0][0] == '{"boolean":true}'
     assert data[1][0] == '{"int":100}'
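
Taken together, these tests sketch the intended user-facing flow. A condensed usage example (query text taken from the test above; until b/401630655 is resolved, materializing JSON results locally requires allow_large_results=True):

import bigframes.dtypes as dtypes
import bigframes.pandas as bpd

df = bpd.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
assert dtypes.is_json_type(df.dtypes["json_col"])

local = df.to_pandas(allow_large_results=True)
print(local["json_col"][0])  # '{"bar":true,"foo":10}'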

tests/system/small/test_series.py

Lines changed: 1 addition & 2 deletions
@@ -17,7 +17,6 @@
 import re
 import tempfile

-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import numpy
 from packaging.version import Version
@@ -384,9 +383,9 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):

 def test_get_column_w_json(json_df, json_pandas_df):
     series = json_df["json_col"]
+    assert dtypes.is_json_type(series.dtype)
     # Until b/401630655 is resolved, json not compatible with allow_large_results=False
     series_pandas = series.to_pandas(allow_large_results=True)
-    assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
     assert series_pandas.shape[0] == json_pandas_df.shape[0]

tests/system/small/test_session.py

Lines changed: 8 additions & 2 deletions
@@ -784,7 +784,11 @@ def test_read_pandas_json_dataframes(session, write_engine):

     if write_engine == "bigquery_streaming":
         expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64")
-    pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_frame_equal(
+        actual_result, expected_df, check_index_type=False, check_exact=False
+    )


 @pytest.mark.parametrize(
@@ -804,8 +808,10 @@ def test_read_pandas_json_series(session, write_engine):
     actual_result = session.read_pandas(
         expected_series, write_engine=write_engine
     ).to_pandas(allow_large_results=True)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_series_equal(
-        actual_result, expected_series, check_index_type=False
+        actual_result, expected_series, check_index_type=False, check_exact=False
     )