diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
index e38c43e73e..684290bf81 100644
--- a/bigframes/core/utils.py
+++ b/bigframes/core/utils.py
@@ -18,10 +18,12 @@ from typing import Hashable, Iterable, List
 import warnings
 
+import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.io.common as vendored_pandas_io_common
 import numpy as np
 import pandas as pd
 import pandas.api.types as pdtypes
+import pyarrow as pa
 import typing_extensions
 
 import bigframes.dtypes as dtypes
@@ -243,6 +245,22 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     return updated_columns
 
 
+def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
+    """
+    Recursively searches for the JSON Arrow extension type within a PyArrow DataType.
+    """
+    if arrow_type == dtypes.JSON_ARROW_TYPE:
+        return True
+    if pa.types.is_list(arrow_type):
+        return _search_for_nested_json_type(arrow_type.value_type)
+    if pa.types.is_struct(arrow_type):
+        for i in range(arrow_type.num_fields):
+            if _search_for_nested_json_type(arrow_type.field(i).type):
+                return True
+        return False
+    return False
+
+
 def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
     """
     Due to a BigQuery IO limitation with loading JSON from Parquet files (b/374784249),
@@ -253,12 +271,27 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
 
     updated_columns = []
     for col in dataframe.columns:
-        if dataframe[col].dtype == dtypes.JSON_DTYPE:
+        column_type = dataframe[col].dtype
+        if column_type == dtypes.JSON_DTYPE:
             dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
             updated_columns.append(col)
+        elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
+            column_type.pyarrow_dtype
+        ):
+            raise NotImplementedError(
+                f"Nested JSON types, found in column `{col}`: `{column_type}`, "
+                f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
+            )
 
     if dataframe.index.dtype == dtypes.JSON_DTYPE:
         dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
         updated_columns.append(dataframe.index.name)
+    elif isinstance(
+        dataframe.index.dtype, pd.ArrowDtype
+    ) and _search_for_nested_json_type(dataframe.index.dtype.pyarrow_dtype):
+        raise NotImplementedError(
+            f"Nested JSON types, found in the index: `{dataframe.index.dtype}`, "
+            f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
+        )
 
     return updated_columns
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 323d002df4..663e5e2f10 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -26,6 +26,8 @@ import google.cloud.bigquery as bigquery
 import numpy as np
 import pandas as pd
+import pandas.arrays as arrays
+import pyarrow as pa
 import pytest
 
 import bigframes
@@ -829,6 +831,68 @@ def test_read_pandas_json_index(session, write_engine):
     pd.testing.assert_index_equal(actual_result, expected_index)
 
 
+@pytest.mark.parametrize(
+    ("write_engine"),
+    [
+        pytest.param("default"),
+        pytest.param("bigquery_load"),
+    ],
+)
+def test_read_pandas_w_nested_json(session, write_engine):
+    data = [
+        [{"json_field": "1"}],
+        [{"json_field": None}],
+        [{"json_field": '["1","3","5"]'}],
+        [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
+    ]
+    # PyArrow currently lacks support for creating structs or lists containing extension types.
+    # See issue: https://github.com/apache/arrow/issues/45262
+    pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())])))
+    pd_s = pd.Series(
+        arrays.ArrowExtensionArray(pa_array),  # type: ignore
+        dtype=pd.ArrowDtype(
+            pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)]))
+        ),
+    )
+    with pytest.raises(NotImplementedError, match="Nested JSON types, found in column"):
+        # Until b/401630655 is resolved, JSON is not compatible with allow_large_results=False.
+        session.read_pandas(pd_s, write_engine=write_engine).to_pandas(
+            allow_large_results=True
+        )
+
+
+@pytest.mark.parametrize(
+    ("write_engine"),
+    [
+        pytest.param("default"),
+        pytest.param("bigquery_load"),
+    ],
+)
+def test_read_pandas_w_nested_json_index(session, write_engine):
+    data = [
+        [{"json_field": "1"}],
+        [{"json_field": None}],
+        [{"json_field": '["1","3","5"]'}],
+        [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
+    ]
+    # PyArrow currently lacks support for creating structs or lists containing extension types.
+    # See issue: https://github.com/apache/arrow/issues/45262
+    pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())])))
+    pd_idx: pd.Index = pd.Index(
+        arrays.ArrowExtensionArray(pa_array),  # type: ignore
+        dtype=pd.ArrowDtype(
+            pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)]))
+        ),
+    )
+    with pytest.raises(
+        NotImplementedError, match="Nested JSON types, found in the index"
+    ):
+        # Until b/401630655 is resolved, JSON is not compatible with allow_large_results=False.
+        session.read_pandas(pd_idx, write_engine=write_engine).to_pandas(
+            allow_large_results=True
+        )
+
+
 @utils.skip_legacy_pandas
 @pytest.mark.parametrize(
     ("write_engine",),
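
For reviewers, a minimal sketch of what the new guard does and does not flag, assuming
this branch of bigframes is importable (`payload` is an arbitrary field name chosen for
illustration; `_search_for_nested_json_type` is the private helper added above):

    import pyarrow as pa

    import bigframes.dtypes
    from bigframes.core import utils

    # JSON nested inside list<struct<...>> is found recursively, so
    # replace_json_with_string raises NotImplementedError for such columns.
    nested = pa.list_(pa.struct([("payload", bigframes.dtypes.JSON_ARROW_TYPE)]))
    print(utils._search_for_nested_json_type(nested))  # True

    # The same shape without JSON passes the check and uploads as before.
    plain = pa.list_(pa.struct([("payload", pa.string())]))
    print(utils._search_for_nested_json_type(plain))  # False

Top-level JSON columns are unaffected: they still take the existing cast-to-string
fallback in replace_json_with_string rather than raising.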