Description
Describe the bug, including details regarding any error messages, version, and platform.
Simple repro:
import pyarrow as pa
import pandas as pd

# Column backed by an Arrow list<double> array via pd.ArrowDtype.
s = pa.array(pd.Series([[2.2] * 5] * 10)).to_pandas(types_mapper=pd.ArrowDtype)
pd.DataFrame({"x": s}).to_parquet("/tmp/list4.pqt")
df2 = pd.read_parquet("/tmp/list4.pqt", dtype_backend="pyarrow")
Fails with
File ~/<redacted>/lib/python3.11/site-packages/pandas/io/parquet.py:667, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
664 use_nullable_dtypes = False
665 check_dtype_backend(dtype_backend)
--> 667 return impl.read(
668 path,
669 columns=columns,
670 filters=filters,
671 storage_options=storage_options,
672 use_nullable_dtypes=use_nullable_dtypes,
673 dtype_backend=dtype_backend,
674 filesystem=filesystem,
675 **kwargs,
676 )
File ~/<redacted>/lib/python3.11/site-packages/pandas/io/parquet.py:281, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
273 try:
274 pa_table = self.api.parquet.read_table(
275 path_or_handle,
276 columns=columns,
(...)
279 **kwargs,
280 )
--> 281 result = pa_table.to_pandas(**to_pandas_kwargs)
283 if manager == "array":
284 result = result._as_manager("array", copy=False)
File ~/<redacted>/lib/python3.11/site-packages/pyarrow/array.pxi:884, in pyarrow.lib._PandasConvertible.to_pandas()
File ~/<redacted>/lib/python3.11/site-packages/pyarrow/table.pxi:4251, in pyarrow.lib.Table._to_pandas()
File ~/<redacted>/lib/python3.11/site-packages/pyarrow/pandas_compat.py:769, in table_to_dataframe(options, table, categories, ignore_metadata, types_mapper)
766 table = _add_any_metadata(table, pandas_metadata)
767 table, index = _reconstruct_index(table, index_descriptors,
768 all_columns, types_mapper)
--> 769 ext_columns_dtypes = _get_extension_dtypes(
770 table, all_columns, types_mapper)
771 else:
772 index = _pandas_api.pd.RangeIndex(table.num_rows)
File ~/<redacted>/lib/python3.11/site-packages/pyarrow/pandas_compat.py:828, in _get_extension_dtypes(table, columns_metadata, types_mapper)
823 dtype = col_meta['numpy_type']
825 if dtype not in _pandas_supported_numpy_types:
826 # pandas_dtype is expensive, so avoid doing this for types
827 # that are certainly numpy dtypes
--> 828 pandas_dtype = _pandas_api.pandas_dtype(dtype)
829 if isinstance(pandas_dtype, _pandas_api.extension_dtype):
830 if hasattr(pandas_dtype, "__from_arrow__"):
File ~/<redacted>/lib/python3.11/site-packages/pyarrow/pandas-shim.pxi:141, in pyarrow.lib._PandasAPIShim.pandas_dtype()
File ~/<redacted>/lib/python3.11/site-packages/pyarrow/pandas-shim.pxi:144, in pyarrow.lib._PandasAPIShim.pandas_dtype()
File ~/<redacted>/lib/python3.11/site-packages/pandas/core/dtypes/common.py:1630, in pandas_dtype(dtype)
1625 with warnings.catch_warnings():
1626 # GH#51523 - Series.astype(np.integer) doesn't show
1627 # numpy deprecation warning of np.integer
1628 # Hence enabling DeprecationWarning
1629 warnings.simplefilter("always", DeprecationWarning)
-> 1630 npdtype = np.dtype(dtype)
1631 except SyntaxError as err:
1632 # np.dtype uses `eval` which can raise SyntaxError
1633 raise TypeError(f"data type '{dtype}' not understood") from err
TypeError: data type 'list<item: double>[pyarrow]' not understood
Environment:
OS: macOS (Darwin Kernel Version 22.1.0)
Python: 3.11.6
pandas: 2.2.0
PyArrow: 15.0.0
The same error is raised even if we use pd.read_parquet("/tmp/list4.pqt", dtype_backend="numpy_nullable").
The non-Arrow-backed version of the column,

import pyarrow as pa
import pandas as pd

# Same data, but as a plain object-dtype column of Python lists.
pd.DataFrame({"x": pd.Series([[2.2] * 5] * 10)}).to_parquet("/tmp/list2.pqt")
df2 = pd.read_parquet("/tmp/list2.pqt", dtype_backend="pyarrow")

is read back correctly. However, with dtype_backend="pyarrow" the column in the new dataframe is Arrow-backed, so it does not survive a further round trip; see the sketch below.
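In other words (a minimal sketch; /tmp/list3.pqt is a hypothetical path used only for illustration):

# df2's "x" column now has an Arrow-backed list dtype, so writing it out
# recreates the failing file from the first repro.
df2.to_parquet("/tmp/list3.pqt")
pd.read_parquet("/tmp/list3.pqt", dtype_backend="pyarrow")  # raises the same TypeError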
I did some further digging into the Parquet pandas metadata (one way to dump it is sketched after the two listings below) and found that, for the Parquet file written from the Arrow-backed table, we have:
{'index_columns': [{'kind': 'range',
'name': None,
'start': 0,
'stop': 10,
'step': 1}],
'column_indexes': [{'name': None,
'field_name': None,
'pandas_type': 'unicode',
'numpy_type': 'object',
'metadata': {'encoding': 'UTF-8'}}],
'columns': [{'name': 'x',
'field_name': 'x',
'pandas_type': 'list[float64]',
'numpy_type': 'list<element: double>[pyarrow]',
'metadata': None}],
'creator': {'library': 'pyarrow', 'version': '15.0.0'},
'pandas_version': '2.2.0'}
whereas for the numpy-based dataframe, the output is:
{'index_columns': [{'kind': 'range',
'name': None,
'start': 0,
'stop': 10,
'step': 1}],
'column_indexes': [{'name': None,
'field_name': None,
'pandas_type': 'unicode',
'numpy_type': 'object',
'metadata': {'encoding': 'UTF-8'}}],
'columns': [{'name': 'x',
'field_name': 'x',
'pandas_type': 'list[float64]',
'numpy_type': 'object',
'metadata': None}],
'creator': {'library': 'pyarrow', 'version': '15.0.0'},
'pandas_version': '2.2.0'}
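For reference, a minimal sketch of how this embedded metadata can be inspected, using pyarrow's Schema.pandas_metadata accessor:

import pyarrow.parquet as pq

# Deserialize the JSON stored under the b"pandas" key of the schema metadata.
pq.read_schema("/tmp/list4.pqt").pandas_metadata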
The problem seems to be that, in the Arrow-backed case, numpy_type is set to the ArrowDtype string 'list<element: double>[pyarrow]' rather than to 'object' or a NumPy dtype string; on read, pandas_dtype cannot parse that string and raises the TypeError above.
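If I'm reading the traceback correctly, the failure can be reproduced in isolation from the metadata string alone (a sketch, assuming pandas 2.2 with pyarrow installed):

import pandas as pd

# A flat ArrowDtype string parses back into an ArrowDtype...
pd.api.types.pandas_dtype("double[pyarrow]")  # -> double[pyarrow]

# ...but the nested list string stored in numpy_type does not: pandas falls
# through to np.dtype(), producing the TypeError seen in the traceback.
pd.api.types.pandas_dtype("list<element: double>[pyarrow]")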
Component(s)
Parquet, Python