diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index d701208792a4c..a468403e9b261 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -40,6 +40,8 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
 {'index_columns': [<descr0>, <descr1>, ...],
  'column_indexes': [<ci0>, <ci1>, ..., <ciN>],
  'columns': [<c0>, <c1>, ...],
+ 'attrs': {...},
+ 'column_attrs': {<c0>: {...}, <c1>: {...}, ...},
  'pandas_version': $VERSION,
  'creator': {
    'library': $LIBRARY,
@@ -185,3 +187,49 @@ As an example of fully-formed metadata:
      'library': 'pyarrow',
      'version': '0.13.0'
  }}
+
+
+Attribute metadata
+~~~~~~~~~~~~~~~~~~
+
+.. warning:: This only works with the ``pyarrow`` engine as of ``pandas`` 1.3.
+
+The attributes of both the ``DataFrame`` and each ``Series`` are written to and read
+back from the file using:
+
+- :attr:`DataFrame.attrs`
+- :attr:`Series.attrs`
+
+Here is an example:
+
+.. code-block:: python
+
+    df = pd.DataFrame({"a": [1], "b": [1]})
+    df.attrs = {"name": "my custom dataset"}
+    df.a.attrs = {
+        "long_name": "Description about data",
+        "nodata": -1,
+        "units": "metre",
+    }
+    df.to_parquet("file.parquet")
+
+
+Here is an example of the resulting metadata:
+
+.. code-block:: text
+
+    {
+     ...
+     'attrs': {'name': 'my custom dataset'},
+     'column_attrs': {
+         'a': {
+             'long_name': 'Description about data',
+             'nodata': -1,
+             'units': 'metre',
+         },
+     },
+     'pandas_version': '1.3.0',
+     'creator': {
+       'library': 'pyarrow',
+       'version': '0.13.0'
+     }}
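
A quick illustration of where the new ``attrs`` and ``column_attrs`` keys documented
above end up: they live inside the ``pandas`` entry of the Parquet schema metadata, so
(assuming this patch is applied) they can be inspected with pyarrow's ``read_schema``.
This sketch is not part of the patch, and the file name is illustrative.

.. code-block:: python

    import json

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({"a": [1], "b": [1]})
    df.attrs = {"name": "my custom dataset"}
    df.a.attrs = {"units": "metre"}
    df.to_parquet("file.parquet")  # pyarrow engine stores attrs in the schema metadata

    # The pandas metadata is stored as JSON under the b"pandas" key of the schema.
    pandas_metadata = json.loads(pq.read_schema("file.parquet").metadata[b"pandas"])
    print(pandas_metadata["attrs"])         # {'name': 'my custom dataset'}
    print(pandas_metadata["column_attrs"])  # {'a': {'units': 'metre'}}
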
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index b36499c340fd9..c30f1b8e0cf21 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -234,6 +234,7 @@ Other enhancements
 - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
 - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`)
+- :attr:`DataFrame.attrs` and :attr:`Series.attrs` are now written to and read back from Parquet files when using the ``pyarrow`` engine (:issue:`20521`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index bccf3c3f1011b..57fd7c61d6bd7 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import io
+import json
 import os
 from typing import (
     Any,
@@ -142,6 +143,44 @@ def read(self, path, columns=None, **kwargs):
         raise AbstractMethodError(self)
 
 
+def _pyarrow_write_attrs(table: Any, df: DataFrame) -> Any:
+    """
+    .. versionadded:: 1.3
+
+    Copy attrs from pandas.DataFrame and pandas.Series to
+    schema metadata in pyarrow.Table.
+    """
+    schema_metadata = table.schema.metadata or {}
+    pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
+    column_attrs = {}
+    for col in df.columns:
+        attrs = df[col].attrs
+        if not attrs or not isinstance(col, str):
+            continue
+        column_attrs[col] = attrs
+    pandas_metadata.update(
+        attrs=df.attrs,
+        column_attrs=column_attrs,
+    )
+    schema_metadata[b"pandas"] = json.dumps(pandas_metadata)
+    return table.replace_schema_metadata(schema_metadata)
+
+
+def _pyarrow_read_attrs(table: Any, df: DataFrame) -> None:
+    """
+    .. versionadded:: 1.3
+
+    Copy schema metadata from pyarrow.Table
+    to attrs in pandas.DataFrame and pandas.Series.
+    """
+    schema_metadata = table.schema.metadata or {}
+    pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
+    df.attrs = pandas_metadata.get("attrs", {})
+    col_attrs = pandas_metadata.get("column_attrs", {})
+    for col in df.columns:
+        df[col].attrs = col_attrs.get(col, {})
+
+
 class PyArrowImpl(BaseImpl):
     def __init__(self):
         import_optional_dependency(
@@ -171,6 +210,7 @@ def write(
             from_pandas_kwargs["preserve_index"] = index
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        table = _pyarrow_write_attrs(table, df)
 
         path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
             path,
@@ -236,9 +276,11 @@ def read(
                 mode="rb",
             )
         try:
-            result = self.api.parquet.read_table(
+            table = self.api.parquet.read_table(
                 path_or_handle, columns=columns, **kwargs
-            ).to_pandas(**to_pandas_kwargs)
+            )
+            result = table.to_pandas(**to_pandas_kwargs)
+            _pyarrow_read_attrs(table, result)
             if manager == "array":
                 result = result._as_manager("array", copy=False)
             return result
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index ae6425cd93ac5..e58b9c0c6e09d 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -907,6 +907,39 @@ def test_read_parquet_manager(self, pa, using_array_manager):
         else:
             assert isinstance(result._mgr, pd.core.internals.BlockManager)
 
+    @td.skip_if_no("pyarrow")
+    def test_read_write_attrs(self, pa):
+        df = pd.DataFrame({"a": [1], "b": [1]})
+        df.attrs = {"name": "my custom dataset"}
+        df.a.attrs = {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+        df.b.attrs = {}
+        with tm.ensure_clean() as path:
+            df.to_parquet(path)
+            result = read_parquet(path)
+
+        assert result.attrs == {"name": "my custom dataset"}
+        assert result.a.attrs == {
+            "long_name": "Description about data",
+            "nodata": -1,
+            "units": "metre",
+        }
+        assert result.b.attrs == {}
+
+    @td.skip_if_no("pyarrow")
+    def test_read_write_attrs__invalid(self, pa):
+        df = pd.DataFrame({"a": [1], "b": [1]})
+        df.attrs = {-1: np.array(1)}
+        df.a.attrs = {-1: np.array(1)}
+        df.b.attrs = {}
+        with tm.ensure_clean() as path, pytest.raises(
+            TypeError, match="not JSON serializable"
+        ):
+            df.to_parquet(path)
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
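
Because the attrs are serialized with ``json.dumps`` when the table is written, only
JSON-serializable values survive the round trip; anything else (for example a NumPy
array, as exercised by ``test_read_write_attrs__invalid`` above) raises a ``TypeError``
at write time. Below is a minimal sketch of that behaviour, assuming the patch is
applied; the file names are illustrative and not part of the patch.

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": [1]})

    # JSON-serializable attrs round-trip through the file.
    df.attrs = {"name": "my custom dataset"}
    df.a.attrs = {"nodata": -1, "units": "metre"}
    df.to_parquet("ok.parquet")
    assert pd.read_parquet("ok.parquet").a.attrs["units"] == "metre"

    # Non-serializable values fail when the metadata is dumped to JSON.
    df.a.attrs = {"nodata": np.array(1)}
    try:
        df.to_parquet("bad.parquet")
    except TypeError as err:
        print(err)  # "... is not JSON serializable"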