-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
REF/ENH: Refactor NDFrame finalization #28334
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
8370e39
d7bb99c
60bc89c
b05782c
53576eb
3009732
d68e5bb
710d73a
ecf3989
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
""" | ||
Metadata propagation through pandas operations. | ||
|
||
This module contains the infrastructure for propagating ``NDFrame._metadata`` | ||
through operations. We perform an operation (say :meth:`pandas.Series.copy`) that | ||
returns an ``NDFrame`` and would like to propagate the metadata (say ``Series.name``) | ||
from ``self`` to the new ``NDFrame``. | ||
|
||
.. note:: | ||
|
||
Currently, pandas doesn't provide a clean, documented API on | ||
|
||
* which methods call finalize | ||
* the types passed to finalize for each method | ||
|
||
This is a known limitation we would like to address in the future. | ||
""" | ||
from collections import defaultdict | ||
from functools import wraps | ||
from typing import TYPE_CHECKING, Any, Callable, Union | ||
|
||
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries | ||
|
||
if TYPE_CHECKING: | ||
from pandas.core.generic import NDFrame | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you want `FrameOrSeries` from `pandas._typing`. |
||
|
||
# Registry of finalizers. The outer key is ``key_of(pandas_method)``; the inner
# dict maps a metadata attribute name to the finalizer registered for it.
dispatch = defaultdict(dict)
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# A pandas method may be identified either by the callable itself or by a
# plain string name.
dispatch_method_type = Union[Callable[..., "NDFrame"], str]
|
||
|
||
def key_of(method):
    """
    Compute the key under which finalizers for ``method`` are stored.

    Parameters
    ----------
    method : callable, str or None
        The pandas method being finalized.

    Returns
    -------
    str, tuple of (str, str), or None
        Strings are returned unchanged. Any other truthy object is identified
        by its ``(__module__, __name__)`` pair. Falsy non-string values
        (such as ``None``) give ``None``.
    """
    # TODO: figure out if this is OK. May be necessary when we have
    # things like pd.merge and DataFrame.merge that hit the same finalize.
    if isinstance(method, str):
        return method
    if not method:
        return None
    return method.__module__, method.__name__
|
||
|
||
class PandasMetadata:
    """
    Dispatch metadata finalization for pandas metadata.

    Users should instantiate a single `PandasMetadata` instance
    for their piece of metadata and register finalizers for various
    pandas methods using :meth:`PandasMetadata.register`.

    Parameters
    ----------
    name : str
        The name of the attribute being finalized.

    Examples
    --------
    >>> maxmeta = PandasMetadata("attr")

    Register a finalizer for a given pandas method:

    >>> @maxmeta.register(pd.concat)
    ... def _(new, concatenator):
    ...     new.attr = max(x.attr for x in concatenator.objs)

    >>> pd.DataFrame._metadata = ['attr']
    >>> x = pd.DataFrame({"x": [1]}); x.attr = 1
    >>> y = pd.DataFrame({"y": [2]}); y.attr = 2
    >>> pd.concat([x, y]).attr
    2
    """

    def __init__(self, name: str):
        self.name = name

    def register(self, pandas_method: dispatch_method_type):
        """
        A decorator to register a finalizer for a specific pandas method.

        One finalizer is stored per (pandas method, attribute name) pair, so
        different attributes can be finalized differently for the same pandas
        method. Registering a second finalizer for the same pair silently
        replaces the first.

        Parameters
        ----------
        pandas_method : callable or str
            A pandas method, like :meth:`pandas.concat`, that this finalizer
            should be used for. The function being decorated will be called
            with the relevant arguments (typically the output and the source
            NDFrame). When `NDFrame.__finalize__` is called as a result of
            `pandas_method`, the registered finalizer will be called.

        Returns
        -------
        callable
            A decorator. It stores the decorated function in the module-level
            ``dispatch`` registry and returns a thin wrapper around it.
        """

        def decorate(func):
            # TODO: warn of collisions?
            dispatch[key_of(pandas_method)][self.name] = func

            # The registry holds ``func`` itself; the wrapper is only what the
            # decorated name ends up bound to.
            @wraps(func)
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)

            return wrapper

        return decorate
|
||
|
||
def default_finalizer(new: "NDFrame", other: Any, *, name: str):
    """
    The default finalizer when this method, attribute hasn't been overridden.

    Copies the attribute called ``name`` from ``other`` onto ``new``,
    modifying ``new`` inplace. Nothing is returned.

    Parameters
    ----------
    new : NDFrame
        The newly created NDFrame being finalized.
    other : NDFrame
        The source NDFrame attributes will be extracted from.
    name : str
        The metadata attribute to copy. If ``other`` has no such attribute,
        ``None`` is stored on ``new``.
    """
    value = getattr(other, name, None)
    # Call ``object.__setattr__`` directly so that any ``__setattr__``
    # defined on the class of ``new`` is not invoked.
    object.__setattr__(new, name, value)
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# Pandas Internals. | ||
|
||
|
||
def ndframe_finalize(new: "NDFrame", other: Any, method: dispatch_method_type):
    """
    Finalize a new NDFrame.

    For every name in ``new._metadata`` the finalizer is looked up from the
    finalizers registered with PandasMetadata. `new` is modified inplace, and
    nothing is returned. Returning nothing is a deliberate style choice: all
    of these operations are inplace.

    Parameters
    ----------
    new : NDFrame
        The object being finalized.
    other : NDFrame
        The source object. Most of the time this is an NDFrame, but in some
        cases (e.g. concat) it is a ``_Concatenator`` object. Whether a list
        of NDFrames should also be accepted is still TBD.
    method : callable or str
        The pandas method that produced ``new``; it selects which registered
        finalizers apply.
    """
    # To avoid one isinstance per _metadata name, we check up front.
    # Most of the time `other` is an ndframe, but in some cases (e.g. concat)
    # it's `_Concatenator` object
    other_is_ndframe = isinstance(other, (ABCSeries, ABCDataFrame))

    # The finalizers registered for ``method`` do not depend on the attribute
    # name, so fetch them once. ``.get`` avoids inserting an empty entry into
    # the ``dispatch`` defaultdict for methods nobody registered.
    finalizers = dispatch.get(key_of(method), {})

    for name in new._metadata:
        finalizer = finalizers.get(name)

        if finalizer:
            finalizer(new, other)
        elif other_is_ndframe:
            # The default only knows how to copy attributes from an NDFrame.
            default_finalizer(new, other, name=name)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import pytest | ||
|
||
import pandas as pd | ||
from pandas.core._meta import PandasMetadata | ||
|
||
# The metadata handler for the custom ``attr`` attribute; the finalizers in
# this module are registered on it.
mymeta = PandasMetadata(name="attr")
|
||
|
||
@mymeta.register(pd.core.generic.NDFrame.copy)
def _(new, other):
    # Finalizer registered for ``NDFrame.copy``: the result's ``attr`` is one
    # more than the source's.
    source_value = other.attr
    new.attr = source_value + 1
|
||
|
||
@mymeta.register("concat")
def _(new, other):
    # Finalizer registered under the string key "concat": the result's
    # ``attr`` is the sum of ``attr`` over the concatenated objects.
    concatenator_type = pd.core.reshape.concat._Concatenator
    assert isinstance(other, concatenator_type)

    total = 0
    for obj in other.objs:
        total = total + obj.attr
    new.attr = total
|
||
|
||
@pytest.fixture
def custom_meta(monkeypatch):
    """
    Add ``"attr"`` to ``_metadata`` of Series and DataFrame for one test.

    ``monkeypatch`` restores the original ``_metadata`` attributes on
    teardown, so no manual bookkeeping of the previous values is needed.
    """
    for cls in [pd.Series, pd.DataFrame]:
        # Patch in a fresh list so the class's original list is never mutated.
        custom_metadata = cls._metadata.copy()
        custom_metadata.append("attr")

        monkeypatch.setattr(cls, "_metadata", custom_metadata)
|
||
|
||
def test_custom_finalizer(custom_meta):
    # ``copy`` goes through the finalizer registered for ``NDFrame.copy``,
    # which adds one to the source's ``attr``.
    frame = pd.DataFrame({"A": [1, 2]})
    frame.attr = 0

    copied = frame.copy()

    assert copied.attr == 1
|
||
|
||
def test_concat(custom_meta):
    # The "concat" finalizer sums ``attr`` over the inputs: 2 + 3 == 5.
    left = pd.DataFrame({"A": [1, 2]})
    left.attr = 2

    right = pd.DataFrame({"A": [1, 2]})
    right.attr = 3

    combined = pd.concat([left, right])

    assert combined.attr == 5
Uh oh!
There was an error while loading. Please reload this page.