Skip to content

CLN GH23123 Move SparseArray to arrays #23147

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 16, 2018
Merged
2 changes: 1 addition & 1 deletion pandas/api/extensions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
register_index_accessor,
register_series_accessor)
from pandas.core.algorithms import take # noqa
from pandas.core.arrays.base import (ExtensionArray, # noqa
from pandas.core.arrays import (ExtensionArray, # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, should have reverted this too :/ Oh well.

ExtensionScalarOpsMixin)
from pandas.core.dtypes.dtypes import ( # noqa
ExtensionDtype, register_extension_dtype
Expand Down
3 changes: 2 additions & 1 deletion pandas/compat/numpy/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
"""

from numpy import ndarray
from pandas.util._validators import (validate_args, validate_kwargs,
from pandas.util._validators import (validate_args,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you are changing unrelated things (even formatting)?

validate_kwargs,
validate_args_and_kwargs)
from pandas.errors import UnsupportedFunctionCall
from pandas.core.dtypes.common import is_integer, is_bool
Expand Down
6 changes: 4 additions & 2 deletions pandas/compat/pickle_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def load_reduce(self):

# If classes are moved, provide compat here.
_class_locations_map = {
('pandas.core.sparse.array', 'SparseArray'):
('pandas.core.arrays', 'SparseArray'),

# 15477
('pandas.core.base', 'FrozenNDArray'):
Expand Down Expand Up @@ -88,7 +90,7 @@ def load_reduce(self):

# 15998 top-level dirs moving
('pandas.sparse.array', 'SparseArray'):
('pandas.core.sparse.array', 'SparseArray'),
('pandas.core.arrays.sparse', 'SparseArray'),
('pandas.sparse.series', 'SparseSeries'):
('pandas.core.sparse.series', 'SparseSeries'),
('pandas.sparse.frame', 'SparseDataFrame'):
Expand All @@ -112,7 +114,7 @@ def load_reduce(self):

# 19269, arrays moving
('pandas.core.categorical', 'Categorical'):
('pandas.core.arrays', 'Categorical'),
('pandas.core.arrays.categorical', 'Categorical'),

# 19939, add timedeltaindex, float64index compat from 15998 move
('pandas.tseries.tdi', 'TimedeltaIndex'):
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
from .timedeltas import TimedeltaArrayMixin # noqa
from .integer import ( # noqa
IntegerArray, integer_array)
from .sparse import SparseArray # noqa
4 changes: 2 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ def _add_delta(self, delta):
The result's name is set outside of _add_delta by the calling
method (__add__ or __sub__)
"""
from pandas.core.arrays.timedeltas import TimedeltaArrayMixin
from pandas.core.arrays import TimedeltaArrayMixin
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Were these changes here before mine? I may revert them.


if isinstance(delta, (Tick, timedelta, np.timedelta64)):
new_values = self._add_delta_td(delta)
Expand Down Expand Up @@ -818,7 +818,7 @@ def to_period(self, freq=None):
pandas.PeriodIndex: Immutable ndarray holding ordinal values
pandas.DatetimeIndex.to_pydatetime: Return DatetimeIndex as object
"""
from pandas.core.arrays.period import PeriodArrayMixin
from pandas.core.arrays import PeriodArrayMixin

if self.tz is not None:
warnings.warn("Converting to PeriodArray/Index representation "
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
is_integer_dtype,
is_object_dtype,
is_list_like)
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.arrays import (ExtensionArray,
ExtensionOpsMixin)
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna, notna
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from pandas.util._decorators import Appender
from pandas.util._doctools import _WritableDoc

from . import ExtensionArray, Categorical
from pandas.core.arrays import ExtensionArray, Categorical

_VALID_CLOSED = {'left', 'right', 'both', 'neither'}
_interval_shared_docs = {}
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def to_timestamp(self, freq=None, how='start'):
-------
DatetimeArray/Index
"""
from pandas.core.arrays.datetimes import DatetimeArrayMixin
from pandas.core.arrays import DatetimeArrayMixin

how = libperiod._validate_end_alias(how)

Expand Down
252 changes: 250 additions & 2 deletions pandas/core/sparse/array.py → pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import division
# pylint: disable=E1101,E1103,W0231

import re
import operator
import numbers
import numpy as np
Expand All @@ -16,8 +17,10 @@
from pandas.errors import PerformanceWarning
from pandas.compat.numpy import function as nv

from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import (
ABCSparseSeries, ABCSeries, ABCIndexClass
)
Expand Down Expand Up @@ -45,7 +48,252 @@
import pandas.core.algorithms as algos
import pandas.io.formats.printing as printing

from pandas.core.sparse.dtype import SparseDtype

# ----------------------------------------------------------------------------
# Dtype

@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.
    """
    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')

    def __init__(self, dtype=np.float64, fill_value=None):
        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
        # Imported lazily, as in the rest of this class.
        from pandas.core.dtypes.missing import na_value_for_dtype
        from pandas.core.dtypes.common import (
            pandas_dtype, is_string_dtype, is_scalar
        )

        if isinstance(dtype, type(self)):
            # Unwrap an existing SparseDtype, inheriting its fill value
            # unless the caller supplied one explicitly.
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        if is_string_dtype(dtype):
            # String-like subtypes are stored as object dtype.
            dtype = np.dtype('object')

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError("fill_value must be a scalar. Got {} "
                             "instead".format(fill_value))
        self._dtype = dtype
        self._fill_value = fill_value

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super(SparseDtype, self).__hash__()

    def __eq__(self, other):
        """
        Compare with another SparseDtype, or a string describing one.

        Two dtypes are equal when their subtypes are equal and their fill
        values match. NA fill values match only when *both* are NA and one
        fill value's type is an instance of the other's type.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if not isinstance(other, type(self)):
            # Only strings can be coerced; anything else is unequal.
            if not isinstance(other, compat.string_types):
                return False
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False
            if not isinstance(other, type(self)):
                return False

        subtype = self.subtype == other.subtype
        if self._is_na_fill_value:
            # this case is complicated by two things:
            # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
            # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
            # i.e. we want to treat any floating-point NaN as equal, but
            # not a floating-point NaN and a datetime NaT.
            #
            # The parentheses matter: `and` binds tighter than `or`, so
            # without them a non-NA fill value of the same Python type
            # (e.g. 0.0 vs. nan) would compare equal.
            fill_value = other._is_na_fill_value and (
                isinstance(self.fill_value, type(other.fill_value)) or
                isinstance(other.fill_value, type(self.fill_value))
            )
        else:
            fill_value = self.fill_value == other.fill_value

        return subtype and fill_value

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    @property
    def _is_na_fill_value(self):
        """Whether ``fill_value`` is considered NA by ``isna``."""
        from pandas.core.dtypes.missing import isna
        return isna(self.fill_value)

    @property
    def _is_numeric(self):
        """True unless the subtype is object dtype."""
        from pandas.core.dtypes.common import is_object_dtype
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self):
        """True when the subtype is a boolean dtype."""
        from pandas.core.dtypes.common import is_bool_dtype
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """The ``kind`` of the subtype."""
        return self.subtype.kind

    @property
    def type(self):
        """The scalar ``type`` of the subtype."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self):
        """String form: ``'Sparse[<subtype name>, <fill_value>]'``."""
        return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)

    def __repr__(self):
        return self.name

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return SparseArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Must start with ``'Sparse'``. Accepted forms are

            * ``'Sparse'``: float64 subtype with its default fill value
            * ``'Sparse[subtype]'``: subtype with its default fill value
            * ``'Sparse[subtype, fill_value]'``: accepted only when the
              string is identical to ``str()`` of the dtype built from
              ``subtype`` with its default fill value

            It is not possible to specify non-default fill values
            with a string. An argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` does not start with ``'Sparse'``, cannot be
            parsed, or names a non-default fill value.
        """
        msg = "Could not construct SparseDtype from '{}'".format(string)
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
                result = SparseDtype(sub_type)
            except Exception:
                raise TypeError(msg)
            else:
                msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
                       "looks like the fill_value in the string is not "
                       "the default for the dtype. Non-default fill_values "
                       "are not supported. Use the 'SparseDtype()' "
                       "constructor instead.")
                # A fill value given in the string is only accepted when
                # the string round-trips through the default dtype.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg.format(string))
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype):
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse
            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The text of the subtype; ``'float64'`` for a bare ``'Sparse'``.
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(
            r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
        )
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            groups = m.groupdict()
            subtype = groups['subtype']
            # Normalise to a real bool instead of leaking the matched text.
            has_fill_value = bool(groups['fill_value'])
        elif dtype == "Sparse":
            subtype = 'float64'
        else:
            raise ValueError("Cannot parse {}".format(dtype))
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype):
        """
        Check whether `dtype` (or its ``.dtype`` attribute) is sparse-like.

        A string starting with ``'Sparse'`` is parsed and its subtype is
        converted with ``np.dtype``; a ``ValueError`` from
        ``_parse_subtype`` propagates if the string cannot be parsed.
        """
        dtype = getattr(dtype, 'dtype', dtype)
        if (isinstance(dtype, compat.string_types) and
                dtype.startswith("Sparse")):
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        return isinstance(dtype, np.dtype) or dtype == 'Sparse'

# ----------------------------------------------------------------------------
# Array


_sparray_doc_kwargs = dict(klass='SparseArray')
Expand Down
Loading