pandas-dev · TomAugspurger · Oct 16, 2018 · Oct 14, 2018 · Oct 14, 2018 · Oct 14, 2018
diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
@@ -3,7 +3,7 @@
                                   register_index_accessor,
                                   register_series_accessor)
 from pandas.core.algorithms import take  # noqa
-from pandas.core.arrays.base import (ExtensionArray,    # noqa
+from pandas.core.arrays import (ExtensionArray,    # noqa
                                      ExtensionScalarOpsMixin)
 from pandas.core.dtypes.dtypes import (  # noqa
     ExtensionDtype, register_extension_dtype

diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py
@@ -19,7 +19,8 @@
 """
 
 from numpy import ndarray
-from pandas.util._validators import (validate_args, validate_kwargs,
+from pandas.util._validators import (validate_args,
+                                     validate_kwargs,
                                      validate_args_and_kwargs)
 from pandas.errors import UnsupportedFunctionCall
 from pandas.core.dtypes.common import is_integer, is_bool

diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
@@ -56,6 +56,8 @@ def load_reduce(self):
 
 # If classes are moved, provide compat here.
 _class_locations_map = {
+    ('pandas.core.sparse.array', 'SparseArray'):
+        ('pandas.core.arrays', 'SparseArray'),
 
     # 15477
     ('pandas.core.base', 'FrozenNDArray'):
@@ -88,7 +90,7 @@ def load_reduce(self):
 
     # 15998 top-level dirs moving
     ('pandas.sparse.array', 'SparseArray'):
-        ('pandas.core.sparse.array', 'SparseArray'),
+        ('pandas.core.arrays.sparse', 'SparseArray'),
     ('pandas.sparse.series', 'SparseSeries'):
         ('pandas.core.sparse.series', 'SparseSeries'),
     ('pandas.sparse.frame', 'SparseDataFrame'):
@@ -112,7 +114,7 @@ def load_reduce(self):
 
     # 19269, arrays moving
     ('pandas.core.categorical', 'Categorical'):
-        ('pandas.core.arrays', 'Categorical'),
+        ('pandas.core.arrays.categorical', 'Categorical'),
 
     # 19939, add timedeltaindex, float64index compat from 15998 move
     ('pandas.tseries.tdi', 'TimedeltaIndex'):

diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -8,3 +8,4 @@
 from .timedeltas import TimedeltaArrayMixin  # noqa
 from .integer import (  # noqa
     IntegerArray, integer_array)
+from .sparse import SparseArray  # noqa
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -523,7 +523,7 @@ def _add_delta(self, delta):
         The result's name is set outside of _add_delta by the calling
         method (__add__ or __sub__)
         """
-        from pandas.core.arrays.timedeltas import TimedeltaArrayMixin
+        from pandas.core.arrays import TimedeltaArrayMixin
 
         if isinstance(delta, (Tick, timedelta, np.timedelta64)):
             new_values = self._add_delta_td(delta)
@@ -818,7 +818,7 @@ def to_period(self, freq=None):
         pandas.PeriodIndex: Immutable ndarray holding ordinal values
         pandas.DatetimeIndex.to_pydatetime: Return DatetimeIndex as object
         """
-        from pandas.core.arrays.period import PeriodArrayMixin
+        from pandas.core.arrays import PeriodArrayMixin
 
         if self.tz is not None:
             warnings.warn("Converting to PeriodArray/Index representation "

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -18,7 +18,8 @@
     is_integer_dtype,
     is_object_dtype,
     is_list_like)
-from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.arrays import (ExtensionArray,
+                                ExtensionOpsMixin)
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna, notna

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -23,7 +23,7 @@
 from pandas.util._decorators import Appender
 from pandas.util._doctools import _WritableDoc
 
-from . import ExtensionArray, Categorical
+from pandas.core.arrays import ExtensionArray, Categorical
 
 _VALID_CLOSED = {'left', 'right', 'both', 'neither'}
 _interval_shared_docs = {}

diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -301,7 +301,7 @@ def to_timestamp(self, freq=None, how='start'):
         -------
         DatetimeArray/Index
         """
-        from pandas.core.arrays.datetimes import DatetimeArrayMixin
+        from pandas.core.arrays import DatetimeArrayMixin
 
         how = libperiod._validate_end_alias(how)
 

diff --git a/pandas/core/sparse/array.py → pandas/core/arrays/sparse.py b/pandas/core/sparse/array.py → pandas/core/arrays/sparse.py
@@ -4,6 +4,7 @@
 from __future__ import division
 # pylint: disable=E1101,E1103,W0231
 
+import re
 import operator
 import numbers
 import numpy as np
@@ -16,8 +17,10 @@
 from pandas.errors import PerformanceWarning
 from pandas.compat.numpy import function as nv
 
-from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
+from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
 import pandas.core.common as com
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.generic import (
     ABCSparseSeries, ABCSeries, ABCIndexClass
 )
@@ -45,7 +48,252 @@
 import pandas.core.algorithms as algos
 import pandas.io.formats.printing as printing
 
-from pandas.core.sparse.dtype import SparseDtype
+
+# ----------------------------------------------------------------------------
+# Dtype
+
+@register_extension_dtype
+class SparseDtype(ExtensionDtype):
+    """
+    Dtype for data stored in :class:`SparseArray`.
+
+    This dtype implements the pandas ExtensionDtype interface.
+
+    .. versionadded:: 0.24.0
+
+    Parameters
+    ----------
+    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
+        The dtype of the underlying array storing the non-fill value values.
+    fill_value : scalar, optional.
+        The scalar value not stored in the SparseArray. By default, this
+        depends on `dtype`.
+
+        ========== ==========
+        dtype      na_value
+        ========== ==========
+        float      ``np.nan``
+        int        ``0``
+        bool       ``False``
+        datetime64 ``pd.NaT``
+        timedelta64 ``pd.NaT``
+        ========== ==========
+
+        The default value may be overridden by specifying a `fill_value`.
+    """
+    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
+    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
+    # Without is_na_fill_value in the comparison, those would be equal since
+    # hash(nan) is (sometimes?) 0.
+    _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')
+
+    def __init__(self, dtype=np.float64, fill_value=None):
+        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
+        from pandas.core.dtypes.missing import na_value_for_dtype
+        from pandas.core.dtypes.common import (
+            pandas_dtype, is_string_dtype, is_scalar
+        )
+
+        if isinstance(dtype, type(self)):
+            if fill_value is None:
+                fill_value = dtype.fill_value
+            dtype = dtype.subtype
+
+        dtype = pandas_dtype(dtype)
+        if is_string_dtype(dtype):
+            dtype = np.dtype('object')
+
+        if fill_value is None:
+            fill_value = na_value_for_dtype(dtype)
+
+        if not is_scalar(fill_value):
+            raise ValueError("fill_value must be a scalar. Got {} "
+                             "instead".format(fill_value))
+        self._dtype = dtype
+        self._fill_value = fill_value
+
+    def __hash__(self):
+        # Python3 doesn't inherit __hash__ when a base class overrides
+        # __eq__, so we explicitly do it here.
+        return super(SparseDtype, self).__hash__()
+
+    def __eq__(self, other):
+        # We have to override __eq__ to handle NA values in _metadata.
+        # The base class does simple == checks, which fail for NA.
+        if isinstance(other, compat.string_types):
+            try:
+                other = self.construct_from_string(other)
+            except TypeError:
+                return False
+
+        if isinstance(other, type(self)):
+            subtype = self.subtype == other.subtype
+            if self._is_na_fill_value:
+                # this case is complicated by two things:
+                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
+                # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
+                # i.e. we want to treat any floating-point NaN as equal, but
+                # not a floating-point NaN and a datetime NaT.
+                fill_value = (
+                    other._is_na_fill_value and
+                    isinstance(self.fill_value, type(other.fill_value)) or
+                    isinstance(other.fill_value, type(self.fill_value))
+                )
+            else:
+                fill_value = self.fill_value == other.fill_value
+
+            return subtype and fill_value
+        return False
+
+    @property
+    def fill_value(self):
+        """
+        The fill value of the array.
+
+        Converting the SparseArray to a dense ndarray will fill the
+        array with this value.
+
+        .. warning::
+
+           It's possible to end up with a SparseArray that has ``fill_value``
+           values in ``sp_values``. This can occur, for example, when setting
+           ``SparseArray.fill_value`` directly.
+        """
+        return self._fill_value
+
+    @property
+    def _is_na_fill_value(self):
+        from pandas.core.dtypes.missing import isna
+        return isna(self.fill_value)
+
+    @property
+    def _is_numeric(self):
+        from pandas.core.dtypes.common import is_object_dtype
+        return not is_object_dtype(self.subtype)
+
+    @property
+    def _is_boolean(self):
+        from pandas.core.dtypes.common import is_bool_dtype
+        return is_bool_dtype(self.subtype)
+
+    @property
+    def kind(self):
+        return self.subtype.kind
+
+    @property
+    def type(self):
+        return self.subtype.type
+
+    @property
+    def subtype(self):
+        return self._dtype
+
+    @property
+    def name(self):
+        return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)
+
+    def __repr__(self):
+        return self.name
+
+    @classmethod
+    def construct_array_type(cls):
+        return SparseArray
+
+    @classmethod
+    def construct_from_string(cls, string):
+        """
+        Construct a SparseDtype from a string form.
+
+        Parameters
+        ----------
+        string : str
+            Can take the following forms.
+
+            string           dtype
+            ================ ============================
+            'int'            SparseDtype[np.int64, 0]
+            'Sparse'         SparseDtype[np.float64, nan]
+            'Sparse[int]'    SparseDtype[np.int64, 0]
+            'Sparse[int, 0]' SparseDtype[np.int64, 0]
+            ================ ============================
+
+            It is not possible to specify non-default fill values
+            with a string. An argument like ``'Sparse[int, 1]'``
+            will raise a ``TypeError`` because the default fill value
+            for integers is 0.
+
+        Returns
+        -------
+        SparseDtype
+        """
+        msg = "Could not construct SparseDtype from '{}'".format(string)
+        if string.startswith("Sparse"):
+            try:
+                sub_type, has_fill_value = cls._parse_subtype(string)
+                result = SparseDtype(sub_type)
+            except Exception:
+                raise TypeError(msg)
+            else:
+                msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
+                       "looks like the fill_value in the string is not "
+                       "the default for the dtype. Non-default fill_values "
+                       "are not supported. Use the 'SparseDtype()' "
+                       "constructor instead.")
+                if has_fill_value and str(result) != string:
+                    raise TypeError(msg.format(string))
+                return result
+        else:
+            raise TypeError(msg)
+
+    @staticmethod
+    def _parse_subtype(dtype):
+        """
+        Parse a string to get the subtype
+
+        Parameters
+        ----------
+        dtype : str
+            A string like
+
+            * Sparse[subtype]
+            * Sparse[subtype, fill_value]
+
+        Returns
+        -------
+        subtype : str
+
+        Raises
+        ------
+        ValueError
+            When the subtype cannot be extracted.
+        """
+        xpr = re.compile(
+            r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
+        )
+        m = xpr.match(dtype)
+        has_fill_value = False
+        if m:
+            subtype = m.groupdict()['subtype']
+            has_fill_value = m.groupdict()['fill_value'] or has_fill_value
+        elif dtype == "Sparse":
+            subtype = 'float64'
+        else:
+            raise ValueError("Cannot parse {}".format(dtype))
+        return subtype, has_fill_value
+
+    @classmethod
+    def is_dtype(cls, dtype):
+        dtype = getattr(dtype, 'dtype', dtype)
+        if (isinstance(dtype, compat.string_types) and
+                dtype.startswith("Sparse")):
+            sub_type, _ = cls._parse_subtype(dtype)
+            dtype = np.dtype(sub_type)
+        elif isinstance(dtype, cls):
+            return True
+        return isinstance(dtype, np.dtype) or dtype == 'Sparse'
+
+# ----------------------------------------------------------------------------
+# Array
 
 
 _sparray_doc_kwargs = dict(klass='SparseArray')