Skip to content

DataFrameGroupby value_counts #44259

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/build_pandas/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ runs:
- name: Build Pandas
run: |
python setup.py build_ext -j 2
python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index
python -m pip install -e . --no-build-isolation --no-use-pep517
shell: bash -l {0}
17 changes: 3 additions & 14 deletions .github/workflows/sdist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9"]
concurrency:
group: ${{github.ref}}-${{matrix.python-version}}-sdist
cancel-in-progress: ${{github.event_name == 'pull_request'}}
Expand Down Expand Up @@ -53,24 +53,13 @@ jobs:
- uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: pandas-sdist
python-version: '${{ matrix.python-version }}'
python-version: ${{ matrix.python-version }}

- name: Install pandas from sdist
run: |
pip list
conda list
python -m pip install dist/*.gz

- name: Force oldest supported NumPy
run: |
case "${{matrix.python-version}}" in
3.8)
pip install numpy==1.18.5 ;;
3.9)
pip install numpy==1.19.3 ;;
3.10)
pip install numpy==1.21.2 ;;
esac

- name: Import pandas
run: |
cd ..
Expand Down
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,6 @@ repos:
entry: 'np\.random\.seed'
files: ^asv_bench/benchmarks
exclude: ^asv_bench/benchmarks/pandas_vb_common\.py
- id: np-testing-array-equal
name: Check for usage of numpy testing or array_equal
language: pygrep
entry: '(numpy|np)(\.testing|\.array_equal)'
files: ^pandas/tests/
types: [python]
- id: invalid-ea-testing
name: Check for invalid EA testing
language: pygrep
Expand Down
46 changes: 11 additions & 35 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,49 +35,25 @@ class NumericEngineIndexing:
params = [
_get_numeric_engines(),
["monotonic_incr", "monotonic_decr", "non_monotonic"],
[True, False],
[10 ** 5, 2 * 10 ** 6], # 2e6 is above SIZE_CUTOFF
]
param_names = ["engine_and_dtype", "index_type", "unique", "N"]
param_names = ["engine_and_dtype", "index_type"]

def setup(self, engine_and_dtype, index_type, unique, N):
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype

if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype)
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)
elif index_type == "monotonic_decr":
if unique:
arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)[::-1]
else:
assert index_type == "non_monotonic"
if unique:
arr = np.empty(N * 3, dtype=dtype)
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
arr[N:] = np.arange(N * 2, dtype=dtype)
else:
arr = np.array([1, 2, 3] * N, dtype=dtype)
N = 10 ** 5
values = list([1] * N + [2] * N + [3] * N)
arr = {
"monotonic_incr": np.array(values, dtype=dtype),
"monotonic_decr": np.array(list(reversed(values)), dtype=dtype),
"non_monotonic": np.array([1, 2, 3] * N, dtype=dtype),
}[index_type]

self.data = engine(arr)
# code below avoids populating the mapping etc. while timing.
self.data.get_loc(2)

self.key_middle = arr[len(arr) // 2]
self.key_early = arr[2]

def time_get_loc(self, engine_and_dtype, index_type, unique, N):
self.data.get_loc(self.key_early)

def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
# searchsorted performance may be different near the middle of a range
# vs near an endpoint
self.data.get_loc(self.key_middle)
def time_get_loc(self, engine_and_dtype, index_type):
self.data.get_loc(2)


class ObjectEngineIndexing:
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db-min.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-locale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-locale_slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-minimum_versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8.0

# tools
- cython=0.29.24
- cython=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-39-numpydev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
- pytz
- pip
- pip:
- cython==0.29.24 # GH#34014
- cython==0.29.21 # GH#34014
- "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple"
- "--pre"
- "numpy"
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-39-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.9

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.9

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-macos-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ dependencies:
- xlwt
- pip
- pip:
- cython>=0.29.24
- cython>=0.29.21
- pyreadstat
- pyxlsb
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.9

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/circle-38-arm64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
1 change: 0 additions & 1 deletion doc/source/reference/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ objects.
api.extensions.ExtensionArray.nbytes
api.extensions.ExtensionArray.ndim
api.extensions.ExtensionArray.shape
api.extensions.ExtensionArray.tolist

Additionally, we have some utility methods for ensuring your object
behaves correctly.
Expand Down
1 change: 0 additions & 1 deletion doc/source/reference/window.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ Exponentially-weighted window functions
:toctree: api/

ExponentialMovingWindow.mean
ExponentialMovingWindow.sum
ExponentialMovingWindow.std
ExponentialMovingWindow.var
ExponentialMovingWindow.corr
Expand Down
9 changes: 0 additions & 9 deletions doc/source/user_guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -997,15 +997,6 @@ a list of items you want to check for.

df.isin(values)

To return the DataFrame of booleans where the values are *not* in the original DataFrame,
use the ``~`` operator:

.. ipython:: python

values = {'ids': ['a', 'b'], 'vals': [1, 3]}

~df.isin(values)

Combine DataFrame's ``isin`` with the ``any()`` and ``all()`` methods to
quickly select subsets of your data that meet a given criteria.
To select a row where each column meets its own criterion:
Expand Down
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v1.3.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`)
- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`)
- Fixed regression in :func:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
- Fixed regression in :func:`read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
- Fixed regression in :meth:`DataFrame.explode` raising ``AssertionError`` when ``column`` is any scalar which is not a string (:issue:`43314`)
- Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`)
- Fixed regression when iterating over a :class:`DataFrame.groupby.rolling` object causing the resulting DataFrames to have an incorrect index if the input groupings were not sorted (:issue:`43386`)
Expand Down
4 changes: 1 addition & 3 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ Other enhancements
- The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`)
- Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
- :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -473,8 +474,6 @@ Datetimelike
- Bug in :func:`to_datetime` with ``format`` and ``pandas.NA`` was raising ``ValueError`` (:issue:`42957`)
- :func:`to_datetime` would silently swap ``MM/DD/YYYY`` and ``DD/MM/YYYY`` formats if the given ``dayfirst`` option could not be respected - now, a warning is raised in the case of delimited date strings (e.g. ``31-12-2012``) (:issue:`12585`)
- Bug in :meth:`date_range` and :meth:`bdate_range` do not return right bound when ``start`` = ``end`` and set is closed on one side (:issue:`43394`)
- Bug in inplace addition and subtraction of :class:`DatetimeIndex` or :class:`TimedeltaIndex` with :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`43904`)
- Bug in calling ``np.isnan``, ``np.isfinite``, or ``np.isinf`` on a timezone-aware :class:`DatetimeIndex` incorrectly raising ``TypeError`` (:issue:`43917`)
-

Timedelta
Expand All @@ -494,7 +493,6 @@ Numeric
- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` used (:issue:`41931`)
- Bug in ``numexpr`` engine still being used when the option ``compute.use_numexpr`` is set to ``False`` (:issue:`32556`)
- Bug in :class:`DataFrame` arithmetic ops with a subclass whose :meth:`_constructor` attribute is a callable other than the subclass itself (:issue:`43201`)
- Bug in arithmetic operations involving :class:`RangeIndex` where the result would have the incorrect ``name`` (:issue:`43962`)
-

Conversion
Expand Down
1 change: 0 additions & 1 deletion pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,6 @@ class UInt16HashTable(HashTable): ...
class UInt8HashTable(HashTable): ...
class StringHashTable(HashTable): ...
class PyObjectHashTable(HashTable): ...
class IntpHashTable(HashTable): ...

def duplicated_int64(
values: np.ndarray, # const int64_t[:] values
Expand Down
47 changes: 35 additions & 12 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,6 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"


# map derived hash-map types onto basic hash-map types:
if np.dtype(np.intp) == np.dtype(np.int64):
IntpHashTable = Int64HashTable
unique_label_indices = _unique_label_indices_int64
elif np.dtype(np.intp) == np.dtype(np.int32):
IntpHashTable = Int32HashTable
unique_label_indices = _unique_label_indices_int32
else:
raise ValueError(np.dtype(np.intp))


cdef class Factorizer:
cdef readonly:
Py_ssize_t count
Expand Down Expand Up @@ -180,3 +168,38 @@ cdef class Int64Factorizer(Factorizer):

self.count = len(self.uniques)
return labels


@cython.wraparound(False)
@cython.boundscheck(False)
def unique_label_indices(const int64_t[:] labels) -> ndarray:
    """
    Return the indices of the first occurrence of each unique label,
    *excluding* -1 (the missing-value sentinel for factorized labels).

    Equivalent to::

        np.unique(labels, return_index=True)[1]

    (minus any entry pointing at a -1 label), but implemented with a
    khash int64 hash table so the scan runs in a single O(n) pass
    without the GIL.
    """
    cdef:
        int ret = 0  # khash out-param: nonzero iff the key was newly inserted
        Py_ssize_t i, n = len(labels)
        kh_int64_t *table = kh_init_int64()
        Int64Vector idx = Int64Vector()  # growable buffer of first-occurrence indices
        ndarray[int64_t, ndim=1] arr
        Int64VectorData *ud = idx.data

    # Pre-size the table for n keys, capped at SIZE_HINT_LIMIT to avoid
    # over-allocating for very large inputs.
    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

    with nogil:
        for i in range(n):
            kh_put_int64(table, labels[i], &ret)
            if ret != 0:
                # First time we see this label: record its position.
                if needs_resize(ud):
                    # Vector growth allocates Python memory, so we must
                    # briefly re-acquire the GIL.
                    with gil:
                        idx.resize()
                append_data_int64(ud, i)

    kh_destroy_int64(table)

    arr = idx.to_array()
    # Hash-table iteration order is arbitrary; sort the indices by their
    # label values so the result matches np.unique's sorted-label order.
    arr = arr[np.asarray(labels)[arr].argsort()]

    # -1 sorts first among int64 labels, so if present it sits at arr[0];
    # drop it to honour the "excluding -1" contract.
    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
Loading