Skip to content

DataFrameGroupby value_counts #44259

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/build_pandas/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ runs:
- name: Build Pandas
run: |
python setup.py build_ext -j 2
python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index
python -m pip install -e . --no-build-isolation --no-use-pep517
shell: bash -l {0}
17 changes: 3 additions & 14 deletions .github/workflows/sdist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9"]
concurrency:
group: ${{github.ref}}-${{matrix.python-version}}-sdist
cancel-in-progress: ${{github.event_name == 'pull_request'}}
Expand Down Expand Up @@ -53,24 +53,13 @@ jobs:
- uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: pandas-sdist
python-version: '${{ matrix.python-version }}'
python-version: ${{ matrix.python-version }}

- name: Install pandas from sdist
run: |
pip list
conda list
python -m pip install dist/*.gz

- name: Force oldest supported NumPy
run: |
case "${{matrix.python-version}}" in
3.8)
pip install numpy==1.18.5 ;;
3.9)
pip install numpy==1.19.3 ;;
3.10)
pip install numpy==1.21.2 ;;
esac

- name: Import pandas
run: |
cd ..
Expand Down
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,6 @@ repos:
entry: 'np\.random\.seed'
files: ^asv_bench/benchmarks
exclude: ^asv_bench/benchmarks/pandas_vb_common\.py
- id: np-testing-array-equal
name: Check for usage of numpy testing or array_equal
language: pygrep
entry: '(numpy|np)(\.testing|\.array_equal)'
files: ^pandas/tests/
types: [python]
- id: invalid-ea-testing
name: Check for invalid EA testing
language: pygrep
Expand Down
46 changes: 11 additions & 35 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,49 +35,25 @@ class NumericEngineIndexing:
params = [
_get_numeric_engines(),
["monotonic_incr", "monotonic_decr", "non_monotonic"],
[True, False],
[10 ** 5, 2 * 10 ** 6], # 2e6 is above SIZE_CUTOFF
]
param_names = ["engine_and_dtype", "index_type", "unique", "N"]
param_names = ["engine_and_dtype", "index_type"]

def setup(self, engine_and_dtype, index_type, unique, N):
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype

if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype)
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)
elif index_type == "monotonic_decr":
if unique:
arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)[::-1]
else:
assert index_type == "non_monotonic"
if unique:
arr = np.empty(N * 3, dtype=dtype)
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
arr[N:] = np.arange(N * 2, dtype=dtype)
else:
arr = np.array([1, 2, 3] * N, dtype=dtype)
N = 10 ** 5
values = list([1] * N + [2] * N + [3] * N)
arr = {
"monotonic_incr": np.array(values, dtype=dtype),
"monotonic_decr": np.array(list(reversed(values)), dtype=dtype),
"non_monotonic": np.array([1, 2, 3] * N, dtype=dtype),
}[index_type]

self.data = engine(arr)
# code below avoids populating the mapping etc. while timing.
self.data.get_loc(2)

self.key_middle = arr[len(arr) // 2]
self.key_early = arr[2]

def time_get_loc(self, engine_and_dtype, index_type, unique, N):
self.data.get_loc(self.key_early)

def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
# searchsorted performance may be different near the middle of a range
# vs near an endpoint
self.data.get_loc(self.key_middle)
def time_get_loc(self, engine_and_dtype, index_type):
self.data.get_loc(2)


class ObjectEngineIndexing:
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db-min.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-locale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-locale_slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-minimum_versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8.0

# tools
- cython=0.29.24
- cython=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-39-numpydev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
- pytz
- pip
- pip:
- cython==0.29.24 # GH#34014
- cython==0.29.21 # GH#34014
- "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple"
- "--pre"
- "numpy"
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-39-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.9

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.9

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-cov
- pytest-xdist>=1.31
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-macos-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ dependencies:
- xlwt
- pip
- pip:
- cython>=0.29.24
- cython>=0.29.21
- pyreadstat
- pyxlsb
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dependencies:
- python=3.9

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/circle-38-arm64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ dependencies:
- python=3.8

# tools
- cython>=0.29.24
- cython>=0.29.21
- pytest>=6.0
- pytest-xdist>=1.31
- hypothesis>=5.5.3
Expand Down
1 change: 0 additions & 1 deletion doc/source/reference/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ objects.
api.extensions.ExtensionArray.nbytes
api.extensions.ExtensionArray.ndim
api.extensions.ExtensionArray.shape
api.extensions.ExtensionArray.tolist

Additionally, we have some utility methods for ensuring your object
behaves correctly.
Expand Down
1 change: 0 additions & 1 deletion doc/source/reference/window.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ Exponentially-weighted window functions
:toctree: api/

ExponentialMovingWindow.mean
ExponentialMovingWindow.sum
ExponentialMovingWindow.std
ExponentialMovingWindow.var
ExponentialMovingWindow.corr
Expand Down
9 changes: 0 additions & 9 deletions doc/source/user_guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -997,15 +997,6 @@ a list of items you want to check for.

df.isin(values)

To return the DataFrame of booleans where the values are *not* in the original DataFrame,
use the ``~`` operator:

.. ipython:: python

values = {'ids': ['a', 'b'], 'vals': [1, 3]}

~df.isin(values)

Combine DataFrame's ``isin`` with the ``any()`` and ``all()`` methods to
quickly select subsets of your data that meet a given criteria.
To select a row where each column meets its own criterion:
Expand Down
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v1.3.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`)
- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`)
- Fixed regression in :func:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
- Fixed regression in :func:`read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
- Fixed regression in :meth:`DataFrame.explode` raising ``AssertionError`` when ``column`` is any scalar which is not a string (:issue:`43314`)
- Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`)
- Fixed regression when iterating over a :class:`DataFrame.groupby.rolling` object causing the resulting DataFrames to have an incorrect index if the input groupings were not sorted (:issue:`43386`)
Expand Down
4 changes: 1 addition & 3 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ Other enhancements
- The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`)
- Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
- :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -473,8 +474,6 @@ Datetimelike
- Bug in :func:`to_datetime` with ``format`` and ``pandas.NA`` was raising ``ValueError`` (:issue:`42957`)
- :func:`to_datetime` would silently swap ``MM/DD/YYYY`` and ``DD/MM/YYYY`` formats if the given ``dayfirst`` option could not be respected - now, a warning is raised in the case of delimited date strings (e.g. ``31-12-2012``) (:issue:`12585`)
- Bug in :meth:`date_range` and :meth:`bdate_range` do not return right bound when ``start`` = ``end`` and set is closed on one side (:issue:`43394`)
- Bug in inplace addition and subtraction of :class:`DatetimeIndex` or :class:`TimedeltaIndex` with :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`43904`)
- Bug in calling ``np.isnan``, ``np.isfinite``, or ``np.isinf`` on a timezone-aware :class:`DatetimeIndex` incorrectly raising ``TypeError`` (:issue:`43917`)
-

Timedelta
Expand All @@ -494,7 +493,6 @@ Numeric
- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` used (:issue:`41931`)
- Bug in ``numexpr`` engine still being used when the option ``compute.use_numexpr`` is set to ``False`` (:issue:`32556`)
- Bug in :class:`DataFrame` arithmetic ops with a subclass whose :meth:`_constructor` attribute is a callable other than the subclass itself (:issue:`43201`)
- Bug in arithmetic operations involving :class:`RangeIndex` where the result would have the incorrect ``name`` (:issue:`43962`)
-

Conversion
Expand Down
1 change: 0 additions & 1 deletion pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,6 @@ class UInt16HashTable(HashTable): ...
class UInt8HashTable(HashTable): ...
class StringHashTable(HashTable): ...
class PyObjectHashTable(HashTable): ...
class IntpHashTable(HashTable): ...

def duplicated_int64(
values: np.ndarray, # const int64_t[:] values
Expand Down
47 changes: 35 additions & 12 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,6 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"


# map derived hash-map types onto basic hash-map types:
if np.dtype(np.intp) == np.dtype(np.int64):
IntpHashTable = Int64HashTable
unique_label_indices = _unique_label_indices_int64
elif np.dtype(np.intp) == np.dtype(np.int32):
IntpHashTable = Int32HashTable
unique_label_indices = _unique_label_indices_int32
else:
raise ValueError(np.dtype(np.intp))


cdef class Factorizer:
cdef readonly:
Py_ssize_t count
Expand Down Expand Up @@ -180,3 +168,38 @@ cdef class Int64Factorizer(Factorizer):

self.count = len(self.uniques)
return labels


@cython.wraparound(False)
@cython.boundscheck(False)
def unique_label_indices(const int64_t[:] labels) -> ndarray:
    """
    Return the indices of the first occurrence of each unique label,
    *excluding* -1 (the missing-value sentinel for factorized labels).

    Equivalent to::

        np.unique(labels, return_index=True)[1]

    (minus any entry pointing at a -1 label), but implemented with a
    khash int64 hash table so the scan runs in a single O(n) pass
    without the GIL.
    """
    cdef:
        int ret = 0  # khash out-param: nonzero iff the key was newly inserted
        Py_ssize_t i, n = len(labels)
        kh_int64_t *table = kh_init_int64()
        Int64Vector idx = Int64Vector()  # growable buffer of first-occurrence indices
        ndarray[int64_t, ndim=1] arr
        Int64VectorData *ud = idx.data

    # Pre-size the table for n keys, capped at SIZE_HINT_LIMIT to avoid
    # over-allocating for very large inputs.
    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

    with nogil:
        for i in range(n):
            kh_put_int64(table, labels[i], &ret)
            if ret != 0:
                # First time we see this label: record its position.
                if needs_resize(ud):
                    # Vector growth allocates Python memory, so we must
                    # briefly re-acquire the GIL.
                    with gil:
                        idx.resize()
                append_data_int64(ud, i)

    kh_destroy_int64(table)

    arr = idx.to_array()
    # Hash-table iteration order is arbitrary; sort the indices by their
    # label values so the result matches np.unique's sorted-label order.
    arr = arr[np.asarray(labels)[arr].argsort()]

    # -1 sorts first among int64 labels, so if present it sits at arr[0];
    # drop it to honour the "excluding -1" contract.
    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
Loading