diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4e792da31e1d5..7c57e6ee9dbfd 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -378,8 +378,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.boundscheck(False) @cython.wraparound(False) def group_any_all(uint8_t[:] out, - const int64_t[:] labels, const uint8_t[:] values, + const int64_t[:] labels, const uint8_t[:] mask, object val_test, bint skipna): @@ -560,7 +560,8 @@ def _group_var(floating[:, :] out, int64_t[:] counts, floating[:, :] values, const int64_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1, + int64_t ddof=1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean @@ -600,10 +601,10 @@ def _group_var(floating[:, :] out, for i in range(ncounts): for j in range(K): ct = nobs[i, j] - if ct < 2: + if ct <= ddof: out[i, j] = NAN else: - out[i, j] /= (ct - 1) + out[i, j] /= (ct - ddof) group_var_float32 = _group_var['float'] @@ -715,8 +716,8 @@ group_ohlc_float64 = _group_ohlc['double'] @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t] out, - ndarray[int64_t] labels, numeric[:] values, + ndarray[int64_t] labels, ndarray[uint8_t] mask, float64_t q, object interpolation): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index db5df9818b0b0..cec3d9711a8ca 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1736,7 +1736,8 @@ def _wrap_aggregated_output( DataFrame """ indexed_output = {key.position: val for key, val in output.items()} - columns = Index(key.label for key in output) + name = self._obj_with_exclusions._get_axis(1 - self.axis).name + columns = Index([key.label for key in output], name=name) result = self.obj._constructor(indexed_output) result.columns = columns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 48fdb14ebe90c..b92e75f16e965 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1277,6 +1277,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: return self._get_cythonized_result( "group_any_all", aggregate=True, + numeric_only=False, cython_dtype=np.dtype(np.uint8), needs_values=True, needs_mask=True, @@ -1433,18 +1434,16 @@ def std(self, ddof: int = 1): Series or DataFrame Standard deviation of values within each group. """ - result = self.var(ddof=ddof) - if result.ndim == 1: - result = np.sqrt(result) - else: - cols = result.columns.get_indexer_for( - result.columns.difference(self.exclusions).unique() - ) - # TODO(GH-22046) - setting with iloc broken if labels are not unique - # .values to remove labels - result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values - - return result + return self._get_cythonized_result( + "group_var_float64", + aggregate=True, + needs_counts=True, + needs_values=True, + needs_2d=True, + cython_dtype=np.dtype(np.float64), + post_processing=lambda vals, inference: np.sqrt(vals), + ddof=ddof, + ) @Substitution(name="groupby") @Appender(_common_see_also) @@ -1778,6 +1777,7 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result( "group_fillna_indexer", + numeric_only=False, needs_mask=True, cython_dtype=np.dtype(np.int64), result_is_index=True, @@ -2078,6 +2078,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return self._get_cythonized_result( "group_quantile", aggregate=True, + numeric_only=False, needs_values=True, needs_mask=True, cython_dtype=np.dtype(np.float64), @@ -2367,7 +2368,11 @@ def _get_cythonized_result( how: str, cython_dtype: np.dtype, aggregate: bool = False, + numeric_only: bool = True, + needs_counts: bool = False, needs_values: bool = False, + needs_2d: bool = False, + min_count: Optional[int] = None, needs_mask: bool = False, needs_ngroups: bool = False, result_is_index: bool = False, @@ -2386,9 +2391,18 @@ def _get_cythonized_result( aggregate : bool, default False Whether the result should be aggregated to match the number of groups + numeric_only : bool, default True + Whether only numeric datatypes should be computed + needs_counts : bool, default False + Whether the counts should be a part of the Cython call needs_values : bool, default False Whether the values should be a part of the Cython call signature + needs_2d : bool, default False + Whether the values and result of the Cython call signature + are at least 2-dimensional. + min_count : int, default None + When not None, min_count for the Cython call needs_mask : bool, default False Whether boolean mask needs to be part of the Cython call signature @@ -2418,7 +2432,7 @@ def _get_cythonized_result( if result_is_index and aggregate: raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") if post_processing: - if not callable(pre_processing): + if not callable(post_processing): raise ValueError("'post_processing' must be a callable!") if pre_processing: if not callable(pre_processing): @@ -2438,21 +2452,39 @@ def _get_cythonized_result( name = obj.name values = obj._values + if numeric_only and not is_numeric_dtype(values): + continue + if aggregate: result_sz = ngroups else: result_sz = len(values) result = np.zeros(result_sz, dtype=cython_dtype) - func = partial(base_func, result, labels) + if needs_2d: + result = result.reshape((-1, 1)) + func = partial(base_func, result) + inferences = None + if needs_counts: + counts = np.zeros(self.ngroups, dtype=np.int64) + func = partial(func, counts) + if needs_values: vals = values if pre_processing: vals, inferences = pre_processing(vals) + if needs_2d: + vals = vals.reshape((-1, 1)) + vals = vals.astype(cython_dtype, copy=False) func = partial(func, vals) + func = partial(func, labels) + + if min_count is not None: + func = partial(func, min_count) + if needs_mask: mask = isna(values).view(np.uint8) func = partial(func, mask) @@ -2462,6 +2494,9 @@ def _get_cythonized_result( func(**kwargs) # Call func to modify indexer values in place + if needs_2d: + result = result.reshape(-1) + if result_is_index: result = algorithms.take_nd(values, result) @@ -2512,6 +2547,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): return self._get_cythonized_result( "group_shift_indexer", + numeric_only=False, cython_dtype=np.dtype(np.int64), needs_ngroups=True, result_is_index=True,