From 53ae9d6d09ecdb40e3f4dc3ef00a9b6015100ab4 Mon Sep 17 00:00:00 2001
From: Richard
Date: Thu, 21 May 2020 16:56:04 -0400
Subject: [PATCH 1/6] CLN: Unify signatures in _libs.groupby

---
 pandas/_libs/groupby.pyx       | 91 ++++++++++++++++++----------------
 pandas/core/groupby/groupby.py | 36 +++++++++++++-
 2 files changed, 84 insertions(+), 43 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index d5d706650bb34..69ec9b697a847 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -714,10 +714,12 @@ group_ohlc_float64 = _group_ohlc['double']

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_quantile(ndarray[float64_t] out,
-                   ndarray[int64_t] labels,
-                   numeric[:] values,
-                   ndarray[uint8_t] mask,
+def group_quantile(floating[:, :] out,
+                   int64_t[:] counts,
+                   floating[:, :] values,
+                   const int64_t[:] labels,
+                   Py_ssize_t min_count,
+                   const uint8_t[:, :] mask,
                    float64_t q,
                    object interpolation):
     """
@@ -740,12 +742,12 @@ def group_quantile(ndarray[float64_t] out,
     provided `out` parameter.
     """
     cdef:
-        Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz
+        Py_ssize_t i, N=len(labels), K, ngroups, grp_sz=0, non_na_sz
         Py_ssize_t grp_start=0, idx=0
         int64_t lab
         uint8_t interp
         float64_t q_idx, frac, val, next_val
-        ndarray[int64_t] counts, non_na_counts, sort_arr
+        int64_t[:, :] non_na_counts, sort_arrs

     assert values.shape[0] == N

@@ -761,59 +763,64 @@ def group_quantile(ndarray[float64_t] out,
     }
     interp = inter_methods[interpolation]

-    counts = np.zeros_like(out, dtype=np.int64)
     non_na_counts = np.zeros_like(out, dtype=np.int64)
+    sort_arrs = np.empty_like(values, dtype=np.int64)
     ngroups = len(counts)

+    N, K = (<object>values).shape
+
     # First figure out the size of every group
     with nogil:
         for i in range(N):
             lab = labels[i]
             if lab == -1:  # NA group label
                 continue

-            counts[lab] += 1
-            if not mask[i]:
-                non_na_counts[lab] += 1
+            for j in range(K):
+                if not mask[i, j]:
+                    non_na_counts[lab, j] += 1

-    # Get an index of values sorted by labels and then values
-    order = (values, labels)
-    sort_arr = np.lexsort(order).astype(np.int64, copy=False)
+    for j in range(K):
+        order = (values[:, j], labels)
+        r = np.lexsort(order).astype(np.int64, copy=False)
+        # TODO: Need better way to assign r to column j
+        for i in range(N):
+            sort_arrs[i, j] = r[i]

     with nogil:
         for i in range(ngroups):
             # Figure out how many group elements there are
             grp_sz = counts[i]
-            non_na_sz = non_na_counts[i]
-
-            if non_na_sz == 0:
-                out[i] = NaN
-            else:
-                # Calculate where to retrieve the desired value
-                # Casting to int will intentionally truncate result
-                idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1))
-
-                val = values[sort_arr[idx]]
-                # If requested quantile falls evenly on a particular index
-                # then write that index's value out. Otherwise interpolate
-                q_idx = q * (non_na_sz - 1)
-                frac = q_idx % 1
-
-                if frac == 0.0 or interp == INTERPOLATION_LOWER:
-                    out[i] = val
+            for j in range(K):
+                non_na_sz = non_na_counts[i, j]
+                if non_na_sz == 0:
+                    out[i, j] = NaN
                 else:
-                    next_val = values[sort_arr[idx + 1]]
-                    if interp == INTERPOLATION_LINEAR:
-                        out[i] = val + (next_val - val) * frac
-                    elif interp == INTERPOLATION_HIGHER:
-                        out[i] = next_val
-                    elif interp == INTERPOLATION_MIDPOINT:
-                        out[i] = (val + next_val) / 2.0
-                    elif interp == INTERPOLATION_NEAREST:
-                        if frac > .5 or (frac == .5 and q > .5):  # Always OK?
-                            out[i] = next_val
-                        else:
-                            out[i] = val
+                    # Calculate where to retrieve the desired value
+                    # Casting to int will intentionally truncate result
+                    idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1))
+
+                    val = values[sort_arrs[idx, j], j]
+                    # If requested quantile falls evenly on a particular index
+                    # then write that index's value out. Otherwise interpolate
+                    q_idx = q * (non_na_sz - 1)
+                    frac = q_idx % 1
+
+                    if frac == 0.0 or interp == INTERPOLATION_LOWER:
+                        out[i, j] = val
+                    else:
+                        next_val = values[sort_arrs[idx + 1, j], j]
+                        if interp == INTERPOLATION_LINEAR:
+                            out[i, j] = val + (next_val - val) * frac
+                        elif interp == INTERPOLATION_HIGHER:
+                            out[i, j] = next_val
+                        elif interp == INTERPOLATION_MIDPOINT:
+                            out[i, j] = (val + next_val) / 2.0
+                        elif interp == INTERPOLATION_NEAREST:
+                            if frac > .5 or (frac == .5 and q > .5):  # Always OK?
+                                out[i, j] = next_val
+                            else:
+                                out[i, j] = val

             # Increment the index reference in sorted_arr for the next group
             grp_start += grp_sz
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 37f2376d68d55..8af7a54772076 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2039,6 +2039,9 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
                 inference = "datetime64[ns]"
                 vals = np.asarray(vals).astype(np.float)

+            if vals.dtype != np.dtype(np.float64):
+                vals = vals.astype(np.float64)
+
             return vals, inference

         def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
@@ -2396,7 +2399,7 @@ def _get_cythonized_result(
         if result_is_index and aggregate:
             raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
         if post_processing:
-            if not callable(pre_processing):
+            if not callable(post_processing):
                 raise ValueError("'post_processing' must be a callable!")
         if pre_processing:
             if not callable(pre_processing):
@@ -2412,6 +2415,37 @@ def _get_cythonized_result(
         output: Dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)

+        if how == "group_quantile":
+            values = self._obj_with_exclusions._values
+            result_sz = ngroups if aggregate else len(values)
+
+            vals, inferences = pre_processing(values)
+            if self._obj_with_exclusions.ndim == 1:
+                width = 1
+                vals = np.reshape(vals, (-1, 1))
+            else:
+                width = len(self._obj_with_exclusions.columns)
+            result = np.zeros((result_sz, width), dtype=cython_dtype)
+            counts = np.zeros(self.ngroups, dtype=np.int64)
+            mask = isna(vals).view(np.uint8)
+
+            func = partial(base_func, result, counts, vals, labels, -1, mask)
+            func(**kwargs)  # Call func to modify indexer values in place
+            result = post_processing(result, inferences)
+
+            if self._obj_with_exclusions.ndim == 1:
+                key = base.OutputKey(label=self._obj_with_exclusions.name, position=0)
+                output[key] = result[:, 0]
+            else:
+                for idx, name in enumerate(self._obj_with_exclusions.columns):
+                    key = base.OutputKey(label=name, position=idx)
+                    output[key] = result[:, idx]
+
+            if aggregate:
+                return self._wrap_aggregated_output(output)
+            else:
+                return self._wrap_transformed_output(output)
+
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
             values = obj._values
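The first patch turns group_quantile into a single 2-D kernel: counts and min_count join the signature so it lines up with the other group aggregations, and the lexsort/select logic runs once per column. The ordering trick the kernel relies on can be sketched in plain NumPy. This is an illustrative toy, not the pandas implementation: group_quantile_2d and its arguments are made-up names, it assumes all labels are valid (no -1 entries), and it implements only the truncating 'lower' lookup, without the interpolation branches.

    import numpy as np

    def group_quantile_2d(values, labels, ngroups, q):
        # np.lexsort sorts by the last key first, so each group's values form
        # a contiguous, value-sorted run; NaNs sort to the end of each run,
        # mirroring how the kernel skips masked entries via non_na_counts.
        N, K = values.shape
        counts = np.bincount(labels, minlength=ngroups)
        out = np.full((ngroups, K), np.nan)
        for j in range(K):
            order = np.lexsort((values[:, j], labels))
            grp_start = 0
            for i in range(ngroups):
                grp = values[order[grp_start:grp_start + counts[i]], j]
                grp = grp[~np.isnan(grp)]          # keep the non-NA prefix
                if len(grp):
                    idx = int(q * (len(grp) - 1))  # truncate, like the int cast
                    out[i, j] = grp[idx]
                grp_start += counts[i]
        return out

    vals = np.array([[1.0, 10.0], [3.0, np.nan], [2.0, 30.0], [4.0, 40.0]])
    print(group_quantile_2d(vals, np.array([0, 0, 1, 1]), ngroups=2, q=0.5))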
From 7985efb63d29c03617c82585a25959ad6316cc2e Mon Sep 17 00:00:00 2001
From: Richard
Date: Wed, 27 May 2020 17:20:32 -0400
Subject: [PATCH 2/6] Complete rework

---
 pandas/_libs/groupby.pyx       | 100 ++++++++++++++++++-----------------
 pandas/core/groupby/generic.py |   6 +-
 pandas/core/groupby/groupby.py |  99 ++++++++++++++++++----------------
 3 files changed, 103 insertions(+), 102 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 69ec9b697a847..c69b00950abaf 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -378,8 +378,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_any_all(uint8_t[:] out,
-                  const int64_t[:] labels,
                   const uint8_t[:] values,
+                  const int64_t[:] labels,
                   const uint8_t[:] mask,
                   object val_test,
                   bint skipna):
@@ -560,7 +560,8 @@ def _group_var(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
-               Py_ssize_t min_count=-1):
+               Py_ssize_t min_count=-1,
+               int64_t ddof=1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         floating val, ct, oldmean
@@ -600,10 +601,10 @@ def _group_var(floating[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 ct = nobs[i, j]
-                if ct < 2:
+                if ct <= ddof:
                     out[i, j] = NAN
                 else:
-                    out[i, j] /= (ct - 1)
+                    out[i, j] /= (ct - ddof)


 group_var_float32 = _group_var['float']
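The ddof parameter added to _group_var generalizes the hard-coded sample-variance denominator: the accumulated sum of squared deviations is divided by (ct - ddof), and any group with ct <= ddof observations produces NaN rather than a meaningless estimate. A small NumPy check of those semantics (the helper name is made up for illustration):

    import numpy as np

    def var_with_ddof(x, ddof=1):
        # NaN when there are too few observations, else divide by (n - ddof)
        n = len(x)
        if n <= ddof:
            return np.nan
        return np.sum((x - x.mean()) ** 2) / (n - ddof)

    vals = np.array([1.0, 2.0, 4.0])
    assert np.isclose(var_with_ddof(vals, ddof=1), np.var(vals, ddof=1))
    assert np.isnan(var_with_ddof(vals[:1], ddof=1))  # n == ddof -> NaN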
@@ -714,12 +715,10 @@ group_ohlc_float64 = _group_ohlc['double']

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_quantile(floating[:, :] out,
-                   int64_t[:] counts,
-                   floating[:, :] values,
-                   const int64_t[:] labels,
-                   Py_ssize_t min_count,
-                   const uint8_t[:, :] mask,
+def group_quantile(ndarray[float64_t] out,
+                   numeric[:] values,
+                   ndarray[int64_t] labels,
+                   ndarray[uint8_t] mask,
                    float64_t q,
                    object interpolation):
     """
@@ -742,12 +741,12 @@ def group_quantile(floating[:, :] out,
     provided `out` parameter.
     """
     cdef:
-        Py_ssize_t i, N=len(labels), K, ngroups, grp_sz=0, non_na_sz
+        Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz
         Py_ssize_t grp_start=0, idx=0
         int64_t lab
         uint8_t interp
         float64_t q_idx, frac, val, next_val
-        int64_t[:, :] non_na_counts, sort_arrs
+        ndarray[int64_t] counts, non_na_counts, sort_arr

     assert values.shape[0] == N

@@ -763,64 +762,59 @@ def group_quantile(floating[:, :] out,
     }
     interp = inter_methods[interpolation]

+    counts = np.zeros_like(out, dtype=np.int64)
     non_na_counts = np.zeros_like(out, dtype=np.int64)
-    sort_arrs = np.empty_like(values, dtype=np.int64)
     ngroups = len(counts)

-    N, K = (<object>values).shape
-
     # First figure out the size of every group
     with nogil:
         for i in range(N):
             lab = labels[i]
             if lab == -1:  # NA group label
                 continue

+            counts[lab] += 1
-            for j in range(K):
-                if not mask[i, j]:
-                    non_na_counts[lab, j] += 1
+            if not mask[i]:
+                non_na_counts[lab] += 1

-    for j in range(K):
-        order = (values[:, j], labels)
-        r = np.lexsort(order).astype(np.int64, copy=False)
-        # TODO: Need better way to assign r to column j
-        for i in range(N):
-            sort_arrs[i, j] = r[i]
+    # Get an index of values sorted by labels and then values
+    order = (values, labels)
+    sort_arr = np.lexsort(order).astype(np.int64, copy=False)

     with nogil:
         for i in range(ngroups):
             # Figure out how many group elements there are
             grp_sz = counts[i]
-            for j in range(K):
-                non_na_sz = non_na_counts[i, j]
-                if non_na_sz == 0:
-                    out[i, j] = NaN
+            non_na_sz = non_na_counts[i]
+
+            if non_na_sz == 0:
+                out[i] = NaN
+            else:
+                # Calculate where to retrieve the desired value
+                # Casting to int will intentionally truncate result
+                idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1))
+
+                val = values[sort_arr[idx]]
+                # If requested quantile falls evenly on a particular index
+                # then write that index's value out. Otherwise interpolate
+                q_idx = q * (non_na_sz - 1)
+                frac = q_idx % 1
+
+                if frac == 0.0 or interp == INTERPOLATION_LOWER:
+                    out[i] = val
                 else:
-                    # Calculate where to retrieve the desired value
-                    # Casting to int will intentionally truncate result
-                    idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1))
-
-                    val = values[sort_arrs[idx, j], j]
-                    # If requested quantile falls evenly on a particular index
-                    # then write that index's value out. Otherwise interpolate
-                    q_idx = q * (non_na_sz - 1)
-                    frac = q_idx % 1
-
-                    if frac == 0.0 or interp == INTERPOLATION_LOWER:
-                        out[i, j] = val
-                    else:
-                        next_val = values[sort_arrs[idx + 1, j], j]
-                        if interp == INTERPOLATION_LINEAR:
-                            out[i, j] = val + (next_val - val) * frac
-                        elif interp == INTERPOLATION_HIGHER:
-                            out[i, j] = next_val
-                        elif interp == INTERPOLATION_MIDPOINT:
-                            out[i, j] = (val + next_val) / 2.0
-                        elif interp == INTERPOLATION_NEAREST:
-                            if frac > .5 or (frac == .5 and q > .5):  # Always OK?
-                                out[i, j] = next_val
-                            else:
-                                out[i, j] = val
+                    next_val = values[sort_arr[idx + 1]]
+                    if interp == INTERPOLATION_LINEAR:
+                        out[i] = val + (next_val - val) * frac
+                    elif interp == INTERPOLATION_HIGHER:
+                        out[i] = next_val
+                    elif interp == INTERPOLATION_MIDPOINT:
+                        out[i] = (val + next_val) / 2.0
+                    elif interp == INTERPOLATION_NEAREST:
+                        if frac > .5 or (frac == .5 and q > .5):  # Always OK?
+                            out[i] = next_val
+                        else:
+                            out[i] = val

             # Increment the index reference in sorted_arr for the next group
             grp_start += grp_sz
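For reference, this is how the interpolation branches behave for q = 0.5 on the sorted group [1, 3, 5, 7] (non_na_sz = 4): q_idx = 0.5 * 3 = 1.5, so idx = 1 after truncation, frac = 0.5, val = 3 and next_val = 5. The checks below compare the arithmetic against NumPy; the kernel's 'nearest' rule picks next_val only when frac > .5 (or frac == .5 and q > .5), so here it keeps 3.

    import numpy as np

    grp = np.array([1.0, 3.0, 5.0, 7.0])
    assert np.quantile(grp, 0.5, interpolation="linear") == 4.0    # 3 + (5 - 3) * 0.5
    assert np.quantile(grp, 0.5, interpolation="lower") == 3.0     # val
    assert np.quantile(grp, 0.5, interpolation="higher") == 5.0    # next_val
    assert np.quantile(grp, 0.5, interpolation="midpoint") == 4.0  # (val + next_val) / 2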
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 69b143febeea2..53bc9c954d9ec 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1721,7 +1721,11 @@ def _wrap_aggregated_output(
         DataFrame
         """
         indexed_output = {key.position: val for key, val in output.items()}
-        columns = Index(key.label for key in output)
+        if self.axis == 0:
+            name = self._obj_with_exclusions.columns.name
+        else:
+            name = self._obj_with_exclusions.index.name
+        columns = Index([key.label for key in output], name=name)

         result = self.obj._constructor(indexed_output)
         result.columns = columns
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 8af7a54772076..fa23df8a17d9b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1260,6 +1260,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
         return self._get_cythonized_result(
             "group_any_all",
             aggregate=True,
+            numeric_only=False,
             cython_dtype=np.dtype(np.uint8),
             needs_values=True,
             needs_mask=True,
@@ -1416,18 +1417,16 @@ def std(self, ddof: int = 1):
         Series or DataFrame
             Standard deviation of values within each group.
         """
-        result = self.var(ddof=ddof)
-        if result.ndim == 1:
-            result = np.sqrt(result)
-        else:
-            cols = result.columns.get_indexer_for(
-                result.columns.difference(self.exclusions).unique()
-            )
-            # TODO(GH-22046) - setting with iloc broken if labels are not unique
-            # .values to remove labels
-            result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values
-
-        return result
+        return self._get_cythonized_result(
+            "group_var_float64",
+            aggregate=True,
+            needs_counts=True,
+            needs_values=True,
+            needs_2d=True,
+            cython_dtype=np.dtype(np.float64),
+            post_processing=lambda vals, inference: np.sqrt(vals),
+            ddof=ddof,
+        )

     @Substitution(name="groupby")
     @Appender(_common_see_also)
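std is now routed through the same generic machinery as the other reductions: the group_var_float64 kernel does the aggregation, and the square root is applied afterwards as a post_processing hook over the kernel's output. A toy equivalent of that composition (grouped_std is a made-up name, and the real code calls the Cython kernel rather than looping in Python):

    import numpy as np

    def grouped_std(values, labels, ngroups, ddof=1):
        var = np.full(ngroups, np.nan)
        for g in range(ngroups):
            grp = values[labels == g]
            if len(grp) > ddof:
                var[g] = grp.var(ddof=ddof)
        # the hook from the diff: applied to kernel output, inference unused
        post_processing = lambda vals, inference: np.sqrt(vals)
        return post_processing(var, None)

    print(grouped_std(np.array([1.0, 2.0, 4.0, 8.0]), np.array([0, 0, 1, 1]), 2))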
""" - result = self.var(ddof=ddof) - if result.ndim == 1: - result = np.sqrt(result) - else: - cols = result.columns.get_indexer_for( - result.columns.difference(self.exclusions).unique() - ) - # TODO(GH-22046) - setting with iloc broken if labels are not unique - # .values to remove labels - result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values - - return result + return self._get_cythonized_result( + "group_var_float64", + aggregate=True, + needs_counts=True, + needs_values=True, + needs_2d=True, + cython_dtype=np.dtype(np.float64), + post_processing=lambda vals, inference: np.sqrt(vals), + ddof=ddof, + ) @Substitution(name="groupby") @Appender(_common_see_also) @@ -1756,6 +1755,7 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result( "group_fillna_indexer", + numeric_only=False, needs_mask=True, cython_dtype=np.dtype(np.int64), result_is_index=True, @@ -2039,9 +2039,6 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: inference = "datetime64[ns]" vals = np.asarray(vals).astype(np.float) - if vals.dtype != np.dtype(np.float64): - vals = vals.astype(np.float64) - return vals, inference def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: @@ -2059,6 +2056,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return self._get_cythonized_result( "group_quantile", aggregate=True, + numeric_only=False, needs_values=True, needs_mask=True, cython_dtype=np.dtype(np.float64), @@ -2348,7 +2346,11 @@ def _get_cythonized_result( how: str, cython_dtype: np.dtype, aggregate: bool = False, + numeric_only: bool = True, + needs_counts: bool = False, needs_values: bool = False, + needs_2d: bool = False, + min_count: Optional[int] = None, needs_mask: bool = False, needs_ngroups: bool = False, result_is_index: bool = False, @@ -2367,9 +2369,18 @@ def _get_cythonized_result( aggregate : bool, default False Whether the result should be aggregated to match the number of groups + numeric_only : bool, default True + Whether only numeric datatypes should be computed + needs_counts : bool, default False + Whether the counts should be a part of the Cython call needs_values : bool, default False Whether the values should be a part of the Cython call signature + needs_2d : bool, default False + Whether the values and result of the Cython call signature + are 2-dimensional. 
From 5e21c7294991b7832f3478b585d16c3fa6d32be2 Mon Sep 17 00:00:00 2001
From: Richard
Date: Tue, 9 Jun 2020 21:37:41 -0400
Subject: [PATCH 3/6] Simplified name logic in _wrap_aggregated_output

---
 pandas/core/groupby/generic.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 961b8a4cd863f..38cc61979fb10 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1731,10 +1731,7 @@ def _wrap_aggregated_output(
         DataFrame
         """
         indexed_output = {key.position: val for key, val in output.items()}
-        if self.axis == 0:
-            name = self._obj_with_exclusions.columns.name
-        else:
-            name = self._obj_with_exclusions.index.name
+        name = self._obj_with_exclusions._get_axis(1 - self.axis).name
         columns = Index([key.label for key in output], name=name)

         result = self.obj._constructor(indexed_output)
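The one-liner works because DataFrame._get_axis(0) is the index and _get_axis(1) is the columns, so 1 - self.axis always selects the axis whose name survives the aggregation. A quick illustration, using the private _get_axis helper exactly as the diff does:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df.columns.name = "cols"

    assert df._get_axis(1 - 0) is df.columns  # groupby(axis=0): columns survive
    assert df._get_axis(1 - 1) is df.index    # groupby(axis=1): index survives
    assert df._get_axis(1).name == "cols"     # the name carried to the result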
From 4d62493af238357ac0c450874762a28c5f05fe0e Mon Sep 17 00:00:00 2001
From: Richard
Date: Sun, 14 Jun 2020 12:32:01 -0400
Subject: [PATCH 4/6] Renamed needs_2d -> needs_at_least2d

---
 pandas/core/groupby/groupby.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4672fb92c996a..2d02ee98ba428 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1439,7 +1439,7 @@ def std(self, ddof: int = 1):
             aggregate=True,
             needs_counts=True,
             needs_values=True,
-            needs_2d=True,
+            needs_at_least2d=True,
             cython_dtype=np.dtype(np.float64),
             post_processing=lambda vals, inference: np.sqrt(vals),
             ddof=ddof,
@@ -2366,7 +2366,7 @@ def _get_cythonized_result(
         numeric_only: bool = True,
         needs_counts: bool = False,
         needs_values: bool = False,
-        needs_2d: bool = False,
+        needs_at_least2d: bool = False,
         min_count: Optional[int] = None,
         needs_mask: bool = False,
         needs_ngroups: bool = False,
@@ -2393,9 +2393,9 @@ def _get_cythonized_result(
         needs_values : bool, default False
             Whether the values should be a part of the Cython call
             signature
-        needs_2d : bool, default False
+        needs_at_least2d : bool, default False
             Whether the values and result of the Cython call signature
-            are 2-dimensional.
+            are at least 2-dimensional.
         min_count : int, default None
             When not None, min_count for the Cython call
@@ -2455,7 +2455,7 @@ def _get_cythonized_result(
             else:
                 result_sz = len(values)

-            if needs_2d:
+            if needs_at_least2d:
                 result = np.zeros((result_sz, 1), dtype=cython_dtype)
             else:
                 result = np.zeros(result_sz, dtype=cython_dtype)
@@ -2471,7 +2471,7 @@ def _get_cythonized_result(
             vals = values
             if pre_processing:
                 vals, inferences = pre_processing(vals)
-            if needs_2d:
+            if needs_at_least2d:
                 vals = vals.reshape((-1, 1))
             vals = vals.astype(cython_dtype, copy=False)
             func = partial(func, vals)
@@ -2490,7 +2490,7 @@ def _get_cythonized_result(

             func(**kwargs)  # Call func to modify indexer values in place

-            if needs_2d:
+            if needs_at_least2d:
                 result = result.reshape(-1)

             if result_is_index:

From f1c868ff67d3ed5b4db5108e49b31812face891c Mon Sep 17 00:00:00 2001
From: Richard
Date: Mon, 15 Jun 2020 16:14:07 -0400
Subject: [PATCH 5/6] Revert renaming of needs_2d -> needs_at_least2d

---
 pandas/core/groupby/groupby.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 2d02ee98ba428..89982717a7df2 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1439,7 +1439,7 @@ def std(self, ddof: int = 1):
             aggregate=True,
             needs_counts=True,
             needs_values=True,
-            needs_at_least2d=True,
+            needs_2d=True,
             cython_dtype=np.dtype(np.float64),
             post_processing=lambda vals, inference: np.sqrt(vals),
             ddof=ddof,
@@ -2366,7 +2366,7 @@ def _get_cythonized_result(
         numeric_only: bool = True,
         needs_counts: bool = False,
         needs_values: bool = False,
-        needs_at_least2d: bool = False,
+        needs_2d: bool = False,
         min_count: Optional[int] = None,
         needs_mask: bool = False,
         needs_ngroups: bool = False,
@@ -2393,7 +2393,7 @@ def _get_cythonized_result(
         needs_values : bool, default False
             Whether the values should be a part of the Cython call
             signature
-        needs_at_least2d : bool, default False
+        needs_2d : bool, default False
             Whether the values and result of the Cython call signature
             are at least 2-dimensional.
         min_count : int, default None
@@ -2455,7 +2455,7 @@ def _get_cythonized_result(
             else:
                 result_sz = len(values)

-            if needs_at_least2d:
+            if needs_2d:
                 result = np.zeros((result_sz, 1), dtype=cython_dtype)
             else:
                 result = np.zeros(result_sz, dtype=cython_dtype)
@@ -2471,7 +2471,7 @@ def _get_cythonized_result(
             vals = values
             if pre_processing:
                 vals, inferences = pre_processing(vals)
-            if needs_at_least2d:
+            if needs_2d:
                 vals = vals.reshape((-1, 1))
             vals = vals.astype(cython_dtype, copy=False)
             func = partial(func, vals)
@@ -2490,7 +2490,7 @@ def _get_cythonized_result(

             func(**kwargs)  # Call func to modify indexer values in place

-            if needs_at_least2d:
+            if needs_2d:
                 result = result.reshape(-1)

             if result_is_index:

From 33bf96a4b11b364cbe987ef4fd10a6e7b215ab69 Mon Sep 17 00:00:00 2001
From: Richard
Date: Thu, 18 Jun 2020 16:10:36 -0400
Subject: [PATCH 6/6] Requested change

---
 pandas/core/groupby/groupby.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 60ac19c303e7f..b92e75f16e965 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2460,10 +2460,9 @@ def _get_cythonized_result(
             else:
                 result_sz = len(values)

+            result = np.zeros(result_sz, dtype=cython_dtype)
             if needs_2d:
-                result = np.zeros((result_sz, 1), dtype=cython_dtype)
-            else:
-                result = np.zeros(result_sz, dtype=cython_dtype)
+                result = result.reshape((-1, 1))
             func = partial(base_func, result)

             inferences = None
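The final tweak is behavior-preserving because reshape on a freshly allocated contiguous array returns a view, not a copy: whether the buffer is created 2-D or reshaped to 2-D, the kernel writes into the same memory. A quick check:

    import numpy as np

    result = np.zeros(4, dtype=np.float64)
    result2d = result.reshape((-1, 1))  # a view on the same buffer
    assert result2d.base is result

    result2d[2, 0] = 7.0     # what a 2-D kernel would write...
    assert result[2] == 7.0  # ...shows through the 1-D array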