diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 4e792da31e1d5..7c57e6ee9dbfd 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -378,8 +378,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_any_all(uint8_t[:] out,
-                  const int64_t[:] labels,
                   const uint8_t[:] values,
+                  const int64_t[:] labels,
                   const uint8_t[:] mask,
                   object val_test,
                   bint skipna):
@@ -560,7 +560,8 @@ def _group_var(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
-               Py_ssize_t min_count=-1):
+               Py_ssize_t min_count=-1,
+               int64_t ddof=1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         floating val, ct, oldmean
@@ -600,10 +601,10 @@ def _group_var(floating[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 ct = nobs[i, j]
-                if ct < 2:
+                if ct <= ddof:
                     out[i, j] = NAN
                 else:
-                    out[i, j] /= (ct - 1)
+                    out[i, j] /= (ct - ddof)
 
 
 group_var_float32 = _group_var['float']
@@ -715,8 +716,8 @@ group_ohlc_float64 = _group_ohlc['double']
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_quantile(ndarray[float64_t] out,
-                   ndarray[int64_t] labels,
                    numeric[:] values,
+                   ndarray[int64_t] labels,
                    ndarray[uint8_t] mask,
                    float64_t q,
                    object interpolation):
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index db5df9818b0b0..cec3d9711a8ca 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1736,7 +1736,8 @@ def _wrap_aggregated_output(
         DataFrame
         """
         indexed_output = {key.position: val for key, val in output.items()}
-        columns = Index(key.label for key in output)
+        name = self._obj_with_exclusions._get_axis(1 - self.axis).name
+        columns = Index([key.label for key in output], name=name)
 
         result = self.obj._constructor(indexed_output)
         result.columns = columns
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 48fdb14ebe90c..b92e75f16e965 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1277,6 +1277,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
         return self._get_cythonized_result(
             "group_any_all",
             aggregate=True,
+            numeric_only=False,
             cython_dtype=np.dtype(np.uint8),
             needs_values=True,
             needs_mask=True,
@@ -1433,18 +1434,16 @@ def std(self, ddof: int = 1):
         Series or DataFrame
             Standard deviation of values within each group.
         """
-        result = self.var(ddof=ddof)
-        if result.ndim == 1:
-            result = np.sqrt(result)
-        else:
-            cols = result.columns.get_indexer_for(
-                result.columns.difference(self.exclusions).unique()
-            )
-            # TODO(GH-22046) - setting with iloc broken if labels are not unique
-            # .values to remove labels
-            result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values
-
-        return result
+        return self._get_cythonized_result(
+            "group_var_float64",
+            aggregate=True,
+            needs_counts=True,
+            needs_values=True,
+            needs_2d=True,
+            cython_dtype=np.dtype(np.float64),
+            post_processing=lambda vals, inference: np.sqrt(vals),
+            ddof=ddof,
+        )
 
     @Substitution(name="groupby")
     @Appender(_common_see_also)
@@ -1778,6 +1777,7 @@ def _fill(self, direction, limit=None):
 
         return self._get_cythonized_result(
             "group_fillna_indexer",
+            numeric_only=False,
             needs_mask=True,
             cython_dtype=np.dtype(np.int64),
             result_is_index=True,
@@ -2078,6 +2078,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
             return self._get_cythonized_result(
                 "group_quantile",
                 aggregate=True,
+                numeric_only=False,
                 needs_values=True,
                 needs_mask=True,
                 cython_dtype=np.dtype(np.float64),
@@ -2367,7 +2368,11 @@ def _get_cythonized_result(
         how: str,
         cython_dtype: np.dtype,
         aggregate: bool = False,
+        numeric_only: bool = True,
+        needs_counts: bool = False,
         needs_values: bool = False,
+        needs_2d: bool = False,
+        min_count: Optional[int] = None,
         needs_mask: bool = False,
         needs_ngroups: bool = False,
         result_is_index: bool = False,
@@ -2386,9 +2391,18 @@ def _get_cythonized_result(
         aggregate : bool, default False
             Whether the result should be aggregated to match the number of
             groups
+        numeric_only : bool, default True
+            Whether only numeric datatypes should be computed
+        needs_counts : bool, default False
+            Whether the counts should be a part of the Cython call
         needs_values : bool, default False
             Whether the values should be a part of the Cython call
             signature
+        needs_2d : bool, default False
+            Whether the values and result of the Cython call signature
+            are at least 2-dimensional.
+        min_count : int, default None
+            When not None, min_count for the Cython call
         needs_mask : bool, default False
             Whether boolean mask needs to be part of the Cython call
             signature
@@ -2418,7 +2432,7 @@ def _get_cythonized_result(
         if result_is_index and aggregate:
             raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
         if post_processing:
-            if not callable(pre_processing):
+            if not callable(post_processing):
                 raise ValueError("'post_processing' must be a callable!")
         if pre_processing:
             if not callable(pre_processing):
@@ -2438,21 +2452,39 @@ def _get_cythonized_result(
             name = obj.name
             values = obj._values
 
+            if numeric_only and not is_numeric_dtype(values):
+                continue
+
             if aggregate:
                 result_sz = ngroups
             else:
                 result_sz = len(values)
 
             result = np.zeros(result_sz, dtype=cython_dtype)
-            func = partial(base_func, result, labels)
+            if needs_2d:
+                result = result.reshape((-1, 1))
+            func = partial(base_func, result)
+
             inferences = None
 
+            if needs_counts:
+                counts = np.zeros(self.ngroups, dtype=np.int64)
+                func = partial(func, counts)
+
             if needs_values:
                 vals = values
                 if pre_processing:
                     vals, inferences = pre_processing(vals)
+                if needs_2d:
+                    vals = vals.reshape((-1, 1))
+                vals = vals.astype(cython_dtype, copy=False)
                 func = partial(func, vals)
 
+            func = partial(func, labels)
+
+            if min_count is not None:
+                func = partial(func, min_count)
+
             if needs_mask:
                 mask = isna(values).view(np.uint8)
                 func = partial(func, mask)
@@ -2462,6 +2494,9 @@ def _get_cythonized_result(
 
             func(**kwargs)  # Call func to modify indexer values in place
 
+            if needs_2d:
+                result = result.reshape(-1)
+
             if result_is_index:
                 result = algorithms.take_nd(values, result)
 
@@ -2512,6 +2547,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
 
         return self._get_cythonized_result(
             "group_shift_indexer",
+            numeric_only=False,
             cython_dtype=np.dtype(np.int64),
             needs_ngroups=True,
             result_is_index=True,