diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py
index 5f1f85d8179cd..6c4b913574d9e 100644
--- a/pandas/tests/groupby/methods/test_describe.py
+++ b/pandas/tests/groupby/methods/test_describe.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -73,7 +71,6 @@ def test_series_describe_as_index(as_index, keys):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_frame_describe_multikey(tsframe):
     grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
     result = grouped.describe()
@@ -82,7 +79,7 @@ def test_frame_describe_multikey(tsframe):
         group = grouped[col].describe()
         # GH 17464 - Remove duplicate MultiIndex levels
         group_col = MultiIndex(
-            levels=[[col], group.columns],
+            levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
             codes=[[0] * len(group.columns), range(len(group.columns))],
         )
         group = DataFrame(group.values, columns=group_col, index=group.index)
@@ -249,7 +246,6 @@ def test_describe_non_cython_paths():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("dtype", [int, float, object])
 @pytest.mark.parametrize(
     "kwargs",
@@ -271,5 +267,5 @@ def test_groupby_empty_dataset(dtype, kwargs):
 
     result = df.iloc[:0].groupby("A").B.describe(**kwargs)
     expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
-    expected.index = Index([])
+    expected.index = Index([], dtype=df.columns.dtype)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py
index d20b30834dea2..d9c8706ec9202 100644
--- a/pandas/tests/groupby/methods/test_nth.py
+++ b/pandas/tests/groupby/methods/test_nth.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -679,14 +677,14 @@ def test_first_multi_key_groupby_categorical():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("method", ["first", "last", "nth"])
 def test_groupby_last_first_nth_with_none(method, nulls_fixture):
     # GH29645
-    expected = Series(["y"])
+    expected = Series(["y"], dtype=object)
     data = Series(
         [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
         index=[0, 0, 0, 0, 0],
+        dtype=object,
     ).groupby(level=0)
 
     if method == "nth":
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index bb54cbd69bd42..d42aa06d6bbfe 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -3,7 +3,6 @@
 
 from pandas._config import using_string_dtype
 
-from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under10p1
 
 from pandas.core.dtypes.missing import na_value_for_dtype
@@ -13,9 +12,6 @@
 from pandas.tests.groupby import get_groupby_method_args
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize(
     "dropna, tuples, outputs",
     [
@@ -59,9 +55,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
     tm.assert_frame_equal(grouped, expected)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize(
     "dropna, tuples, outputs",
     [
@@ -138,9 +131,6 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
     tm.assert_frame_equal(grouped, expected)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize(
     "dropna, idx, expected",
     [
@@ -215,9 +205,6 @@ def test_groupby_dataframe_slice_then_transform(dropna, index):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize(
     "dropna, tuples, outputs",
     [
@@ -299,9 +286,6 @@ def test_groupby_dropna_datetime_like_data(
     tm.assert_frame_equal(grouped, expected)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize(
     "dropna, data, selected_data, levels",
     [
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index a65dda1570944..053dda0629571 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -6,6 +6,7 @@
 from pandas._config import using_string_dtype
 
 from pandas._libs import lib
+from pandas.compat import HAS_PYARROW
 
 from pandas.core.dtypes.common import ensure_platform_int
 
@@ -372,8 +373,7 @@ def test_transform_select_columns(df):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_transform_nuisance_raises(df):
+def test_transform_nuisance_raises(df, using_infer_string):
     # case that goes through _transform_item_by_item
 
     df.columns = ["A", "B", "B", "D"]
@@ -383,10 +383,16 @@ def test_transform_nuisance_raises(df):
     grouped = df.groupby("A")
 
     gbc = grouped["B"]
-    with pytest.raises(TypeError, match="Could not convert"):
+    msg = "Could not convert"
+    if using_infer_string:
+        if df.columns.dtype.storage == "pyarrow":
+            msg = "with dtype str does not support operation 'mean'"
+        else:
+            msg = "Cannot perform reduction 'mean' with string dtype"
+    with pytest.raises(TypeError, match=msg):
         gbc.transform(lambda x: np.mean(x))
 
-    with pytest.raises(TypeError, match="Could not convert"):
+    with pytest.raises(TypeError, match=msg):
         df.groupby("A").transform(lambda x: np.mean(x))
 
 
@@ -445,8 +451,7 @@ def test_transform_coercion():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_groupby_transform_with_int():
+def test_groupby_transform_with_int(using_infer_string):
     # GH 3740, make sure that we might upcast on item-by-item transform
 
     # floats
@@ -476,8 +481,14 @@ def test_groupby_transform_with_int():
             "D": "foo",
         }
     )
+    msg = "Could not convert"
+    if using_infer_string:
+        if HAS_PYARROW:
+            msg = "with dtype str does not support operation 'mean'"
+        else:
+            msg = "Cannot perform reduction 'mean' with string dtype"
     with np.errstate(all="ignore"):
-        with pytest.raises(TypeError, match="Could not convert"):
+        with pytest.raises(TypeError, match=msg):
             df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
         result = df.groupby("A")[["B", "C"]].transform(
             lambda x: (x - x.mean()) / x.std()
@@ -489,7 +500,7 @@ def test_groupby_transform_with_int():
     s = Series([2, 3, 4, 10, 5, -1])
     df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"})
     with np.errstate(all="ignore"):
-        with pytest.raises(TypeError, match="Could not convert"):
+        with pytest.raises(TypeError, match=msg):
             df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
         result = df.groupby("A")[["B", "C"]].transform(
             lambda x: (x - x.mean()) / x.std()
@@ -705,7 +716,6 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.slow
 @pytest.mark.parametrize(
     "op, args, targop",
@@ -757,6 +767,7 @@ def test_cython_transform_frame_column(
                 "does not support operation",
                 ".* is not supported for object dtype",
                 "is not implemented for this dtype",
+                ".* is not supported for str dtype",
             ]
         )
         with pytest.raises(TypeError, match=msg):