pandas-dev · datapythonista · May 31, 2025 · Jun 12, 2025 · Jun 13, 2025 · Jun 13, 2025
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -32,6 +32,7 @@ Other enhancements
 - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
 - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
 - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
+- Added new :meth:`DataFrame.select` method to select a subset of columns from the :class:`DataFrame` (:issue:`61522`)
 - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4535,6 +4535,127 @@ def _get_item(self, item: Hashable) -> Series:
     # ----------------------------------------------------------------------
     # Unsorted
 
+    def select(self, *args):
+        """
+        Select a subset of columns from the DataFrame.
+
+        Select can be used to return a DataFrame with some specific columns.
+        This can be used to select a subset of the columns, as well as to return a
+        DataFrame with the columns sorted in a specific order.
+
+        Parameters
+        ----------
+        *args : hashable or a single list arg of hashable
+            The names of the columns to return. In general this will be strings,
+            but pandas supports other types of column names, if they are hashable.
+            If the only argument is a list, its elements are used as the column
+            names. A list combined with other arguments raises ``ValueError``.
+
+        Returns
+        -------
+        DataFrame
+            The DataFrame with the selected columns.
+
+        See Also
+        --------
+        DataFrame.filter : Subset rows or columns according to the specified labels.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "first_name": ["John", "Alice", "Bob"],
+        ...         "last_name": ["Smith", "Cooper", "Marley"],
+        ...         "age": [61, 22, 35],
+        ...     }
+        ... )
+
+        Select a subset of columns:
+
+        >>> df.select("first_name", "age")
+          first_name  age
+        0       John   61
+        1      Alice   22
+        2        Bob   35
+
+        A list can also be used to specify the names of the columns to return:
+
+        >>> df.select(["last_name", "age"])
+          last_name  age
+        0     Smith   61
+        1    Cooper   22
+        2    Marley   35
+
+        Selecting with a pattern can be done with Python expressions:
+
+        >>> df.select([col for col in df.columns if col.endswith("_name")])
+          first_name last_name
+        0       John     Smith
+        1      Alice    Cooper
+        2        Bob    Marley
+
+        All columns can be selected, but in a different order:
+
+        >>> df.select("last_name", "first_name", "age")
+          last_name first_name  age
+        0     Smith       John   61
+        1    Cooper      Alice   22
+        2    Marley        Bob   35
+
+        Note that a DataFrame is always returned. If a single column is requested, a
+        DataFrame with a single column is returned, not a Series:
+
+        >>> df.select("age")
+           age
+        0   61
+        1   22
+        2   35
+
+        The ``select`` method also works when columns are a ``MultiIndex``:
+
+        >>> df = pd.DataFrame(
+        ...     [("John", "Smith", 61), ("Alice", "Cooper", 22), ("Bob", "Marley", 35)],
+        ...     columns=pd.MultiIndex.from_tuples(
+        ...         [("names", "first_name"), ("names", "last_name"), ("other", "age")]
+        ...     ),
+        ... )
+
+        If column names are provided, they will select from the first level of
+        the ``MultiIndex``:
+
+        >>> df.select("names")
+              names
+          first_name last_name
+        0       John     Smith
+        1      Alice    Cooper
+        2        Bob    Marley
+
+        To select from multiple or all levels, tuples can be used:
+
+        >>> df.select(("names", "last_name"), ("other", "age"))
+              names other
+          last_name   age
+        0     Smith    61
+        1    Cooper    22
+        2    Marley    35
+        """
+        if len(args) == 1 and isinstance(args[0], list):
+            columns = args[0]
+        elif any(isinstance(arg, list) for arg in args):
+            # A list among several arguments is ambiguous wherever it appears.
+            raise ValueError(
+                "`DataFrame.select` supports individual columns "
+                "`df.select('col1', 'col2',...)` or a list "
+                "`df.select(['col1', 'col2',...])`, but not both. "
+                "You can unpack the list if you have a mix: "
+                "`df.select(*['col1', 'col2'], 'col3')`."
+            )
+        else:
+            columns = list(args)
+
+        indexer = self.columns._get_indexer_strict(columns, "columns")[1]
+        return self.take(indexer, axis=1)
+
     @overload
     def query(
         self,

diff --git a/pandas/tests/frame/methods/test_select.py b/pandas/tests/frame/methods/test_select.py
@@ -0,0 +1,98 @@
+import pytest
+
+import pandas as pd
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+@pytest.fixture
+def regular_df():
+    return DataFrame([[1, 3, 5, 7], [2, 4, 6, 8]], columns=["a", "b", "c", "d"])
+
+
+@pytest.fixture
+def multiindex_df():
+    """Two-row frame with two-level columns (A, c), (A, d) and (B, e)."""
+    labels = [("A", "c"), ("A", "d"), ("B", "e")]
+    columns = pd.MultiIndex.from_tuples(labels)
+    return DataFrame([(0, 2, 4), (1, 3, 5)], columns=columns)
+
+
+class TestSelect:
+    """Checks for ``DataFrame.select`` with flat and ``MultiIndex`` columns."""
+
+    def test_select_subset_cols(self, regular_df):
+        """Positional labels keep only the named columns."""
+        expected = DataFrame({"a": [1, 2], "c": [5, 6]})
+        tm.assert_frame_equal(regular_df.select("a", "c"), expected)
+
+    def test_single_value(self, regular_df):
+        """One label still produces a DataFrame, not a Series."""
+        result = regular_df.select("a")
+        assert isinstance(result, DataFrame)
+        tm.assert_frame_equal(result, DataFrame({"a": [1, 2]}))
+
+    def test_select_change_order(self, regular_df):
+        """Columns come back in the order they were requested."""
+        expected = DataFrame({"b": [3, 4], "d": [7, 8], "a": [1, 2], "c": [5, 6]})
+        tm.assert_frame_equal(regular_df.select("b", "d", "a", "c"), expected)
+
+    def test_select_none(self, regular_df):
+        """Calling with no labels gives an empty result."""
+        assert regular_df.select().empty
+
+    def test_select_duplicated(self, regular_df):
+        """A label given twice appears twice in the result."""
+        result = regular_df.select("a", "d", "a")
+        assert result.columns.tolist() == ["a", "d", "a"]
+
+    def test_select_single_list(self, regular_df):
+        """A single list argument is treated as the column labels."""
+        expected = DataFrame({"a": [1, 2], "c": [5, 6]})
+        tm.assert_frame_equal(regular_df.select(["a", "c"]), expected)
+
+    def test_select_list_and_string(self, regular_df):
+        """A leading list followed by another label is rejected."""
+        with pytest.raises(ValueError, match="supports individual columns"):
+            regular_df.select(["a", "c"], "b")
+
+    def test_select_missing(self, regular_df):
+        """An unknown label raises ``KeyError``."""
+        with pytest.raises(KeyError, match=r"None of .* are in the \[columns\]"):
+            regular_df.select("z")
+
+    def test_select_not_hashable(self, regular_df):
+        """An unhashable label such as a set raises ``TypeError``."""
+        with pytest.raises(TypeError, match="unhashable type"):
+            regular_df.select(set())
+
+    def test_select_multiindex_one_level(self, multiindex_df):
+        """A bare label picks every column under that first-level key."""
+        columns = pd.MultiIndex.from_tuples([("A", "c"), ("A", "d")])
+        expected = DataFrame([(0, 2), (1, 3)], columns=columns)
+        tm.assert_frame_equal(multiindex_df.select("A"), expected)
+
+    def test_select_multiindex_single_column(self, multiindex_df):
+        """A full tuple key picks one column and still yields a DataFrame."""
+        columns = pd.MultiIndex.from_tuples([("A", "d")])
+        result = multiindex_df.select(("A", "d"))
+        assert isinstance(result, DataFrame)
+        tm.assert_frame_equal(result, DataFrame([(2,), (3,)], columns=columns))
+
+    def test_select_multiindex_multiple_columns(self, multiindex_df):
+        """Several tuple keys pick columns across first-level groups."""
+        columns = pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")])
+        expected = DataFrame([(0, 4), (1, 5)], columns=columns)
+        tm.assert_frame_equal(multiindex_df.select(("A", "c"), ("B", "e")), expected)
+
+    def test_select_multiindex_multiple_columns_as_list(self, multiindex_df):
+        """Tuple keys may also be supplied as a single list."""
+        columns = pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")])
+        expected = DataFrame([(0, 4), (1, 5)], columns=columns)
+        result = multiindex_df.select([("A", "c"), ("B", "e")])
+        tm.assert_frame_equal(result, expected)
+
+    def test_select_multiindex_missing(self, multiindex_df):
+        """An unknown first-level label raises ``KeyError``."""
+        with pytest.raises(KeyError, match="not in index"):
+            multiindex_df.select("Z")