BUG: add pyarrow autogenerated prefix (#55115)

hedeershowk · martin-sicho · mroeschke · web-flow · commit 824a2738dc0b · 2023-09-27T09:50:45.000-07:00
* add pyarrow autogenerated prefix * whats new bug fix * test with no header and pyarrow * only test pyarrow * BUG: This fixes #55009 (`raw=True` caused `apply` method of `DataFrame` to ignore passed arguments) (#55089) * fixes #55009 * update documentation * write documentation * add test * change formatting * cite DataFrame directly in docs Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * PR review feedback * Update doc/source/whatsnew/v2.2.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * alphabetical whatsnew --------- Co-authored-by: Martin Šícho <sichom@vscht.cz> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -314,6 +314,7 @@ MultiIndex
 I/O
 ^^^
 - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
+- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` did not work when reading a CSV file without a header row (:issue:`54459`)
 - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
 - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`)
 
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -130,6 +130,12 @@ def handle_warning(invalid_row):
             )
         }
         self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
+        # autogenerated column names are prefixed with 'f' in pyarrow.csv
+        if self.header is None and "include_columns" in self.convert_options:
+            self.convert_options["include_columns"] = [
+                f"f{n}" for n in self.convert_options["include_columns"]
+            ]
+
         self.read_options = {
             "autogenerate_column_names": self.header is None,
             "skip_rows": self.header
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
@@ -684,3 +684,21 @@ def test_header_delim_whitespace(all_parsers):
     result = parser.read_csv(StringIO(data), delim_whitespace=True)
     expected = DataFrame({"a,b": ["1,2", "3,4"]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_no_header_pyarrow(pyarrow_parser_only):
+    parser = pyarrow_parser_only
+    data = """
+a,i,x
+b,j,y
+"""
+    result = parser.read_csv(
+        StringIO(data),
+        header=None,
+        usecols=[0, 1],
+        dtype="string[pyarrow]",
+        dtype_backend="pyarrow",
+        engine="pyarrow",
+    )
+    expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
+    tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -130,6 +130,12 @@ def handle_warning(invalid_row):`
`130`	`130`	`)`
`131`	`131`	`}`
`132`	`132`	`self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]`
	`133`	`+ # autogenerated column names are prefixed with 'f' in pyarrow.csv`
	`134`	`+ if self.header is None and "include_columns" in self.convert_options:`
	`135`	`+ self.convert_options["include_columns"] = [`
	`136`	`+ f"f{n}" for n in self.convert_options["include_columns"]`
	`137`	`+ ]`
	`138`	`+`
`133`	`139`	`self.read_options = {`
`134`	`140`	`"autogenerate_column_names": self.header is None,`
`135`	`141`	`"skip_rows": self.header`