diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index cb2a59860783f..4f57597432e2c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -230,7 +230,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) -- +- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) - Period diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e5e61e409c320..3655d6efad66e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1280,6 +1280,8 @@ cdef class TextReader: # generate extra (bogus) headers if there are more columns than headers if j >= len(self.header[0]): return j + elif self.has_mi_columns: + return tuple(header_row[j] for header_row in self.header) else: return self.header[0][j] else: diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index bc20f1d1eea5f..6ed52ed86af2a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -257,3 +257,29 @@ def test_dtype_mangle_dup_cols_single_dtype(all_parsers): result = parser.read_csv(StringIO(data), dtype=str) expected = DataFrame({"a": ["1"], "a.1": ["1"]}) tm.assert_frame_equal(result, expected) + + +def test_dtype_multi_index(all_parsers): + # GH 42446 + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,3" + + result = parser.read_csv( + StringIO(data), + header=list(range(2)), + dtype={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + + expected = DataFrame( + { + ("A", "X"): np.int32([1]), + ("B", "Y"): np.int32([2]), + ("B", "Z"): np.float32([3]), + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index ffa6c8259a59e..78b64baab4dc0 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -161,3 +161,29 @@ def test_converter_index_col_bug(all_parsers): xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) tm.assert_frame_equal(rs, xp) + + +def test_converter_multi_index(all_parsers): + # GH 42446 + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,3" + + result = parser.read_csv( + StringIO(data), + header=list(range(2)), + converters={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + + expected = DataFrame( + { + ("A", "X"): np.int32([1]), + ("B", "Y"): np.int32([2]), + ("B", "Z"): np.float32([3]), + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 6e445f6813310..2880bf8690b46 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -570,3 +570,23 @@ def test_str_nan_dropped(all_parsers): ) tm.assert_frame_equal(result, expected) + + +def test_nan_multi_index(all_parsers): + # GH 42446 + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,inf" + + result = parser.read_csv( + StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} + ) + + expected = DataFrame( + { + ("A", "X"): [1], + ("B", "Y"): [2], + ("B", "Z"): [np.nan], + } + ) + + tm.assert_frame_equal(result, expected)