diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index aeee77bb02e98..9dde669c9d39d 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -419,15 +419,6 @@ def test_tokenize_CR_with_quoting(self): expected = self.read_csv(StringIO(data.replace('\r', '\n'))) tm.assert_frame_equal(result, expected) - def test_raise_on_no_columns(self): - # single newline - data = "\n" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) - - # test with more than a single newline - data = "\n\n\n" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) - def test_grow_boundary_at_cap(self): # See gh-12494 # diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 14f4de853e118..2e3c102948cfa 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1323,3 +1323,12 @@ def test_inf_parsing(self): # TODO: remove condition when 'na_filter' is supported for Python df = self.read_csv(StringIO(data), index_col=0, na_filter=False) tm.assert_almost_equal(df['A'].values, expected.values) + + def test_raise_on_no_columns(self): + # single newline + data = "\n" + self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) + + # test with more than a single newline + data = "\n\n\n" + self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index c34549835cb46..5916d8d347c8b 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -250,117 +250,3 @@ def test_na_values_keep_default(self): 'Three': ['None', 'two', 'None', 'nan', 'five', '', 'seven']}) tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - def test_skiprow_with_newline(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line 11 -line 12",2 -2,"line 21 -line 22",2 -3,"line 31",1""" - expected = [[2, 'line 21\nline 22', 2], - [3, 'line 31', 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = ('a,b,c\n~a\n b~,~e\n d~,' - '~f\n f~\n1,2,~12\n 13\n 14~') - expected = [['a\n b', 'e\n d', 'f\n f']] - expected = DataFrame(expected, columns=[ - 'a', 'b', 'c']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[2]) - tm.assert_frame_equal(df, expected) - - data = ('Text,url\n~example\n ' - 'sentence\n one~,url1\n~' - 'example\n sentence\n two~,url2\n~' - 'example\n sentence\n three~,url3') - expected = [['example\n sentence\n two', 'url2']] - expected = DataFrame(expected, columns=[ - 'Text', 'url']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[1, 3]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - expected = [[2, "line '21' line 22", 2], - [3, "line '31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_newline_and_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""" - expected = [[2, "line \n'21' line 22", 2], - [3, "line \n'31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""" - expected = [[2, "line '21\n' line 22", 2], - [3, "line '31\n' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""" - expected = [[2, "line '21\n' \r\tline 22", 2], - [3, "line '31\n' \r\tline 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprows_lineterminator(self): - # see gh-9079 - data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', - '2007/01/01 01:00 0.2140 U M ', - '2007/01/01 02:00 0.2141 M O ', - '2007/01/01 04:00 0.2142 D M ']) - expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], - ['2007/01/01', '02:00', 0.2141, 'M', 'O'], - ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', - 'oflag']) - - # test with default line terminators "LF" and "CRLF" - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - # "CR" is not respected with the Python parser yet - if self.engine == 'c': - df = self.read_csv(StringIO(data.replace('\n', '\r')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index 3e585a9a623c9..c9f50dec6c01e 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -76,3 +76,117 @@ def test_skiprows_blank(self): datetime(2000, 1, 3)]) expected.index.name = 0 tm.assert_frame_equal(data, expected) + + def test_skiprow_with_newline(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line 11 +line 12",2 +2,"line 21 +line 22",2 +3,"line 31",1""" + expected = [[2, 'line 21\nline 22', 2], + [3, 'line 31', 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = ('a,b,c\n~a\n b~,~e\n d~,' + '~f\n f~\n1,2,~12\n 13\n 14~') + expected = [['a\n b', 'e\n d', 'f\n f']] + expected = DataFrame(expected, columns=[ + 'a', 'b', 'c']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[2]) + tm.assert_frame_equal(df, expected) + + data = ('Text,url\n~example\n ' + 'sentence\n one~,url1\n~' + 'example\n sentence\n two~,url2\n~' + 'example\n sentence\n three~,url3') + expected = [['example\n sentence\n two', 'url2']] + expected = DataFrame(expected, columns=[ + 'Text', 'url']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[1, 3]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + expected = [[2, "line '21' line 22", 2], + [3, "line '31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_newline_and_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""" + expected = [[2, "line \n'21' line 22", 2], + [3, "line \n'31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""" + expected = [[2, "line '21\n' line 22", 2], + [3, "line '31\n' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""" + expected = [[2, "line '21\n' \r\tline 22", 2], + [3, "line '31\n' \r\tline 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprows_lineterminator(self): + # see gh-9079 + data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', + '2007/01/01 01:00 0.2140 U M ', + '2007/01/01 02:00 0.2141 M O ', + '2007/01/01 04:00 0.2142 D M ']) + expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], + ['2007/01/01', '02:00', 0.2141, 'M', 'O'], + ['2007/01/01', '04:00', 0.2142, 'D', 'M']], + columns=['date', 'time', 'var', 'flag', + 'oflag']) + + # test with default line terminators "LF" and "CRLF" + df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + # "CR" is not respected with the Python parser yet + if self.engine == 'c': + df = self.read_csv(StringIO(data.replace('\n', '\r')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected)