diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0ee56f865f8c8..ed47ddb0d2813 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -882,6 +882,7 @@ Bug Fixes - Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) +- Bug in ``pd.read_csv()`` in the C engine where the null character (``\x00``) was not being parsed as a missing value (:issue:`14012`) - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 4cea9e1d6b595..09d521e5a7e46 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -543,3 +543,21 @@ def test_parse_trim_buffers(self): # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) + + def test_internal_null_byte(self): + # see gh-14012 + # + # The null byte ('\x00') should not be used as a + # true line terminator, escape character, or comment + # character, only as a placeholder to indicate that + # none was specified. + # + # This test should be moved to common.py ONLY when + # Python's csv class supports parsing '\x00'. 
+ names = ['a', 'b', 'c'] + data = "1,2,3\n4,\x00,6\n7,8,9" + expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6], + [7, 8, 9]], columns=names) + + result = self.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(result, expected) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 3c09933b3ec87..af85b7b894d26 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { #define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) -#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \ - c == '\n') || c == self->lineterminator) +#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \ + (self->lineterminator != '\0' && \ + c == self->lineterminator)) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) // don't parse '\r' with a custom line terminator #define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r')) +#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar)) + +#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar)) + #define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \ self->skipinitialspace)) @@ -866,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) self->state = EAT_CRNL; } break; - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { self->state = EAT_LINE_COMMENT; break; } else if (IS_WHITESPACE(c)) { @@ -899,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else if (IS_QUOTE(c)) { // start quoted field self->state = IN_QUOTED_FIELD; - } else if (c == self->escapechar) { + } else if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_SKIPPABLE_SPACE(c)) { @@ -912,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) // save empty field END_FIELD(); } - } else if (c == self->commentchar) { + 
} else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { @@ -950,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; - } else if (c == self->escapechar) { + } else if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_DELIMITER(c)) { @@ -962,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else { self->state = START_FIELD; } - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { @@ -973,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) case IN_QUOTED_FIELD: // in quoted field - if (c == self->escapechar) { + if (IS_ESCAPE_CHAR(c)) { // possible escape character self->state = ESCAPE_IN_QUOTED_FIELD; } else if (IS_QUOTE(c)) {