Skip to content

BUG: Parse NULL char as null value #14019

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into the base branch from the author's branch
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,7 @@ Bug Fixes

- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`)

- Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`)
- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`)
- Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`)
Expand Down
18 changes: 18 additions & 0 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,3 +543,21 @@ def test_parse_trim_buffers(self):

# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)

def test_internal_null_byte(self):
    # see gh-14012
    #
    # The NUL byte ('\x00') is only a placeholder meaning that no
    # line terminator / escape character / comment character was
    # specified -- the tokenizer must never treat it as an actual
    # terminator, escape, or comment character.  A '\x00' field in
    # the data should therefore parse as a missing (NaN) value.
    #
    # Move this test into common.py only once Python's csv module
    # can parse '\x00' as well.
    columns = ['a', 'b', 'c']
    csv_text = "1,2,3\n4,\x00,6\n7,8,9"
    expected = pd.DataFrame([[1, 2.0, 3],
                             [4, np.nan, 6],
                             [7, 8, 9]], columns=columns)

    result = self.read_csv(StringIO(csv_text), names=columns)
    tm.assert_frame_equal(result, expected)
21 changes: 13 additions & 8 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {

#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))

/* A '\0' in self->lineterminator means "no custom terminator was
 * specified": default to '\n'.  The explicit != '\0' guard on the
 * custom branch ensures a NUL byte appearing in the data is never
 * mistaken for a line terminator (see gh-14012). */
#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \
                          (self->lineterminator != '\0' && \
                           c == self->lineterminator))

#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))

// don't parse '\r' with a custom line terminator
#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))

/* '\0' means "no comment character specified" -- never match a NUL byte. */
#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar))

/* '\0' means "no escape character specified" -- never match a NUL byte. */
#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))

#define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \
                                self->skipinitialspace))

Expand Down Expand Up @@ -866,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
self->state = EAT_CRNL;
}
break;
} else if (c == self->commentchar) {
} else if (IS_COMMENT_CHAR(c)) {
self->state = EAT_LINE_COMMENT;
break;
} else if (IS_WHITESPACE(c)) {
Expand Down Expand Up @@ -899,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
} else if (IS_QUOTE(c)) {
// start quoted field
self->state = IN_QUOTED_FIELD;
} else if (c == self->escapechar) {
} else if (IS_ESCAPE_CHAR(c)) {
// possible escaped character
self->state = ESCAPED_CHAR;
} else if (IS_SKIPPABLE_SPACE(c)) {
Expand All @@ -912,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
// save empty field
END_FIELD();
}
} else if (c == self->commentchar) {
} else if (IS_COMMENT_CHAR(c)) {
END_FIELD();
self->state = EAT_COMMENT;
} else {
Expand Down Expand Up @@ -950,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
} else if (IS_CARRIAGE(c)) {
END_FIELD();
self->state = EAT_CRNL;
} else if (c == self->escapechar) {
} else if (IS_ESCAPE_CHAR(c)) {
// possible escaped character
self->state = ESCAPED_CHAR;
} else if (IS_DELIMITER(c)) {
Expand All @@ -962,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
} else {
self->state = START_FIELD;
}
} else if (c == self->commentchar) {
} else if (IS_COMMENT_CHAR(c)) {
END_FIELD();
self->state = EAT_COMMENT;
} else {
Expand All @@ -973,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)

case IN_QUOTED_FIELD:
// in quoted field
if (c == self->escapechar) {
if (IS_ESCAPE_CHAR(c)) {
// possible escape character
self->state = ESCAPE_IN_QUOTED_FIELD;
} else if (IS_QUOTE(c)) {
Expand Down