Skip to content

Commit 2f37c35

Browse files
authored
bpo-40334: Fix error location upon parsing an invalid string literal (GH-19962)
When parsing a string literal that contains an invalid escape sequence, the old parser used to report the error at the beginning of the offending string. The new parser currently points to the end of the string instead (or, to be more precise, to the beginning of the next token). This commit changes the new parser to match the old parser's behaviour.
1 parent 3466922 commit 2f37c35

File tree

6 files changed

+34
-28
lines changed

6 files changed

+34
-28
lines changed

Lib/test/test_cmd_line_script.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -648,7 +648,7 @@ def test_syntaxerror_invalid_escape_sequence_multi_line(self):
648648
self.assertEqual(
649649
stderr.splitlines()[-3:],
650650
[ b' foo = """\\q"""',
651-
b' ^',
651+
b' ^',
652652
b'SyntaxError: invalid escape sequence \\q'
653653
],
654654
)

Lib/test/test_string_literals.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,7 @@ def test_eval_str_invalid_escape(self):
118118
eval("'''\n\\z'''")
119119
self.assertEqual(len(w), 1)
120120
self.assertEqual(w[0].filename, '<string>')
121-
if use_old_parser():
122-
self.assertEqual(w[0].lineno, 1)
121+
self.assertEqual(w[0].lineno, 1)
123122

124123
with warnings.catch_warnings(record=True) as w:
125124
warnings.simplefilter('error', category=DeprecationWarning)
@@ -128,8 +127,8 @@ def test_eval_str_invalid_escape(self):
128127
exc = cm.exception
129128
self.assertEqual(w, [])
130129
self.assertEqual(exc.filename, '<string>')
131-
if use_old_parser():
132-
self.assertEqual(exc.lineno, 1)
130+
self.assertEqual(exc.lineno, 1)
131+
self.assertEqual(exc.offset, 1)
133132

134133
def test_eval_str_raw(self):
135134
self.assertEqual(eval(""" r'x' """), 'x')

Parser/pegen/parse_string.c

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,24 @@
1212
// file (like "_PyPegen_raise_syntax_error").
1313

1414
static int
15-
warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char)
15+
warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
1616
{
1717
PyObject *msg =
1818
PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
1919
if (msg == NULL) {
2020
return -1;
2121
}
2222
if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
23-
p->tok->lineno, NULL, NULL) < 0) {
23+
t->lineno, NULL, NULL) < 0) {
2424
if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
2525
/* Replace the DeprecationWarning exception with a SyntaxError
2626
to get a more accurate error report */
2727
PyErr_Clear();
28+
29+
/* This is needed, in order for the SyntaxError to point to the token t,
30+
since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
31+
error location, if p->known_err_token is not set. */
32+
p->known_err_token = t;
2833
RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
2934
}
3035
Py_DECREF(msg);
@@ -47,7 +52,7 @@ decode_utf8(const char **sPtr, const char *end)
4752
}
4853

4954
static PyObject *
50-
decode_unicode_with_escapes(Parser *parser, const char *s, size_t len)
55+
decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
5156
{
5257
PyObject *v, *u;
5358
char *buf;
@@ -110,7 +115,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len)
110115
v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
111116

112117
if (v != NULL && first_invalid_escape != NULL) {
113-
if (warn_invalid_escape_sequence(parser, *first_invalid_escape) < 0) {
118+
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
114119
/* We have not decref u before because first_invalid_escape points
115120
inside u. */
116121
Py_XDECREF(u);
@@ -123,7 +128,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len)
123128
}
124129

125130
static PyObject *
126-
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len)
131+
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
127132
{
128133
const char *first_invalid_escape;
129134
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
@@ -132,7 +137,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len)
132137
}
133138

134139
if (first_invalid_escape != NULL) {
135-
if (warn_invalid_escape_sequence(p, *first_invalid_escape) < 0) {
140+
if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
136141
Py_DECREF(result);
137142
return NULL;
138143
}
@@ -146,9 +151,14 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len)
146151
If the string is an f-string, set *fstr and *fstrlen to the unparsed
147152
string object. Return 0 if no errors occurred. */
148153
int
149-
_PyPegen_parsestr(Parser *p, const char *s, int *bytesmode, int *rawmode, PyObject **result,
150-
const char **fstr, Py_ssize_t *fstrlen)
154+
_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
155+
const char **fstr, Py_ssize_t *fstrlen, Token *t)
151156
{
157+
const char *s = PyBytes_AsString(t->bytes);
158+
if (s == NULL) {
159+
return -1;
160+
}
161+
152162
size_t len;
153163
int quote = Py_CHARMASK(*s);
154164
int fmode = 0;
@@ -245,15 +255,15 @@ _PyPegen_parsestr(Parser *p, const char *s, int *bytesmode, int *rawmode, PyObje
245255
*result = PyBytes_FromStringAndSize(s, len);
246256
}
247257
else {
248-
*result = decode_bytes_with_escapes(p, s, len);
258+
*result = decode_bytes_with_escapes(p, s, len, t);
249259
}
250260
}
251261
else {
252262
if (*rawmode) {
253263
*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
254264
}
255265
else {
256-
*result = decode_unicode_with_escapes(p, s, len);
266+
*result = decode_unicode_with_escapes(p, s, len, t);
257267
}
258268
}
259269
return *result == NULL ? -1 : 0;
@@ -637,7 +647,7 @@ fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
637647
*/
638648
static int
639649
fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
640-
PyObject **literal, int recurse_lvl)
650+
PyObject **literal, int recurse_lvl, Token *t)
641651
{
642652
/* Get any literal string. It ends when we hit an un-doubled left
643653
brace (which isn't part of a unicode name escape such as
@@ -660,7 +670,7 @@ fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
660670
}
661671
break;
662672
}
663-
if (ch == '{' && warn_invalid_escape_sequence(p, ch) < 0) {
673+
if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
664674
return -1;
665675
}
666676
}
@@ -704,7 +714,7 @@ fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
704714
NULL, NULL);
705715
else
706716
*literal = decode_unicode_with_escapes(p, literal_start,
707-
s - literal_start);
717+
s - literal_start, t);
708718
if (!*literal)
709719
return -1;
710720
}
@@ -1041,7 +1051,7 @@ fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int
10411051
assert(*literal == NULL && *expression == NULL);
10421052

10431053
/* Get any literal string. */
1044-
result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl);
1054+
result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
10451055
if (result < 0)
10461056
goto error;
10471057

Parser/pegen/parse_string.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ typedef struct {
3434
} FstringParser;
3535

3636
void _PyPegen_FstringParser_Init(FstringParser *);
37-
int _PyPegen_parsestr(Parser *, const char *, int *, int *, PyObject **,
38-
const char **, Py_ssize_t *);
37+
int _PyPegen_parsestr(Parser *, int *, int *, PyObject **,
38+
const char **, Py_ssize_t *, Token *);
3939
int _PyPegen_FstringParser_ConcatFstring(Parser *, FstringParser *, const char **,
4040
const char *, int, int, Token *, Token *,
4141
Token *);

Parser/pegen/pegen.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const ch
383383
PyObject *errstr = NULL;
384384
PyObject *loc = NULL;
385385
PyObject *tmp = NULL;
386-
Token *t = p->tokens[p->fill - 1];
386+
Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
387387
Py_ssize_t col_number = !with_col_number;
388388
va_list va;
389389
p->error_indicator = 1;
@@ -1053,6 +1053,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
10531053
p->starting_col_offset = 0;
10541054
p->flags = flags;
10551055
p->feature_version = feature_version;
1056+
p->known_err_token = NULL;
10561057

10571058
return p;
10581059
}
@@ -1972,12 +1973,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_seq *strings)
19721973
const char *fstr;
19731974
Py_ssize_t fstrlen = -1;
19741975

1975-
char *this_str = PyBytes_AsString(t->bytes);
1976-
if (!this_str) {
1977-
goto error;
1978-
}
1979-
1980-
if (_PyPegen_parsestr(p, this_str, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen) != 0) {
1976+
if (_PyPegen_parsestr(p, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen, t) != 0) {
19811977
goto error;
19821978
}
19831979

Parser/pegen/pegen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ typedef struct {
7171
int flags;
7272
int feature_version;
7373
growable_comment_array type_ignore_comments;
74+
Token *known_err_token;
7475
} Parser;
7576

7677
typedef struct {

0 commit comments

Comments
 (0)