Skip to content

Commit e4aaa14

Browse files
committed
gh-96268: Fix loading invalid UTF-8
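This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8. Concretely, valid_utf8 now also rejects the lead bytes 0xC0-0xC1 (overlong two-byte forms) and 0xF5-0xF7, 0xE0 followed by a byte below 0xA0 (overlong three-byte forms), 0xED followed by a byte of 0xA0 or above (UTF-16 surrogates), 0xF0 followed by a byte below 0x90 (overlong four-byte forms), and 0xF4 followed by a byte of 0x90 or above (code points beyond U+10FFFF). This also fixes the related test so that it always detects the expected failure and error message.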
1 parent fba3b67 commit e4aaa14

File tree

2 files changed: 31 additions (+31) and 12 deletions (-12)

2 files changed: 31 additions (+31) and 12 deletions (-12)

Lib/test/test_source_encoding.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -236,16 +236,23 @@ def test_invalid_utf8(self):
236236
# test it is to write actual files to disk.
237237

238238
# Each example is put inside a string at the top of the file so
239-
# it's an otherwise valid Python source file.
240-
template = b'"%s"\n'
239+
# it's an otherwise valid Python source file. Put some newlines
240+
# beforehand so we can assert that the error is reported on the
241+
# correct line.
242+
template = b'\n\n\n"%s"\n'
241243

242244
fn = TESTFN
243245
self.addCleanup(unlink, fn)
244246

245247
def check(content):
246248
with open(fn, 'wb') as fp:
247249
fp.write(template % content)
248-
script_helper.assert_python_failure(fn)
250+
rc, stdout, stderr = script_helper.assert_python_failure(fn)
251+
# We want to assert that the python subprocess failed gracefully,
252+
# not via a signal.
253+
self.assertGreaterEqual(rc, 1)
254+
self.assertTrue(b"Non-UTF-8 code starting with" in stderr)
255+
self.assertTrue(b"on line 5" in stderr)
249256

250257
# continuation bytes in a sequence of 2, 3, or 4 bytes
251258
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]

Parser/tokenizer.c

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -489,25 +489,37 @@ static void fp_ungetc(int c, struct tok_state *tok) {
489489

490490
/* Check whether the characters at s start a valid
491491
UTF-8 sequence. Return the number of characters forming
492-
the sequence if yes, 0 if not. */
492+
the sequence if yes, 0 if not. The special cases match
493+
those in stringlib/codecs.h:decode_utf8.
494+
*/
493495
static int valid_utf8(const unsigned char* s)
494496
{
495497
int expected = 0;
496498
int length;
497-
if (*s < 0x80)
499+
if (*s < 0x80) {
498500
/* single-byte code */
499501
return 1;
500-
if (*s < 0xc0)
501-
/* following byte */
502-
return 0;
503-
if (*s < 0xE0)
502+
} else if (*s < 0xE0) {
503+
if (*s < 0xC2) {
504+
return 0;
505+
}
504506
expected = 1;
505-
else if (*s < 0xF0)
507+
} else if (*s < 0xF0) {
508+
if (*s == 0xE0 && *(s + 1) < 0xA0) {
509+
return 0;
510+
} else if (*s == 0xED && *(s + 1) >= 0xA0) {
511+
return 0;
512+
}
506513
expected = 2;
507-
else if (*s < 0xF8)
514+
} else if (*s < 0xF5) {
515+
if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
516+
return 0;
517+
}
508518
expected = 3;
509-
else
519+
} else {
520+
/* invalid start byte */
510521
return 0;
522+
}
511523
length = expected + 1;
512524
for (; expected; expected--)
513525
if (s[expected] < 0x80 || s[expected] >= 0xC0)

0 commit comments

Comments
 (0)