From e4aaa14a427bf4797259b68cac6f79fd8e934719 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 25 Aug 2022 10:13:56 -0400 Subject: [PATCH 01/10] gh-96268: Fix loading invalid UTF-8 This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8. This also fixes the related test so it will always detect the expected failure and error message. --- Lib/test/test_source_encoding.py | 13 ++++++++++--- Parser/tokenizer.c | 30 +++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 8d7b573c7e915b..604b49010393e3 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -236,8 +236,10 @@ def test_invalid_utf8(self): # test it is to write actual files to disk. # Each example is put inside a string at the top of the file so - # it's an otherwise valid Python source file. - template = b'"%s"\n' + # it's an otherwise valid Python source file. Put some newlines + # beforehand so we can assert that the error is reported on the + # correct line. + template = b'\n\n\n"%s"\n' fn = TESTFN self.addCleanup(unlink, fn) @@ -245,7 +247,12 @@ def test_invalid_utf8(self): def check(content): with open(fn, 'wb') as fp: fp.write(template % content) - script_helper.assert_python_failure(fn) + rc, stdout, stderr = script_helper.assert_python_failure(fn) + # We want to assert that the python subprocess failed gracefully, + # not via a signal. + self.assertGreaterEqual(rc, 1) + self.assertTrue(b"Non-UTF-8 code starting with" in stderr) + self.assertTrue(b"on line 5" in stderr) # continuation bytes in a sequence of 2, 3, or 4 bytes continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f2606f17d14630..5229df052c83dc 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -489,25 +489,37 @@ static void fp_ungetc(int c, struct tok_state *tok) { /* Check whether the characters at s start a valid UTF-8 sequence. Return the number of characters forming - the sequence if yes, 0 if not. */ + the sequence if yes, 0 if not. The special cases match + those in stringlib/codecs.h:decode_utf8. +*/ static int valid_utf8(const unsigned char* s) { int expected = 0; int length; - if (*s < 0x80) + if (*s < 0x80) { /* single-byte code */ return 1; - if (*s < 0xc0) - /* following byte */ - return 0; - if (*s < 0xE0) + } else if (*s < 0xE0) { + if (*s < 0xC2) { + return 0; + } expected = 1; - else if (*s < 0xF0) + } else if (*s < 0xF0) { + if (*s == 0xE0 && *(s + 1) < 0xA0) { + return 0; + } else if (*s == 0xED && *(s + 1) >= 0xA0) { + return 0; + } expected = 2; - else if (*s < 0xF8) + } else if (*s < 0xF5) { + if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { + return 0; + } expected = 3; - else + } else { + /* invalid start byte */ return 0; + } length = expected + 1; for (; expected; expected--) if (s[expected] < 0x80 || s[expected] >= 0xC0) From 407eef73ce47a5d7e31d45cd2586d2306bdbd18d Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 25 Aug 2022 10:19:39 -0400 Subject: [PATCH 02/10] Add blurb --- .../2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst new file mode 100644 index 00000000000000..bc4f87a9697f81 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst @@ -0,0 +1,11 @@ +Loading a file with invalid UTF-8 will now report the broken character at +the correct location. + +#.. section: Library #.. section: Documentation #.. section: Tests #.. +section: Build #.. section: Windows #.. section: macOS #.. section: IDLE #.. +section: Tools/Demos #.. section: C API + +# Write your Misc/NEWS entry below. It should be a simple ReST paragraph. # +Don't start with "- Issue #: " or "- gh-issue-: " or that sort of +stuff. +########################################################################### From 3d60ff7e9bc035c74d87d43eeeb6dd7d181062f8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 25 Aug 2022 10:24:53 -0400 Subject: [PATCH 03/10] Fix blurb --- .../2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst index bc4f87a9697f81..987d85ff3bab8e 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst @@ -1,11 +1,2 @@ Loading a file with invalid UTF-8 will now report the broken character at the correct location. - -#.. section: Library #.. section: Documentation #.. section: Tests #.. -section: Build #.. section: Windows #.. section: macOS #.. section: IDLE #.. -section: Tools/Demos #.. section: C API - -# Write your Misc/NEWS entry below. It should be a simple ReST paragraph. # -Don't start with "- Issue #: " or "- gh-issue-: " or that sort of -stuff. -########################################################################### From 5cc57ad6fe9fb152a0d89b92ddcd722c3ecbebfc Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 25 Aug 2022 12:24:46 -0400 Subject: [PATCH 04/10] Use assertIn instead --- Lib/test/test_source_encoding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 604b49010393e3..6d0e9a50df9344 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -251,8 +251,8 @@ def check(content): # We want to assert that the python subprocess failed gracefully, # not via a signal. self.assertGreaterEqual(rc, 1) - self.assertTrue(b"Non-UTF-8 code starting with" in stderr) - self.assertTrue(b"on line 5" in stderr) + self.assertIn(b"Non-UTF-8 code starting with", stderr) + self.assertIn(b"on line 5", stderr) # continuation bytes in a sequence of 2, 3, or 4 bytes continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] From ad4de7ab311929e31decff436d1da96ab4e0e7d8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 31 Aug 2022 10:56:16 -0400 Subject: [PATCH 05/10] Fix reference to other decoding function --- Parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 5229df052c83dc..9edb17a43447d2 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -490,7 +490,7 @@ static void fp_ungetc(int c, struct tok_state *tok) { /* Check whether the characters at s start a valid UTF-8 sequence. Return the number of characters forming the sequence if yes, 0 if not. The special cases match - those in stringlib/codecs.h:decode_utf8. + those in stringlib/codecs.h:utf8_decode. */ static int valid_utf8(const unsigned char* s) { From 18927b17379a9f8989c5c1155266a789e6935672 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 31 Aug 2022 10:56:26 -0400 Subject: [PATCH 06/10] Fix coding style --- Parser/tokenizer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 9edb17a43447d2..229b7847f78682 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -492,7 +492,8 @@ static void fp_ungetc(int c, struct tok_state *tok) { the sequence if yes, 0 if not. The special cases match those in stringlib/codecs.h:utf8_decode. */ -static int valid_utf8(const unsigned char* s) +static int +valid_utf8(const unsigned char* s) { int expected = 0; int length; From f741a9d202449f7962b97e4c6f9ef07dc6e8840a Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 31 Aug 2022 13:53:09 -0400 Subject: [PATCH 07/10] Add comments about handled code ranges in each branch --- Parser/tokenizer.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 229b7847f78682..381c2fe414eabb 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -501,19 +501,35 @@ valid_utf8(const unsigned char* s) /* single-byte code */ return 1; } else if (*s < 0xE0) { + /* \xC2\x80-\xDF\xBF -- 0080-07FF */ if (*s < 0xC2) { + /* invalid sequence + \x80-\xBF -- continuation byte + \xC0-\xC1 -- fake 0000-007F */ return 0; } expected = 1; } else if (*s < 0xF0) { + /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ if (*s == 0xE0 && *(s + 1) < 0xA0) { + /* invalid sequence + \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ return 0; } else if (*s == 0xED && *(s + 1) >= 0xA0) { + /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF + will result in surrogates in range D800-DFFF. Surrogates are + not valid UTF-8 so they are rejected. + See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf + (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ return 0; } expected = 2; } else if (*s < 0xF5) { + /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { + /* invalid sequence -- one of: + \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF + \xF4\x90\x80\x80- -- 110000- overflow */ return 0; } expected = 3; From f8e9e6eb33ff4695b64c77fb8740553ff7ce67d4 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 31 Aug 2022 13:56:03 -0400 Subject: [PATCH 08/10] Fix line number in error message --- Lib/test/test_source_encoding.py | 2 +- Parser/tokenizer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 6d0e9a50df9344..1c39f53c6c3676 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -252,7 +252,7 @@ def check(content): # not via a signal. self.assertGreaterEqual(rc, 1) self.assertIn(b"Non-UTF-8 code starting with", stderr) - self.assertIn(b"on line 5", stderr) + self.assertIn(b"on line 4", stderr) # continuation bytes in a sequence of 2, 3, or 4 bytes continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 381c2fe414eabb..f3ab1d446e17f7 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -564,7 +564,7 @@ ensure_utf8(char *line, struct tok_state *tok) "in file %U on line %i, " "but no encoding declared; " "see https://peps.python.org/pep-0263/ for details", - badchar, tok->filename, tok->lineno + 1); + badchar, tok->filename, tok->lineno); return 0; } return 1; From ace4a8c0ef7c1542921cb828ac28364d106180f7 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 6 Sep 2022 15:39:33 -0400 Subject: [PATCH 09/10] Remove obsolete comment --- Parser/tokenizer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f3ab1d446e17f7..03148ea2f07ec2 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -557,8 +557,6 @@ ensure_utf8(char *line, struct tok_state *tok) } } if (badchar) { - /* Need to add 1 to the line number, since this line - has not been counted, yet. */ PyErr_Format(PyExc_SyntaxError, "Non-UTF-8 code starting with '\\x%.2x' " "in file %U on line %i, " From df074a8b8971741a278cacc5ee5a6f61e68862b8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 7 Sep 2022 08:38:52 -0400 Subject: [PATCH 10/10] PEP7 --- Parser/tokenizer.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 03148ea2f07ec2..761b125ebe3943 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -500,7 +500,8 @@ valid_utf8(const unsigned char* s) if (*s < 0x80) { /* single-byte code */ return 1; - } else if (*s < 0xE0) { + } + else if (*s < 0xE0) { /* \xC2\x80-\xDF\xBF -- 0080-07FF */ if (*s < 0xC2) { /* invalid sequence @@ -509,13 +510,15 @@ valid_utf8(const unsigned char* s) return 0; } expected = 1; - } else if (*s < 0xF0) { + } + else if (*s < 0xF0) { /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ if (*s == 0xE0 && *(s + 1) < 0xA0) { /* invalid sequence \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ return 0; - } else if (*s == 0xED && *(s + 1) >= 0xA0) { + } + else if (*s == 0xED && *(s + 1) >= 0xA0) { /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF will result in surrogates in range D800-DFFF. Surrogates are not valid UTF-8 so they are rejected. @@ -524,7 +527,8 @@ valid_utf8(const unsigned char* s) return 0; } expected = 2; - } else if (*s < 0xF5) { + } + else if (*s < 0xF5) { /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { /* invalid sequence -- one of: @@ -533,7 +537,8 @@ valid_utf8(const unsigned char* s) return 0; } expected = 3; - } else { + } + else { /* invalid start byte */ return 0; }