From e4aaa14a427bf4797259b68cac6f79fd8e934719 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Thu, 25 Aug 2022 10:13:56 -0400
Subject: [PATCH 01/10] gh-96268: Fix loading invalid UTF-8

This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8.

This also fixes the related test so it will always detect the expected failure
and error message.
---
 Lib/test/test_source_encoding.py | 13 ++++++++++---
 Parser/tokenizer.c               | 30 +++++++++++++++++++++---------
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 8d7b573c7e915b..604b49010393e3 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -236,8 +236,10 @@ def test_invalid_utf8(self):
         # test it is to write actual files to disk.
 
         # Each example is put inside a string at the top of the file so
-        # it's an otherwise valid Python source file.
-        template = b'"%s"\n'
+        # it's an otherwise valid Python source file. Put some newlines
+        # beforehand so we can assert that the error is reported on the
+        # correct line.
+        template = b'\n\n\n"%s"\n'
 
         fn = TESTFN
         self.addCleanup(unlink, fn)
@@ -245,7 +247,12 @@ def test_invalid_utf8(self):
         def check(content):
             with open(fn, 'wb') as fp:
                 fp.write(template % content)
-            script_helper.assert_python_failure(fn)
+            rc, stdout, stderr = script_helper.assert_python_failure(fn)
+            # We want to assert that the python subprocess failed gracefully,
+            # not via a signal.
+            self.assertGreaterEqual(rc, 1)
+            self.assertTrue(b"Non-UTF-8 code starting with" in stderr)
+            self.assertTrue(b"on line 5" in stderr)
 
         # continuation bytes in a sequence of 2, 3, or 4 bytes
         continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index f2606f17d14630..5229df052c83dc 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -489,25 +489,37 @@ static void fp_ungetc(int c, struct tok_state *tok) {
 
 /* Check whether the characters at s start a valid
    UTF-8 sequence. Return the number of characters forming
-   the sequence if yes, 0 if not.  */
+   the sequence if yes, 0 if not.  The special cases match
+   those in stringlib/codecs.h:decode_utf8.
+*/
 static int valid_utf8(const unsigned char* s)
 {
     int expected = 0;
     int length;
-    if (*s < 0x80)
+    if (*s < 0x80) {
         /* single-byte code */
         return 1;
-    if (*s < 0xc0)
-        /* following byte */
-        return 0;
-    if (*s < 0xE0)
+    } else if (*s < 0xE0) {
+        if (*s < 0xC2) {
+            return 0;
+        }
         expected = 1;
-    else if (*s < 0xF0)
+    } else if (*s < 0xF0) {
+        if (*s == 0xE0 && *(s + 1) < 0xA0) {
+            return 0;
+        } else if (*s == 0xED && *(s + 1) >= 0xA0) {
+            return 0;
+        }
         expected = 2;
-    else if (*s < 0xF8)
+    } else if (*s < 0xF5) {
+        if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
+            return 0;
+        }
         expected = 3;
-    else
+    } else {
+        /* invalid start byte */
         return 0;
+    }
     length = expected + 1;
     for (; expected; expected--)
         if (s[expected] < 0x80 || s[expected] >= 0xC0)

From 407eef73ce47a5d7e31d45cd2586d2306bdbd18d Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Thu, 25 Aug 2022 10:19:39 -0400
Subject: [PATCH 02/10] Add blurb

---
 .../2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst     | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst
new file mode 100644
index 00000000000000..bc4f87a9697f81
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst	
@@ -0,0 +1,11 @@
+Loading a file with invalid UTF-8 will now report the broken character at
+the correct location.
+
+#.. section: Library #.. section: Documentation #.. section: Tests #..
+section: Build #.. section: Windows #.. section: macOS #.. section: IDLE #..
+section: Tools/Demos #.. section: C API
+
+# Write your Misc/NEWS entry below.  It should be a simple ReST paragraph. #
+Don't start with "- Issue #<n>: " or "- gh-issue-<n>: " or that sort of
+stuff.
+###########################################################################

From 3d60ff7e9bc035c74d87d43eeeb6dd7d181062f8 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Thu, 25 Aug 2022 10:24:53 -0400
Subject: [PATCH 03/10] Fix blurb

---
 .../2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst        | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst
index bc4f87a9697f81..987d85ff3bab8e 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst	
@@ -1,11 +1,2 @@
 Loading a file with invalid UTF-8 will now report the broken character at
 the correct location.
-
-#.. section: Library #.. section: Documentation #.. section: Tests #..
-section: Build #.. section: Windows #.. section: macOS #.. section: IDLE #..
-section: Tools/Demos #.. section: C API
-
-# Write your Misc/NEWS entry below.  It should be a simple ReST paragraph. #
-Don't start with "- Issue #<n>: " or "- gh-issue-<n>: " or that sort of
-stuff.
-###########################################################################

From 5cc57ad6fe9fb152a0d89b92ddcd722c3ecbebfc Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Thu, 25 Aug 2022 12:24:46 -0400
Subject: [PATCH 04/10] Use assertIn instead

---
 Lib/test/test_source_encoding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 604b49010393e3..6d0e9a50df9344 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -251,8 +251,8 @@ def check(content):
             # We want to assert that the python subprocess failed gracefully,
             # not via a signal.
             self.assertGreaterEqual(rc, 1)
-            self.assertTrue(b"Non-UTF-8 code starting with" in stderr)
-            self.assertTrue(b"on line 5" in stderr)
+            self.assertIn(b"Non-UTF-8 code starting with", stderr)
+            self.assertIn(b"on line 5", stderr)
 
         # continuation bytes in a sequence of 2, 3, or 4 bytes
         continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]

From ad4de7ab311929e31decff436d1da96ab4e0e7d8 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 31 Aug 2022 10:56:16 -0400
Subject: [PATCH 05/10] Fix reference to other decoding function

---
 Parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5229df052c83dc..9edb17a43447d2 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -490,7 +490,7 @@ static void fp_ungetc(int c, struct tok_state *tok) {
 /* Check whether the characters at s start a valid
    UTF-8 sequence. Return the number of characters forming
    the sequence if yes, 0 if not.  The special cases match
-   those in stringlib/codecs.h:decode_utf8.
+   those in stringlib/codecs.h:utf8_decode.
 */
 static int valid_utf8(const unsigned char* s)
 {

From 18927b17379a9f8989c5c1155266a789e6935672 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 31 Aug 2022 10:56:26 -0400
Subject: [PATCH 06/10] Fix coding style

---
 Parser/tokenizer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 9edb17a43447d2..229b7847f78682 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -492,7 +492,8 @@ static void fp_ungetc(int c, struct tok_state *tok) {
    the sequence if yes, 0 if not.  The special cases match
    those in stringlib/codecs.h:utf8_decode.
 */
-static int valid_utf8(const unsigned char* s)
+static int
+valid_utf8(const unsigned char* s)
 {
     int expected = 0;
     int length;

From f741a9d202449f7962b97e4c6f9ef07dc6e8840a Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 31 Aug 2022 13:53:09 -0400
Subject: [PATCH 07/10] Add comments about handled code ranges in each branch

---
 Parser/tokenizer.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 229b7847f78682..381c2fe414eabb 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -501,19 +501,35 @@ valid_utf8(const unsigned char* s)
         /* single-byte code */
         return 1;
     } else if (*s < 0xE0) {
+        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
         if (*s < 0xC2) {
+            /* invalid sequence
+               \x80-\xBF -- continuation byte
+               \xC0-\xC1 -- fake 0000-007F */
             return 0;
         }
         expected = 1;
     } else if (*s < 0xF0) {
+        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
         if (*s == 0xE0 && *(s + 1) < 0xA0) {
+            /* invalid sequence
+               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
             return 0;
         } else if (*s == 0xED && *(s + 1) >= 0xA0) {
+            /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
+               will result in surrogates in range D800-DFFF. Surrogates are
+               not valid UTF-8 so they are rejected.
+               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
             return 0;
         }
         expected = 2;
     } else if (*s < 0xF5) {
+        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
         if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
+            /* invalid sequence -- one of:
+               \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
+               \xF4\x90\x80\x80- -- 110000- overflow */
             return 0;
         }
         expected = 3;

From f8e9e6eb33ff4695b64c77fb8740553ff7ce67d4 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 31 Aug 2022 13:56:03 -0400
Subject: [PATCH 08/10] Fix line number in error message

---
 Lib/test/test_source_encoding.py | 2 +-
 Parser/tokenizer.c               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 6d0e9a50df9344..1c39f53c6c3676 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -252,7 +252,7 @@ def check(content):
             # not via a signal.
             self.assertGreaterEqual(rc, 1)
             self.assertIn(b"Non-UTF-8 code starting with", stderr)
-            self.assertIn(b"on line 5", stderr)
+            self.assertIn(b"on line 4", stderr)
 
         # continuation bytes in a sequence of 2, 3, or 4 bytes
         continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 381c2fe414eabb..f3ab1d446e17f7 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -564,7 +564,7 @@ ensure_utf8(char *line, struct tok_state *tok)
                      "in file %U on line %i, "
                      "but no encoding declared; "
                      "see https://peps.python.org/pep-0263/ for details",
-                     badchar, tok->filename, tok->lineno + 1);
+                     badchar, tok->filename, tok->lineno);
         return 0;
     }
     return 1;

From ace4a8c0ef7c1542921cb828ac28364d106180f7 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Tue, 6 Sep 2022 15:39:33 -0400
Subject: [PATCH 09/10] Remove obsolete comment

---
 Parser/tokenizer.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index f3ab1d446e17f7..03148ea2f07ec2 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -557,8 +557,6 @@ ensure_utf8(char *line, struct tok_state *tok)
         }
     }
     if (badchar) {
-        /* Need to add 1 to the line number, since this line
-       has not been counted, yet.  */
         PyErr_Format(PyExc_SyntaxError,
                      "Non-UTF-8 code starting with '\\x%.2x' "
                      "in file %U on line %i, "

From df074a8b8971741a278cacc5ee5a6f61e68862b8 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 7 Sep 2022 08:38:52 -0400
Subject: [PATCH 10/10] PEP7

---
 Parser/tokenizer.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 03148ea2f07ec2..761b125ebe3943 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -500,7 +500,8 @@ valid_utf8(const unsigned char* s)
     if (*s < 0x80) {
         /* single-byte code */
         return 1;
-    } else if (*s < 0xE0) {
+    }
+    else if (*s < 0xE0) {
         /* \xC2\x80-\xDF\xBF -- 0080-07FF */
         if (*s < 0xC2) {
             /* invalid sequence
@@ -509,13 +510,15 @@ valid_utf8(const unsigned char* s)
             return 0;
         }
         expected = 1;
-    } else if (*s < 0xF0) {
+    }
+    else if (*s < 0xF0) {
         /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
         if (*s == 0xE0 && *(s + 1) < 0xA0) {
             /* invalid sequence
                \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
             return 0;
-        } else if (*s == 0xED && *(s + 1) >= 0xA0) {
+        }
+        else if (*s == 0xED && *(s + 1) >= 0xA0) {
             /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
                will result in surrogates in range D800-DFFF. Surrogates are
                not valid UTF-8 so they are rejected.
@@ -524,7 +527,8 @@ valid_utf8(const unsigned char* s)
             return 0;
         }
         expected = 2;
-    } else if (*s < 0xF5) {
+    }
+    else if (*s < 0xF5) {
         /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
         if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
             /* invalid sequence -- one of:
@@ -533,7 +537,8 @@ valid_utf8(const unsigned char* s)
             return 0;
         }
         expected = 3;
-    } else {
+    }
+    else {
         /* invalid start byte */
         return 0;
     }