gh-96268: Fix loading invalid UTF-8

mdboom · mdboom · commit e4aaa14a427b · 2022-08-25T10:13:56.000-04:00
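This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8, so the
tokenizer now also rejects overlong encodings, UTF-16 surrogate code points,
and sequences that would encode values above U+10FFFF.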

This also fixes the related test so it will always detect the expected failure
and error message.
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
@@ -236,16 +236,23 @@ def test_invalid_utf8(self):
         # test it is to write actual files to disk.
 
         # Each example is put inside a string at the top of the file so
-        # it's an otherwise valid Python source file.
-        template = b'"%s"\n'
+        # it's an otherwise valid Python source file. Put some newlines
+        # beforehand so we can assert that the error is reported on the
+        # correct line.
+        template = b'\n\n\n"%s"\n'
 
         fn = TESTFN
         self.addCleanup(unlink, fn)
 
         def check(content):
             with open(fn, 'wb') as fp:
                 fp.write(template % content)
-            script_helper.assert_python_failure(fn)
+            rc, stdout, stderr = script_helper.assert_python_failure(fn)
+            # We want to assert that the python subprocess failed gracefully,
+            # not via a signal.
+            self.assertGreaterEqual(rc, 1)
+            self.assertTrue(b"Non-UTF-8 code starting with" in stderr)
+            self.assertTrue(b"on line 5" in stderr)
 
         # continuation bytes in a sequence of 2, 3, or 4 bytes
         continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
@@ -489,25 +489,37 @@ static void fp_ungetc(int c, struct tok_state *tok) {
 
 /* Check whether the characters at s start a valid
    UTF-8 sequence. Return the number of characters forming
-   the sequence if yes, 0 if not.  */
+   the sequence if yes, 0 if not.  The special cases match
+   those in stringlib/codecs.h:decode_utf8.
+*/
 static int valid_utf8(const unsigned char* s)
 {
     int expected = 0;
     int length;
-    if (*s < 0x80)
+    if (*s < 0x80) {
         /* single-byte code */
         return 1;
-    if (*s < 0xc0)
-        /* following byte */
-        return 0;
-    if (*s < 0xE0)
+    } else if (*s < 0xE0) {
+        if (*s < 0xC2) {
+            return 0;
+        }
         expected = 1;
-    else if (*s < 0xF0)
+    } else if (*s < 0xF0) {
+        if (*s == 0xE0 && *(s + 1) < 0xA0) {
+            return 0;
+        } else if (*s == 0xED && *(s + 1) >= 0xA0) {
+            return 0;
+        }
         expected = 2;
-    else if (*s < 0xF8)
+    } else if (*s < 0xF5) {
+        if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
+            return 0;
+        }
         expected = 3;
-    else
+    } else {
+        /* invalid start byte */
         return 0;
+    }
     length = expected + 1;
     for (; expected; expected--)
         if (s[expected] < 0x80 || s[expected] >= 0xC0)