Skip to content

Commit e52f328

Browse files
committed
pythongh-94823: Improve coverage in tokenizer.c:valid_utf8
When loading a source file from disk, the tokenizer uses its own UTF-8 validator, separate from the one in `unicode_decode_utf8`. This test exercises that code path with the same set of invalid inputs that is used to test the `unicode_decode_utf8` decoder.
1 parent 8d7089f commit e52f328

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

Lib/test/test_source_encoding.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,64 @@ def test_crcrcrlf2(self):
224224
out = self.check_script_output(src, br"'\n\n\n'")
225225

226226

227+
class UTF8ValidatorTest(unittest.TestCase):
    # Exercises the UTF-8 validation performed when a source file is read
    # from disk by running scripts that contain invalid byte sequences.

    @unittest.skipIf(sys.platform.startswith("win"),
                     "Times out on Windows")
    def test_invalid_utf8(self):
        # This is a port of test_utf8_decode_invalid_sequences in
        # test_unicode.py to exercise the separate UTF-8 validator in
        # tokenizer.c used when reading source files.

        # Each example is put inside a string at the top of the file so
        # it's an otherwise valid Python source file.
        template = b'"%s"\n'

        with tempfile.TemporaryDirectory() as tmpd:
            fn = os.path.join(tmpd, 'test.py')

            def check(content):
                # Write a script containing the invalid sequence and run it
                # in a child interpreter; it must be rejected.
                with open(fn, 'wb') as fp:
                    fp.write(template % content)
                rc, stdout, stderr = script_helper.assert_python_failure(fn)
                # The interpreter must fail gracefully with the tokenizer's
                # encoding error, not crash (a negative return code means
                # the child was killed by a signal) or fail for some
                # unrelated reason.
                self.assertGreaterEqual(rc, 1, msg=repr(content))
                self.assertIn(b"Non-UTF-8 code starting with", stderr,
                              msg=repr(content))

            # continuation bytes in a sequence of 2, 3, or 4 bytes
            # (sampled with a step to keep the number of subprocesses down)
            continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0, 7)]
            # start bytes of a 2-byte sequence equivalent to
            # code points <= 0x7F (overlong encodings)
            invalid_2B_seq_start_bytes = [
                bytes([x]) for x in range(0xC0, 0xC2)
            ]
            # start bytes of a 4-byte sequence equivalent to
            # code points > 0x10FFFF
            invalid_4B_seq_start_bytes = [
                bytes([x]) for x in range(0xF5, 0xF8)
            ]
            # 0xF8-0xFF never appear in UTF-8.  Start at 0xF8 because 0xF7
            # is already covered by invalid_4B_seq_start_bytes; checking it
            # twice would only spawn a redundant interpreter.
            invalid_start_bytes = (
                continuation_bytes + invalid_2B_seq_start_bytes +
                invalid_4B_seq_start_bytes +
                [bytes([x]) for x in range(0xF8, 0x100)]
            )

            for byte in invalid_start_bytes:
                check(byte)

            for sb in invalid_2B_seq_start_bytes:
                for cb in continuation_bytes:
                    check(sb + cb)

            for sb in invalid_4B_seq_start_bytes:
                for cb1 in continuation_bytes[:3]:
                    for cb3 in continuation_bytes[:3]:
                        check(sb + cb1 + b'\x80' + cb3)

            # overlong 3-byte sequences
            for cb in [bytes([x]) for x in range(0x80, 0xA0, 5)]:
                check(b'\xE0' + cb + b'\x80')
                check(b'\xE0' + cb + b'\xBF')
            # surrogates
            for cb in [bytes([x]) for x in range(0xA0, 0xC0, 5)]:
                check(b'\xED' + cb + b'\x80')
                check(b'\xED' + cb + b'\xBF')
            # overlong 4-byte sequences
            for cb in [bytes([x]) for x in range(0x80, 0x90, 5)]:
                check(b'\xF0' + cb + b'\x80\x80')
                check(b'\xF0' + cb + b'\xBF\xBF')
            # 4-byte sequences beyond 0x10FFFF
            for cb in [bytes([x]) for x in range(0x90, 0xC0, 5)]:
                check(b'\xF4' + cb + b'\x80\x80')
                check(b'\xF4' + cb + b'\xBF\xBF')
283+
284+
227285
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
228286

229287
def check_script_output(self, src, expected):

0 commit comments

Comments
 (0)