@@ -224,6 +224,64 @@ def test_crcrcrlf2(self):
224
224
out = self .check_script_output (src , br"'\n\n\n'" )
225
225
226
226
227
class UTF8ValidatorTest(unittest.TestCase):
    """Check that source files containing malformed UTF-8 are rejected."""

    @unittest.skipIf(sys.platform.startswith("win"),
                     "Times out on Windows")
    def test_invalid_utf8(self):
        """Run scripts holding invalid UTF-8 and require a clean failure.

        Every sample is written to a temporary ``test.py`` and executed in
        a child interpreter through ``script_helper.assert_python_failure``.
        """
        # This is a port of test_utf8_decode_invalid_sequences in
        # test_unicode.py to exercise the separate utf8 validator in
        # tokenize.c used when reading source files.

        # Each example is put inside a string at the top of the file so
        # it's an otherwise valid Python source file.  The literal must not
        # contain any padding: an extra ASCII byte between a start byte and
        # its continuation bytes would turn every multi-byte case below
        # into a plain "bad start byte" case.
        template = b'"%s"\n'

        with tempfile.TemporaryDirectory() as tmpd:
            fn = os.path.join(tmpd, 'test.py')

            def check(content):
                # subTest reports which byte sequence was accepted instead
                # of aborting the whole test on the first failure.
                with self.subTest(content=content):
                    with open(fn, 'wb') as fp:
                        fp.write(template % content)
                    rc, _, _ = script_helper.assert_python_failure(fn)
                    # The child must fail gracefully (positive exit status)
                    # rather than being killed by a signal, which shows up
                    # as a negative return code on POSIX.
                    self.assertGreaterEqual(rc, 1)

            # continuation bytes in a sequence of 2, 3, or 4 bytes
            # (every 7th value only -- presumably to keep the number of
            # spawned interpreters manageable)
            continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0, 7)]
            # start bytes of a 2-byte sequence equivalent to
            # code points <= 0x7F
            invalid_2B_seq_start_bytes = [
                bytes([x]) for x in range(0xC0, 0xC2)
            ]
            # start bytes of a 4-byte sequence equivalent to
            # code points > 0x10FFFF
            invalid_4B_seq_start_bytes = [
                bytes([x]) for x in range(0xF5, 0xF8)
            ]
            # 0xF8-0xFF can never start a UTF-8 sequence.  The range starts
            # at 0xF8 because 0xF7 is already covered by
            # invalid_4B_seq_start_bytes.
            invalid_start_bytes = (
                continuation_bytes + invalid_2B_seq_start_bytes +
                invalid_4B_seq_start_bytes +
                [bytes([x]) for x in range(0xF8, 0x100)]
            )

            # a lone invalid start byte
            for byte in invalid_start_bytes:
                check(byte)

            # overlong 2-byte sequences
            for sb in invalid_2B_seq_start_bytes:
                for cb in continuation_bytes:
                    check(sb + cb)

            # 4-byte sequences whose start byte is out of range
            for sb in invalid_4B_seq_start_bytes:
                for cb1 in continuation_bytes[:3]:
                    for cb3 in continuation_bytes[:3]:
                        check(sb + cb1 + b'\x80' + cb3)

            # overlong 3-byte sequences (E0 followed by 80..9F)
            for cb in [bytes([x]) for x in range(0x80, 0xA0, 5)]:
                check(b'\xE0' + cb + b'\x80')
                check(b'\xE0' + cb + b'\xBF')
            # surrogates
            for cb in [bytes([x]) for x in range(0xA0, 0xC0, 5)]:
                check(b'\xED' + cb + b'\x80')
                check(b'\xED' + cb + b'\xBF')
            # overlong 4-byte sequences (F0 followed by 80..8F)
            for cb in [bytes([x]) for x in range(0x80, 0x90, 5)]:
                check(b'\xF0' + cb + b'\x80\x80')
                check(b'\xF0' + cb + b'\xBF\xBF')
            # code points above 0x10FFFF (F4 followed by 90..BF)
            for cb in [bytes([x]) for x in range(0x90, 0xC0, 5)]:
                check(b'\xF4' + cb + b'\x80\x80')
                check(b'\xF4' + cb + b'\xBF\xBF')
283
+
284
+
227
285
class BytesSourceEncodingTest (AbstractSourceEncodingTest , unittest .TestCase ):
228
286
229
287
def check_script_output (self , src , expected ):
0 commit comments