File tree Expand file tree Collapse file tree 2 files changed +31
-12
lines changed Expand file tree Collapse file tree 2 files changed +31
-12
lines changed Original file line number Diff line number Diff line change @@ -236,16 +236,23 @@ def test_invalid_utf8(self):
236
236
# test it is to write actual files to disk.
237
237
238
238
# Each example is put inside a string at the top of the file so
239
- # it's an otherwise valid Python source file.
240
- template = b'"%s"\n '
239
+ # it's an otherwise valid Python source file. Put some newlines
240
+ # beforehand so we can assert that the error is reported on the
241
+ # correct line.
242
+ template = b'\n \n \n "%s"\n '
241
243
242
244
fn = TESTFN
243
245
self .addCleanup (unlink , fn )
244
246
245
247
def check (content ):
246
248
with open (fn , 'wb' ) as fp :
247
249
fp .write (template % content )
248
- script_helper .assert_python_failure (fn )
250
+ rc , stdout , stderr = script_helper .assert_python_failure (fn )
251
+ # We want to assert that the python subprocess failed gracefully,
252
+ # not via a signal.
253
+ self .assertGreaterEqual (rc , 1 )
254
+ self .assertTrue (b"Non-UTF-8 code starting with" in stderr )
255
+ self .assertTrue (b"on line 5" in stderr )
249
256
250
257
# continuation bytes in a sequence of 2, 3, or 4 bytes
251
258
continuation_bytes = [bytes ([x ]) for x in range (0x80 , 0xC0 )]
Original file line number Diff line number Diff line change @@ -489,25 +489,37 @@ static void fp_ungetc(int c, struct tok_state *tok) {
489
489
490
490
/* Check whether the bytes at s start a valid
491
491
UTF-8 sequence. Return the number of bytes forming
492
- the sequence if yes, 0 if not. */
492
+ the sequence if yes, 0 if not. The special cases match
493
+ those in stringlib/codecs.h:decode_utf8.
494
+ */
493
495
static int valid_utf8 (const unsigned char * s )
494
496
{
495
497
int expected = 0 ;
496
498
int length ;
497
- if (* s < 0x80 )
499
+ if (* s < 0x80 ) {
498
500
/* single-byte code */
499
501
return 1 ;
500
- if (* s < 0xc0 )
501
- /* following byte */
502
- return 0 ;
503
- if ( * s < 0xE0 )
502
+ } else if (* s < 0xE0 ) {
503
+ if ( * s < 0xC2 ) {
504
+ return 0 ;
505
+ }
504
506
expected = 1 ;
505
- else if (* s < 0xF0 )
507
+ } else if (* s < 0xF0 ) {
508
+ if (* s == 0xE0 && * (s + 1 ) < 0xA0 ) {
509
+ return 0 ;
510
+ } else if (* s == 0xED && * (s + 1 ) >= 0xA0 ) {
511
+ return 0 ;
512
+ }
506
513
expected = 2 ;
507
- else if (* s < 0xF8 )
514
+ } else if (* s < 0xF5 ) {
515
+ if (* (s + 1 ) < 0x90 ? * s == 0xF0 : * s == 0xF4 ) {
516
+ return 0 ;
517
+ }
508
518
expected = 3 ;
509
- else
519
+ } else {
520
+ /* invalid start byte */
510
521
return 0 ;
522
+ }
511
523
length = expected + 1 ;
512
524
for (; expected ; expected -- )
513
525
if (s [expected ] < 0x80 || s [expected ] >= 0xC0 )
You can’t perform that action at this time.
0 commit comments