@@ -64,7 +64,10 @@ impl Parser {
64
64
while !self . eof ( ) {
65
65
let c = self . cur ( ) ;
66
66
let build_expr = match c {
67
- '\\' => try!( self . parse_escape ( ) ) ,
67
+ '\\' => {
68
+ let bexpr = try!( self . parse_escape ( ) ) ;
69
+ self . maybe_class_case_fold ( bexpr)
70
+ }
68
71
'|' => { let e = try!( self . alternate ( ) ) ; self . bump ( ) ; e }
69
72
'?' => try!( self . parse_simple_repeat ( Repeater :: ZeroOrOne ) ) ,
70
73
'*' => try!( self . parse_simple_repeat ( Repeater :: ZeroOrMore ) ) ,
@@ -150,8 +153,7 @@ impl Parser {
150
153
}
151
154
'd' |'s' |'w' |'D' |'S' |'W' => {
152
155
self . bump ( ) ;
153
- self . parse_perl_class ( c, c == 'D' || c == 'S' || c == 'W' )
154
- . map ( |cls| Build :: Expr ( Expr :: Class ( cls) ) )
156
+ Ok ( Build :: Expr ( Expr :: Class ( self . parse_perl_class ( c) ) ) )
155
157
}
156
158
c => Err ( self . err ( ErrorKind :: UnrecognizedEscape ( c) ) ) ,
157
159
}
@@ -463,11 +465,43 @@ impl Parser {
463
465
//
464
466
// `neg` is true when the class name is used with `\P`.
465
467
fn parse_unicode_class ( & mut self , neg : bool ) -> Result < CharClass > { // Resolves the class name at the current position: either a braced name or a single character.
466
- Ok ( CharClass :: new ( vec ! [ ] ) )
468
+ let name =
469
+ if self . bump_if ( '{' ) {
470
+ let n = self . bump_get ( |c| c != '}' ) . unwrap_or ( "" . into ( ) ) ;
471
+ if n. is_empty ( ) || !self . bump_if ( '}' ) {
472
+ // The name is empty or the closing brace is missing, e.g., \p{ or \p{Greek
473
+ return Err ( self . err ( ErrorKind :: UnclosedUnicodeName ) ) ;
474
+ }
475
+ n
476
+ } else {
477
+ if self . eof ( ) {
478
+ // The input ends right after the escape letter, e.g., \p
479
+ return Err ( self . err ( ErrorKind :: UnexpectedEscapeEof ) ) ;
480
+ }
481
+ self . bump ( ) . to_string ( ) // unbraced form: the next character alone is the class name, e.g., the Z in \pZ
482
+ } ;
483
+ match unicode_class ( & name) {
484
+ None => Err ( self . err ( ErrorKind :: UnrecognizedUnicodeClass ( name) ) ) , // lookup failed: the error carries the rejected name
485
+ Some ( cls) => if neg { Ok ( cls. negate ( ) ) } else { Ok ( cls) } , // neg set: hand back the negated class
486
+ }
467
487
}
468
488
469
- fn parse_perl_class ( & mut self , c : char , neg : bool ) -> Result < CharClass > {
470
- Ok ( CharClass :: new ( vec ! [ ] ) )
489
+ // Returns the character class for a Perl escape, built from the Unicode tables PERLD, PERLS and PERLW.
490
+ //
491
+ // `name` must be one of d, s, w, D, S, W; any other value reaches `unreachable!()` and panics. The upper-case names return the negated class.
492
+ //
493
+ // No parser state is changed: the body never reads or writes `self`.
494
+ fn parse_perl_class ( & mut self , name : char ) -> CharClass {
495
+ use unicode:: regex:: { PERLD , PERLS , PERLW } ;
496
+ match name {
497
+ 'd' => raw_class_to_expr ( PERLD ) ,
498
+ 'D' => raw_class_to_expr ( PERLD ) . negate ( ) ,
499
+ 's' => raw_class_to_expr ( PERLS ) ,
500
+ 'S' => raw_class_to_expr ( PERLS ) . negate ( ) ,
501
+ 'w' => raw_class_to_expr ( PERLW ) ,
502
+ 'W' => raw_class_to_expr ( PERLW ) . negate ( ) ,
503
+ _ => unreachable ! ( ) ,
504
+ }
471
505
}
472
506
473
507
// Always bump to the next input and return the given expression as a
@@ -532,6 +566,22 @@ impl Parser {
532
566
Some ( Build :: Expr ( e) ) => Ok ( e) ,
533
567
}
534
568
}
569
+
570
+ // If the case-insensitive flag (`self.flags.casei`) is set and the build item
570
+ // given is a character class expression, apply `case_fold()` to the class and
571
+ // return it wrapped again as `Build::Expr(Expr::Class(..))`.
572
+ //
573
+ // Otherwise, return the item unchanged.
574
+ fn maybe_class_case_fold ( & mut self , bexpr : Build ) -> Build {
575
+ match bexpr {
576
+ Build :: Expr ( Expr :: Class ( cls) ) => {
577
+ Build :: Expr ( Expr :: Class (
578
+ if self . flags . casei { cls. case_fold ( ) } else { cls }
579
+ ) )
580
+ }
581
+ bexpr => bexpr, // anything that is not a class expression passes through untouched
582
+ }
583
+ }
535
585
}
536
586
537
587
// Auxiliary methods for manipulating the expression stack.
@@ -774,14 +824,19 @@ fn is_hex(c: char) -> bool {
774
824
775
825
fn unicode_class ( name : & str ) -> Option < CharClass > { // Returns the class stored under exactly this name in UNICODE_CLASSES, or None when there is no such entry.
776
826
UNICODE_CLASSES . binary_search_by ( |& ( s, _) | s. cmp ( name) ) . ok ( ) . map ( |i| { // binary search on the name field; relies on the table being sorted by name
777
- let range = |& ( s, e) | ClassRange { start : s, end : e } ;
778
- CharClass :: new ( UNICODE_CLASSES [ i] . 1 . iter ( ) . map ( range) . collect ( ) )
827
+ raw_class_to_expr ( UNICODE_CLASSES [ i] . 1 ) // field .1 of the entry holds the (start, end) pairs
779
828
} )
780
829
}
781
830
831
+ fn raw_class_to_expr ( raw : & [ ( char , char ) ] ) -> CharClass { // Builds a CharClass from a table of (start, end) char pairs.
832
+ let range = |& ( s, e) | ClassRange { start : s, end : e } ; // each pair becomes one ClassRange with start and end taken as given
833
+ CharClass :: new ( raw. iter ( ) . map ( range) . collect ( ) )
834
+ }
835
+
782
836
#[ cfg( test) ]
783
837
mod tests {
784
- use { Expr , Repeater , Error , ErrorKind } ;
838
+ use { CharClass , ClassRange , Expr , Repeater , Error , ErrorKind } ;
839
+ use unicode:: regex:: { PERLD , PERLS , PERLW } ;
785
840
use super :: Parser ;
786
841
787
842
fn p ( s : & str ) -> Expr { Parser :: parse ( s) . unwrap ( ) } // Test helper: parses s and panics if parsing returns an error.
@@ -791,6 +846,26 @@ mod tests {
791
846
fn b < T > ( v : T ) -> Box < T > { Box :: new ( v) } // Test helper: shorthand for Box::new.
792
847
fn c ( es : & [ Expr ] ) -> Expr { Expr :: Concat ( es. to_vec ( ) ) } // Test helper: wraps a copy of the slice in Expr::Concat.
793
848
849
+ fn class ( ranges : & [ ( char , char ) ] ) -> CharClass { // Test helper: builds the expected CharClass from (char, char) pairs via CharClass::new.
850
+ CharClass :: new ( ranges. iter ( ) . cloned ( ) . map ( |( c1, c2) | {
851
+ if c1 <= c2 {
852
+ ClassRange :: new ( c1, c2)
853
+ } else { // pair was given in reverse order: swap so the smaller char comes first
854
+ ClassRange :: new ( c2, c1)
855
+ }
856
+ } ) . collect ( ) )
857
+ }
858
+
859
+ fn classi ( ranges : & [ ( char , char ) ] ) -> CharClass { // Test helper: like class, but constructs through CharClass::new_casei. NOTE(review): no test visible in this excerpt calls it.
860
+ CharClass :: new_casei ( ranges. iter ( ) . cloned ( ) . map ( |( c1, c2) | {
861
+ if c1 <= c2 {
862
+ ClassRange :: new ( c1, c2)
863
+ } else { // pair was given in reverse order: swap so the smaller char comes first
864
+ ClassRange :: new ( c2, c1)
865
+ }
866
+ } ) . collect ( ) )
867
+ }
868
+
794
869
#[ test]
795
870
fn empty ( ) {
796
871
assert_eq ! ( p( "" ) , Expr :: Empty ) ;
@@ -1201,6 +1276,137 @@ mod tests {
1201
1276
assert_eq ! ( p( r"\x{2603}" ) , lit( '\u{2603}' ) ) ;
1202
1277
}
1203
1278
1279
+ #[ test]
1280
+ fn escape_unicode_name ( ) { // braced form: \p{Yi} must parse to exactly the two ranges listed below
1281
+ assert_eq ! ( p( r"\p{Yi}" ) , Expr :: Class ( class( & [
1282
+ ( '\u{a000}' , '\u{a48c}' ) , ( '\u{a490}' , '\u{a4c6}' )
1283
+ ] ) ) ) ;
1284
+ }
1285
+
1286
+ #[ test]
1287
+ fn escape_unicode_letter ( ) { // one-letter form: \pZ must parse to exactly the eight ranges listed below
1288
+ assert_eq ! ( p( r"\pZ" ) , Expr :: Class ( class( & [
1289
+ ( '\u{20}' , '\u{20}' ) , ( '\u{a0}' , '\u{a0}' ) ,
1290
+ ( '\u{1680}' , '\u{1680}' ) , ( '\u{2000}' , '\u{200a}' ) ,
1291
+ ( '\u{2028}' , '\u{2029}' ) , ( '\u{202f}' , '\u{202f}' ) ,
1292
+ ( '\u{205f}' , '\u{205f}' ) , ( '\u{3000}' , '\u{3000}' ) ,
1293
+ ] ) ) ) ;
1294
+ }
1295
+
1296
+ #[ test]
1297
+ fn escape_unicode_name_case_fold ( ) { // with the (?i) prefix the expected class is the same ranges after case_fold()
1298
+ assert_eq ! ( p( r"(?i)\p{Yi}" ) , Expr :: Class ( class( & [
1299
+ ( '\u{a000}' , '\u{a48c}' ) , ( '\u{a490}' , '\u{a4c6}' )
1300
+ ] ) . case_fold( ) ) ) ;
1301
+ }
1302
+
1303
+ #[ test]
1304
+ fn escape_unicode_letter_case_fold ( ) { // with the (?i) prefix the expected class is the same ranges after case_fold()
1305
+ assert_eq ! ( p( r"(?i)\pZ" ) , Expr :: Class ( class( & [
1306
+ ( '\u{20}' , '\u{20}' ) , ( '\u{a0}' , '\u{a0}' ) ,
1307
+ ( '\u{1680}' , '\u{1680}' ) , ( '\u{2000}' , '\u{200a}' ) ,
1308
+ ( '\u{2028}' , '\u{2029}' ) , ( '\u{202f}' , '\u{202f}' ) ,
1309
+ ( '\u{205f}' , '\u{205f}' ) , ( '\u{3000}' , '\u{3000}' ) ,
1310
+ ] ) . case_fold( ) ) ) ;
1311
+ }
1312
+
1313
+ #[ test]
1314
+ fn escape_unicode_name_negate ( ) { // upper-case \P with a braced name: expected class is the listed ranges negated
1315
+ assert_eq ! ( p( r"\P{Yi}" ) , Expr :: Class ( class( & [
1316
+ ( '\u{a000}' , '\u{a48c}' ) , ( '\u{a490}' , '\u{a4c6}' )
1317
+ ] ) . negate( ) ) ) ;
1318
+ }
1319
+
1320
+ #[ test]
1321
+ fn escape_unicode_letter_negate ( ) { // upper-case \P with a one-letter name: expected class is the listed ranges negated
1322
+ assert_eq ! ( p( r"\PZ" ) , Expr :: Class ( class( & [
1323
+ ( '\u{20}' , '\u{20}' ) , ( '\u{a0}' , '\u{a0}' ) ,
1324
+ ( '\u{1680}' , '\u{1680}' ) , ( '\u{2000}' , '\u{200a}' ) ,
1325
+ ( '\u{2028}' , '\u{2029}' ) , ( '\u{202f}' , '\u{202f}' ) ,
1326
+ ( '\u{205f}' , '\u{205f}' ) , ( '\u{3000}' , '\u{3000}' ) ,
1327
+ ] ) . negate( ) ) ) ;
1328
+ }
1329
+
1330
+ #[ test]
1331
+ fn escape_unicode_name_negate_case_fold ( ) { // (?i) combined with \P: the expected value negates first and case-folds second
1332
+ assert_eq ! ( p( r"(?i)\P{Yi}" ) , Expr :: Class ( class( & [
1333
+ ( '\u{a000}' , '\u{a48c}' ) , ( '\u{a490}' , '\u{a4c6}' )
1334
+ ] ) . negate( ) . case_fold( ) ) ) ;
1335
+ }
1336
+
1337
+ #[ test]
1338
+ fn escape_unicode_letter_negate_case_fold ( ) { // (?i) combined with \P: the expected value negates first and case-folds second
1339
+ assert_eq ! ( p( r"(?i)\PZ" ) , Expr :: Class ( class( & [
1340
+ ( '\u{20}' , '\u{20}' ) , ( '\u{a0}' , '\u{a0}' ) ,
1341
+ ( '\u{1680}' , '\u{1680}' ) , ( '\u{2000}' , '\u{200a}' ) ,
1342
+ ( '\u{2028}' , '\u{2029}' ) , ( '\u{202f}' , '\u{202f}' ) ,
1343
+ ( '\u{205f}' , '\u{205f}' ) , ( '\u{3000}' , '\u{3000}' ) ,
1344
+ ] ) . negate( ) . case_fold( ) ) ) ;
1345
+ }
1346
+
1347
+ #[ test]
1348
+ fn escape_perl_d ( ) { // \d must equal the class built from the PERLD table
1349
+ assert_eq ! ( p( r"\d" ) , Expr :: Class ( class( PERLD ) ) ) ;
1350
+ }
1351
+
1352
+ #[ test]
1353
+ fn escape_perl_s ( ) { // \s must equal the class built from the PERLS table
1354
+ assert_eq ! ( p( r"\s" ) , Expr :: Class ( class( PERLS ) ) ) ;
1355
+ }
1356
+
1357
+ #[ test]
1358
+ fn escape_perl_w ( ) { // \w must equal the class built from the PERLW table
1359
+ assert_eq ! ( p( r"\w" ) , Expr :: Class ( class( PERLW ) ) ) ;
1360
+ }
1361
+
1362
+ #[ test]
1363
+ fn escape_perl_d_negate ( ) { // upper-case \D must equal the PERLD class negated
1364
+ assert_eq ! ( p( r"\D" ) , Expr :: Class ( class( PERLD ) . negate( ) ) ) ;
1365
+ }
1366
+
1367
+ #[ test]
1368
+ fn escape_perl_s_negate ( ) { // upper-case \S must equal the PERLS class negated
1369
+ assert_eq ! ( p( r"\S" ) , Expr :: Class ( class( PERLS ) . negate( ) ) ) ;
1370
+ }
1371
+
1372
+ #[ test]
1373
+ fn escape_perl_w_negate ( ) { // upper-case \W must equal the PERLW class negated
1374
+ assert_eq ! ( p( r"\W" ) , Expr :: Class ( class( PERLW ) . negate( ) ) ) ;
1375
+ }
1376
+
1377
+ #[ test]
1378
+ fn escape_perl_d_case_fold ( ) { // with the (?i) prefix \d must equal the PERLD class after case_fold()
1379
+ assert_eq ! ( p( r"(?i)\d" ) , Expr :: Class ( class( PERLD ) . case_fold( ) ) ) ;
1380
+ }
1381
+
1382
+ #[ test]
1383
+ fn escape_perl_s_case_fold ( ) { // with the (?i) prefix \s must equal the PERLS class after case_fold()
1384
+ assert_eq ! ( p( r"(?i)\s" ) , Expr :: Class ( class( PERLS ) . case_fold( ) ) ) ;
1385
+ }
1386
+
1387
+ #[ test]
1388
+ fn escape_perl_w_case_fold ( ) { // with the (?i) prefix \w must equal the PERLW class after case_fold()
1389
+ assert_eq ! ( p( r"(?i)\w" ) , Expr :: Class ( class( PERLW ) . case_fold( ) ) ) ;
1390
+ }
1391
+
1392
+ #[ test]
1393
+ fn escape_perl_d_case_fold_negate ( ) { // NOTE(review): despite the word order in the name, the expected value applies negate() before case_fold()
1394
+ assert_eq ! ( p( r"(?i)\D" ) ,
1395
+ Expr :: Class ( class( PERLD ) . negate( ) . case_fold( ) ) ) ;
1396
+ }
1397
+
1398
+ #[ test]
1399
+ fn escape_perl_s_case_fold_negate ( ) { // NOTE(review): despite the word order in the name, the expected value applies negate() before case_fold()
1400
+ assert_eq ! ( p( r"(?i)\S" ) ,
1401
+ Expr :: Class ( class( PERLS ) . negate( ) . case_fold( ) ) ) ;
1402
+ }
1403
+
1404
+ #[ test]
1405
+ fn escape_perl_w_case_fold_negate ( ) { // NOTE(review): despite the word order in the name, the expected value applies negate() before case_fold()
1406
+ assert_eq ! ( p( r"(?i)\W" ) ,
1407
+ Expr :: Class ( class( PERLW ) . negate( ) . case_fold( ) ) ) ;
1408
+ }
1409
+
1204
1410
/******************************************************/
1205
1411
// Test every single possible error case.
1206
1412
/******************************************************/
@@ -1610,4 +1816,45 @@ mod tests {
1610
1816
kind: ErrorKind :: InvalidBase16 ( "9999999999" . into( ) ) ,
1611
1817
} ) ;
1612
1818
}
1819
+
1820
+ #[ test]
1821
+ fn error_unicode_unclosed ( ) { // an empty braced name and an unterminated one both report UnclosedUnicodeName, with pos equal to the input length
1822
+ assert_eq ! ( perr( r"\p{" ) , Error {
1823
+ pos: 3 ,
1824
+ surround: r"\p{" . into( ) ,
1825
+ kind: ErrorKind :: UnclosedUnicodeName ,
1826
+ } ) ;
1827
+ assert_eq ! ( perr( r"\p{Greek" ) , Error {
1828
+ pos: 8 ,
1829
+ surround: r"Greek" . into( ) ,
1830
+ kind: ErrorKind :: UnclosedUnicodeName ,
1831
+ } ) ;
1832
+ }
1833
+
1834
+ #[ test]
1835
+ fn error_unicode_no_letter ( ) { // \p with nothing after it reports UnexpectedEscapeEof at pos 2, the end of the input
1836
+ assert_eq ! ( perr( r"\p" ) , Error {
1837
+ pos: 2 ,
1838
+ surround: r"\p" . into( ) ,
1839
+ kind: ErrorKind :: UnexpectedEscapeEof ,
1840
+ } ) ;
1841
+ }
1842
+
1843
+ #[ test]
1844
+ fn error_unicode_unknown_letter ( ) { // A is rejected as a one-letter class name; the error carries the rejected name
1845
+ assert_eq ! ( perr( r"\pA" ) , Error {
1846
+ pos: 3 ,
1847
+ surround: r"\pA" . into( ) ,
1848
+ kind: ErrorKind :: UnrecognizedUnicodeClass ( "A" . into( ) ) ,
1849
+ } ) ;
1850
+ }
1851
+
1852
+ #[ test]
1853
+ fn error_unicode_unknown_name ( ) { // Yii is rejected as a braced class name; the error carries the rejected name
1854
+ assert_eq ! ( perr( r"\p{Yii}" ) , Error {
1855
+ pos: 7 ,
1856
+ surround: r"{Yii}" . into( ) ,
1857
+ kind: ErrorKind :: UnrecognizedUnicodeClass ( "Yii" . into( ) ) ,
1858
+ } ) ;
1859
+ }
1613
1860
}
0 commit comments