@@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111
111
"character class difference is not supported" ;
112
112
// Error message for the symmetric difference class operator; the tests in
// this file expect it for the pattern `[a~~b]`.
const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED : & str =
113
113
"character class symmetric difference is not supported" ;
114
// Returned by `maybe_parse_special_word_boundary` when the name inside
// `\b{...}` is followed by end of input or by any character other than `}`
// (e.g. `\b{foo` or `\b{foo!}`).
+ const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED : & str =
115
+ "special word boundary assertion is unclosed or has an invalid character" ;
116
// Returned by `maybe_parse_special_word_boundary` when `\b{...}` is closed
// properly but the name is not one of `start`, `end`, `start-half` or
// `end-half` (e.g. `\b{foo}`).
+ const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED : & str =
117
+ "special word boundary assertion is unrecognized" ;
118
// Returned by `maybe_parse_special_word_boundary` when the pattern ends right
// after the `{` that follows `\b` (e.g. `\b{`), before it can be decided
// whether this is a special word boundary or a counted repetition.
+ const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF : & str =
119
+ "found start of special word boundary or repetition without an end" ;
114
120
115
121
/// A regular expression parser.
116
122
///
@@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
479
485
'v' => special ( '\x0B' ) ,
480
486
'A' => Ok ( Hir :: look ( hir:: Look :: Start ) ) ,
481
487
'z' => Ok ( Hir :: look ( hir:: Look :: End ) ) ,
482
- 'b' => Ok ( Hir :: look ( hir:: Look :: Word ) ) ,
488
+ 'b' => {
489
+ let mut hir = Hir :: look ( hir:: Look :: Word ) ;
490
+ if !self . is_done ( ) && self . char ( ) == '{' {
491
+ if let Some ( special) =
492
+ self . maybe_parse_special_word_boundary ( ) ?
493
+ {
494
+ hir = special;
495
+ }
496
+ }
497
+ Ok ( hir)
498
+ }
483
499
'B' => Ok ( Hir :: look ( hir:: Look :: WordNegate ) ) ,
500
+ '<' => Ok ( Hir :: look ( hir:: Look :: WordStart ) ) ,
501
+ '>' => Ok ( Hir :: look ( hir:: Look :: WordEnd ) ) ,
484
502
_ => Err ( Error :: new ( ERR_ESCAPE_UNRECOGNIZED ) ) ,
485
503
}
486
504
}
487
505
506
+ /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
507
+ /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
508
+ ///
509
+ /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
510
+ /// if it fails it will just return `None` with no error. This is done
511
+ /// because `\b{5}` is a valid expression and we want to let that be parsed
512
+ /// by the existing counted repetition parsing code. (I thought about just
513
+ /// invoking the counted repetition code from here, but it seemed a little
514
+ /// ham-fisted.)
515
+ ///
516
+ /// Unlike `maybe_parse_ascii_class` though, this can return an error.
517
+ /// Namely, if we definitely know it isn't a counted repetition, then we
518
+ /// return an error specific to the specialty word boundaries.
519
+ ///
520
+ /// This assumes the parser is positioned at a `{` immediately following
521
+ /// a `\b`. When `None` is returned, the parser is returned to the position
522
+ /// at which it started: pointing at a `{`.
523
+ ///
524
+ /// The position given should correspond to the start of the `\b`.
525
+ fn maybe_parse_special_word_boundary ( & self ) -> Result < Option < Hir > , Error > {
526
+ assert_eq ! ( self . char ( ) , '{' ) ;
527
+
528
+ let is_valid_char = |c| match c {
529
+ 'A' ..='Z' | 'a' ..='z' | '-' => true ,
530
+ _ => false ,
531
+ } ;
532
+ let start = self . pos ( ) ;
533
+ if !self . bump_and_bump_space ( ) {
534
+ return Err ( Error :: new ( ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF ) ) ;
535
+ }
536
+ // This is one of the critical bits: if the first non-whitespace
537
+ // character isn't in [-A-Za-z] (i.e., this can't be a special word
538
+ // boundary), then we bail and let the counted repetition parser deal
539
+ // with this.
540
+ if !is_valid_char ( self . char ( ) ) {
541
+ self . pos . set ( start) ;
542
+ self . char . set ( Some ( '{' ) ) ;
543
+ return Ok ( None ) ;
544
+ }
545
+
546
+ // Now collect up our chars until we see a '}'.
547
+ let mut scratch = String :: new ( ) ;
548
+ while !self . is_done ( ) && is_valid_char ( self . char ( ) ) {
549
+ scratch. push ( self . char ( ) ) ;
550
+ self . bump_and_bump_space ( ) ;
551
+ }
552
+ if self . is_done ( ) || self . char ( ) != '}' {
553
+ return Err ( Error :: new ( ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED ) ) ;
554
+ }
555
+ self . bump ( ) ;
556
+ let kind = match scratch. as_str ( ) {
557
+ "start" => hir:: Look :: WordStart ,
558
+ "end" => hir:: Look :: WordEnd ,
559
+ "start-half" => hir:: Look :: WordStartHalf ,
560
+ "end-half" => hir:: Look :: WordEndHalf ,
561
+ _ => {
562
+ return Err ( Error :: new ( ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED ) )
563
+ }
564
+ } ;
565
+ Ok ( Some ( Hir :: look ( kind) ) )
566
+ }
567
+
488
568
/// Parse a hex representation of a Unicode codepoint. This handles both
489
569
/// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
490
570
/// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
@@ -1948,8 +2028,6 @@ bar
1948
2028
assert_eq ! ( ERR_UNICODE_CLASS_UNSUPPORTED , perr( r"\pL" ) ) ;
1949
2029
assert_eq ! ( ERR_UNICODE_CLASS_UNSUPPORTED , perr( r"\p{L}" ) ) ;
1950
2030
assert_eq ! ( ERR_ESCAPE_UNRECOGNIZED , perr( r"\i" ) ) ;
1951
- assert_eq ! ( ERR_ESCAPE_UNRECOGNIZED , perr( r"\<" ) ) ;
1952
- assert_eq ! ( ERR_ESCAPE_UNRECOGNIZED , perr( r"\>" ) ) ;
1953
2031
assert_eq ! ( ERR_UNCOUNTED_REP_SUB_MISSING , perr( r"?" ) ) ;
1954
2032
assert_eq ! ( ERR_UNCOUNTED_REP_SUB_MISSING , perr( r"*" ) ) ;
1955
2033
assert_eq ! ( ERR_UNCOUNTED_REP_SUB_MISSING , perr( r"+" ) ) ;
@@ -1983,6 +2061,11 @@ bar
1983
2061
assert_eq ! ( ERR_CLASS_INTERSECTION_UNSUPPORTED , perr( r"[a&&b]" ) ) ;
1984
2062
assert_eq ! ( ERR_CLASS_DIFFERENCE_UNSUPPORTED , perr( r"[a--b]" ) ) ;
1985
2063
assert_eq ! ( ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED , perr( r"[a~~b]" ) ) ;
2064
+ assert_eq ! ( ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED , perr( r"\b{foo" ) ) ;
2065
+ assert_eq ! ( ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED , perr( r"\b{foo!}" ) ) ;
2066
+ assert_eq ! ( ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED , perr( r"\b{foo}" ) ) ;
2067
+ assert_eq ! ( ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF , perr( r"\b{" ) ) ;
2068
+ assert_eq ! ( ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF , perr( r"(?x)\b{ " ) ) ;
1986
2069
}
1987
2070
1988
2071
#[ test]
0 commit comments