lite: add special word boundaries to regex-lite

BurntSushi · BurntSushi · commit d53fe59524ce · 2023-10-08T14:56:53.000-04:00
This was substantially easier. Coupling, private abstractions and slow code are so much easier to deal with. Ref #469
diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs
@@ -592,6 +592,24 @@ pub(crate) enum Look {
     Word = 1 << 6,
     /// Match an ASCII-only negation of a word boundary.
     WordNegate = 1 << 7,
+    /// Match the start of an ASCII-only word boundary. That is, this matches a
+    /// position where the following character is a word character and either
+    /// the position is the beginning of the haystack or the previous
+    /// character is not a word character.
+    WordStart = 1 << 8,
+    /// Match the end of an ASCII-only word boundary. That is, this matches
+    /// a position where the previous character is a word character and
+    /// either the position is the end of the haystack or the following
+    /// character is not a word character.
+    WordEnd = 1 << 9,
+    /// Match the start half of an ASCII-only word boundary. That is, this
+    /// matches a position at either the beginning of the haystack or where the
+    /// previous character is not a word character.
+    WordStartHalf = 1 << 10,
+    /// Match the end half of an ASCII-only word boundary. That is, this
+    /// matches a position at either the end of the haystack or where the
+    /// following character is not a word character.
+    WordEndHalf = 1 << 11,
 }
 
 impl Look {
@@ -631,6 +649,30 @@ impl Look {
                     at < haystack.len() && utf8::is_word_byte(haystack[at]);
                 word_before == word_after
             }
+            WordStart => {
+                let word_before =
+                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
+                let word_after =
+                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
+                !word_before && word_after
+            }
+            WordEnd => {
+                let word_before =
+                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
+                let word_after =
+                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
+                word_before && !word_after
+            }
+            WordStartHalf => {
+                let word_before =
+                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
+                !word_before
+            }
+            WordEndHalf => {
+                let word_after =
+                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
+                !word_after
+            }
         }
     }
 }
diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs
@@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
     "character class difference is not supported";
 const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
     "character class symmetric difference is not supported";
+const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
+    "special word boundary assertion is unclosed or has an invalid character";
+const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
+    "special word boundary assertion is unrecognized";
+const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
+    "found start of special word boundary or repetition without an end";
 
 /// A regular expression parser.
 ///
@@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
             'v' => special('\x0B'),
             'A' => Ok(Hir::look(hir::Look::Start)),
             'z' => Ok(Hir::look(hir::Look::End)),
-            'b' => Ok(Hir::look(hir::Look::Word)),
+            'b' => {
+                let mut hir = Hir::look(hir::Look::Word);
+                if !self.is_done() && self.char() == '{' {
+                    if let Some(special) =
+                        self.maybe_parse_special_word_boundary()?
+                    {
+                        hir = special;
+                    }
+                }
+                Ok(hir)
+            }
             'B' => Ok(Hir::look(hir::Look::WordNegate)),
+            '<' => Ok(Hir::look(hir::Look::WordStart)),
+            '>' => Ok(Hir::look(hir::Look::WordEnd)),
             _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
         }
     }
 
+    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
+    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
+    ///
+    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
+    /// if it fails it will just return `None` with no error. This is done
+    /// because `\b{5}` is a valid expression and we want to let that be parsed
+    /// by the existing counted repetition parsing code. (I thought about just
+    /// invoking the counted repetition code from here, but it seemed a little
+    /// ham-fisted.)
+    ///
+    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
+    /// Namely, if we definitely know it isn't a counted repetition, then we
+    /// return an error specific to the specialty word boundaries.
+    ///
+    /// This assumes the parser is positioned at a `{` immediately following
+    /// a `\b`. When `None` is returned, the parser is returned to the position
+    /// at which it started: pointing at a `{`.
+    ///
+    /// When `Some` is returned, the parser is positioned just past the `}`.
+    fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
+        assert_eq!(self.char(), '{');
+
+        let is_valid_char = |c| match c {
+            'A'..='Z' | 'a'..='z' | '-' => true,
+            _ => false,
+        };
+        let start = self.pos();
+        if !self.bump_and_bump_space() {
+            return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
+        }
+        // This is one of the critical bits: if the first non-whitespace
+        // character isn't in [-A-Za-z] (i.e., this can't be a special word
+        // boundary), then we bail and let the counted repetition parser deal
+        // with this.
+        if !is_valid_char(self.char()) {
+            self.pos.set(start);
+            self.char.set(Some('{'));
+            return Ok(None);
+        }
+
+        // Now collect up our chars until we see a '}'.
+        let mut scratch = String::new();
+        while !self.is_done() && is_valid_char(self.char()) {
+            scratch.push(self.char());
+            self.bump_and_bump_space();
+        }
+        if self.is_done() || self.char() != '}' {
+            return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
+        }
+        self.bump();
+        let kind = match scratch.as_str() {
+            "start" => hir::Look::WordStart,
+            "end" => hir::Look::WordEnd,
+            "start-half" => hir::Look::WordStartHalf,
+            "end-half" => hir::Look::WordEndHalf,
+            _ => {
+                return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
+            }
+        };
+        Ok(Some(Hir::look(kind)))
+    }
+
     /// Parse a hex representation of a Unicode codepoint. This handles both
     /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
     /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
@@ -1948,8 +2028,6 @@ bar
         assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
         assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
         assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
-        assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<"));
-        assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>"));
         assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
         assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
         assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
@@ -1983,6 +2061,11 @@ bar
         assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
         assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
         assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
+        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
+        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
+        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
+        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
+        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
     }
 
     #[test]
diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs
@@ -466,12 +466,16 @@ x{n}?     exactly n x
 ### Empty matches
 
 <pre class="rust">
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    an ASCII word boundary (\w on one side and \W, \A, or \z on other)
-\B    not an ASCII word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              an ASCII word boundary (\w on one side and \W, \A, or \z on other)
+\B              not an ASCII word boundary
+\b{start}       an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}         an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of an ASCII start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of an ASCII end-of-word boundary (\W|\z on the right)
 </pre>
 
 The empty regex is valid and matches the empty string. For example, the
@@ -581,25 +585,29 @@ Note that this includes all possible escape sequences, even ones that are
 documented elsewhere.
 
 <pre class="rust">
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 </pre>
 
 ### Perl character classes (ASCII only)
diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs
@@ -38,6 +38,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
     load!("unicode");
     load!("utf8");
     load!("word-boundary");
+    load!("word-boundary-special");
     load!("fowler/basic");
     load!("fowler/nullsubexpr");
     load!("fowler/repetition");