syntax: flatten look-around assertions

BurntSushi · BurntSushi · commit 1f707e7bc4be · 2023-04-17T14:49:44.000-04:00
Instead of having both 'HirKind::Anchor' and 'HirKind::WordBoundary',
this patch flattens them into one 'HirKind::Look'.

Why do this? I think they make more sense grouped together. Namely, they
are all simplistic look-around assertions and they all tend to be
handled with very similar logic.
diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs
@@ -627,7 +627,7 @@ fn prefixes(expr: &Hir, lits: &mut Literals) {
         HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
         HirKind::Concat(ref es) => {
             for e in es {
-                if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
+                if let HirKind::Look(hir::Look::Start) = *e.kind() {
                     if !lits.is_empty() {
                         lits.cut();
                         break;
@@ -703,7 +703,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
         HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
         HirKind::Concat(ref es) => {
             for e in es.iter().rev() {
-                if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
+                if let HirKind::Look(hir::Look::End) = *e.kind() {
                     if !lits.is_empty() {
                         lits.cut();
                         break;
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -175,11 +175,8 @@ pub enum HirKind {
     /// class. A class can either consist of Unicode scalar values as
     /// characters, or it can use bytes.
     Class(Class),
-    /// An anchor assertion. An anchor assertion match always has zero length.
-    Anchor(Anchor),
-    /// A word boundary assertion, which may or may not be Unicode aware. A
-    /// word boundary assertion match always has zero length.
-    WordBoundary(WordBoundary),
+    /// A look-around assertion. A look-around match always has zero length.
+    Look(Look),
     /// A repetition operation applied to a child expression.
     Repetition(Repetition),
     /// A possibly capturing group, which contains a child expression.
@@ -271,8 +268,8 @@ impl Hir {
         Hir { kind: HirKind::Class(class), info }
     }
 
-    /// Creates an anchor assertion HIR expression.
-    pub fn anchor(anchor: Anchor) -> Hir {
+    /// Creates a look-around assertion HIR expression.
+    pub fn look(look: Look) -> Hir {
         let mut info = HirInfo::new();
         info.set_always_utf8(true);
         info.set_all_assertions(true);
@@ -282,53 +279,34 @@ impl Hir {
         info.set_line_anchored_end(false);
         info.set_any_anchored_start(false);
         info.set_any_anchored_end(false);
+        // All look-around assertions always produce zero-length or "empty"
+        // matches. This is true even though not all of them (like \b) match
+        // the empty string itself. That is, '\b' does not match ''. But it
+        // does match the empty string between '!' and 'a' in '!a'.
         info.set_match_empty(true);
         info.set_literal(false);
         info.set_alternation_literal(false);
-        if let Anchor::StartText = anchor {
+        if let Look::Start = look {
             info.set_anchored_start(true);
             info.set_line_anchored_start(true);
             info.set_any_anchored_start(true);
         }
-        if let Anchor::EndText = anchor {
+        if let Look::End = look {
             info.set_anchored_end(true);
             info.set_line_anchored_end(true);
             info.set_any_anchored_end(true);
         }
-        if let Anchor::StartLine = anchor {
+        if let Look::StartLF = look {
             info.set_line_anchored_start(true);
         }
-        if let Anchor::EndLine = anchor {
+        if let Look::EndLF = look {
             info.set_line_anchored_end(true);
         }
-        Hir { kind: HirKind::Anchor(anchor), info }
-    }
-
-    /// Creates a word boundary assertion HIR expression.
-    pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
-        let mut info = HirInfo::new();
-        info.set_always_utf8(true);
-        info.set_all_assertions(true);
-        info.set_anchored_start(false);
-        info.set_anchored_end(false);
-        info.set_line_anchored_start(false);
-        info.set_line_anchored_end(false);
-        info.set_any_anchored_start(false);
-        info.set_any_anchored_end(false);
-        info.set_literal(false);
-        info.set_alternation_literal(false);
-        // A negated word boundary matches '', so that's fine. But \b does not
-        // match \b, so why do we say it can match the empty string? Well,
-        // because, if you search for \b against 'a', it will report [0, 0) and
-        // [1, 1) as matches, and both of those matches correspond to the empty
-        // string. Thus, only *certain* empty strings match \b, which similarly
-        // applies to \B.
-        info.set_match_empty(true);
-        // Negated ASCII word boundaries can match invalid UTF-8.
-        if let WordBoundary::AsciiNegate = word_boundary {
+        if let Look::WordAsciiNegate = look {
+            // Negated ASCII word boundaries can match invalid UTF-8.
             info.set_always_utf8(false);
         }
-        Hir { kind: HirKind::WordBoundary(word_boundary), info }
+        Hir { kind: HirKind::Look(look), info }
     }
 
     /// Creates a repetition HIR expression.
@@ -697,8 +675,7 @@ impl HirKind {
             HirKind::Empty
             | HirKind::Literal(_)
             | HirKind::Class(_)
-            | HirKind::Anchor(_)
-            | HirKind::WordBoundary(_) => false,
+            | HirKind::Look(_) => false,
             HirKind::Group(_)
             | HirKind::Repetition(_)
             | HirKind::Concat(_)
@@ -1313,44 +1290,37 @@ impl core::fmt::Debug for ClassBytesRange {
     }
 }
 
-/// The high-level intermediate representation for an anchor assertion.
+/// The high-level intermediate representation for a look-around assertion.
 ///
-/// A matching anchor assertion is always zero-length.
+/// An assertion match is always zero-length. Also called an "empty match."
 #[derive(Clone, Debug, Eq, PartialEq)]
-pub enum Anchor {
-    /// Match the beginning of a line or the beginning of text. Specifically,
-    /// this matches at the starting position of the input, or at the position
-    /// immediately following a `\n` character.
-    StartLine,
-    /// Match the end of a line or the end of text. Specifically,
-    /// this matches at the end position of the input, or at the position
-    /// immediately preceding a `\n` character.
-    EndLine,
+pub enum Look {
     /// Match the beginning of text. Specifically, this matches at the starting
     /// position of the input.
-    StartText,
+    Start,
     /// Match the end of text. Specifically, this matches at the ending
     /// position of the input.
-    EndText,
-}
-
-/// The high-level intermediate representation for a word-boundary assertion.
-///
-/// A matching word boundary assertion is always zero-length.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum WordBoundary {
-    /// Match a Unicode-aware word boundary. That is, this matches a position
-    /// where the left adjacent character and right adjacent character
-    /// correspond to a word and non-word or a non-word and word character.
-    Unicode,
-    /// Match a Unicode-aware negation of a word boundary.
-    UnicodeNegate,
+    End,
+    /// Match the beginning of a line or the beginning of text. Specifically,
+    /// this matches at the starting position of the input, or at the position
+    /// immediately following a `\n` character.
+    StartLF,
+    /// Match the end of a line or the end of text. Specifically, this matches
+    /// at the end position of the input, or at the position immediately
+    /// preceding a `\n` character.
+    EndLF,
     /// Match an ASCII-only word boundary. That is, this matches a position
     /// where the left adjacent character and right adjacent character
     /// correspond to a word and non-word or a non-word and word character.
-    Ascii,
+    WordAscii,
     /// Match an ASCII-only negation of a word boundary.
-    AsciiNegate,
+    WordAsciiNegate,
+    /// Match a Unicode-aware word boundary. That is, this matches a position
+    /// where the left adjacent character and right adjacent character
+    /// correspond to a word and non-word or a non-word and word character.
+    WordUnicode,
+    /// Match a Unicode-aware negation of a word boundary.
+    WordUnicodeNegate,
 }
 
 /// The high-level intermediate representation for a group.
@@ -1461,8 +1431,7 @@ impl Drop for Hir {
             HirKind::Empty
             | HirKind::Literal(_)
             | HirKind::Class(_)
-            | HirKind::Anchor(_)
-            | HirKind::WordBoundary(_) => return,
+            | HirKind::Look(_) => return,
             HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
             HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
             HirKind::Concat(ref x) if x.is_empty() => return,
@@ -1476,8 +1445,7 @@ impl Drop for Hir {
                 HirKind::Empty
                 | HirKind::Literal(_)
                 | HirKind::Class(_)
-                | HirKind::Anchor(_)
-                | HirKind::WordBoundary(_) => {}
+                | HirKind::Look(_) => {}
                 HirKind::Group(ref mut x) => {
                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
                 }
diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
@@ -125,30 +125,32 @@ impl<W: fmt::Write> Visitor for Writer<W> {
                 }
                 self.wtr.write_str("])")?;
             }
-            HirKind::Anchor(hir::Anchor::StartLine) => {
-                self.wtr.write_str("(?m:^)")?;
-            }
-            HirKind::Anchor(hir::Anchor::EndLine) => {
-                self.wtr.write_str("(?m:$)")?;
-            }
-            HirKind::Anchor(hir::Anchor::StartText) => {
-                self.wtr.write_str(r"\A")?;
-            }
-            HirKind::Anchor(hir::Anchor::EndText) => {
-                self.wtr.write_str(r"\z")?;
-            }
-            HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
-                self.wtr.write_str(r"\b")?;
-            }
-            HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
-                self.wtr.write_str(r"\B")?;
-            }
-            HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
-                self.wtr.write_str(r"(?-u:\b)")?;
-            }
-            HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
-                self.wtr.write_str(r"(?-u:\B)")?;
-            }
+            HirKind::Look(ref look) => match *look {
+                hir::Look::Start => {
+                    self.wtr.write_str(r"\A")?;
+                }
+                hir::Look::End => {
+                    self.wtr.write_str(r"\z")?;
+                }
+                hir::Look::StartLF => {
+                    self.wtr.write_str("(?m:^)")?;
+                }
+                hir::Look::EndLF => {
+                    self.wtr.write_str("(?m:$)")?;
+                }
+                hir::Look::WordAscii => {
+                    self.wtr.write_str(r"(?-u:\b)")?;
+                }
+                hir::Look::WordAsciiNegate => {
+                    self.wtr.write_str(r"(?-u:\B)")?;
+                }
+                hir::Look::WordUnicode => {
+                    self.wtr.write_str(r"\b")?;
+                }
+                hir::Look::WordUnicodeNegate => {
+                    self.wtr.write_str(r"\B")?;
+                }
+            },
             HirKind::Group(ref x) => match x.kind {
                 hir::GroupKind::Capture { ref name, .. } => {
                     self.wtr.write_str("(")?;
@@ -170,8 +172,7 @@ impl<W: fmt::Write> Visitor for Writer<W> {
             HirKind::Empty
             | HirKind::Literal(_)
             | HirKind::Class(_)
-            | HirKind::Anchor(_)
-            | HirKind::WordBoundary(_)
+            | HirKind::Look(_)
             | HirKind::Concat(_)
             | HirKind::Alternation(_) => {}
             HirKind::Repetition(ref x) => {
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -722,30 +722,26 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         let unicode = self.flags().unicode();
         let multi_line = self.flags().multi_line();
         Ok(match asst.kind {
-            ast::AssertionKind::StartLine => Hir::anchor(if multi_line {
-                hir::Anchor::StartLine
+            ast::AssertionKind::StartLine => Hir::look(if multi_line {
+                hir::Look::StartLF
             } else {
-                hir::Anchor::StartText
+                hir::Look::Start
             }),
-            ast::AssertionKind::EndLine => Hir::anchor(if multi_line {
-                hir::Anchor::EndLine
+            ast::AssertionKind::EndLine => Hir::look(if multi_line {
+                hir::Look::EndLF
             } else {
-                hir::Anchor::EndText
+                hir::Look::End
+            }),
+            ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
+            ast::AssertionKind::EndText => Hir::look(hir::Look::End),
+            ast::AssertionKind::WordBoundary => Hir::look(if unicode {
+                hir::Look::WordUnicode
+            } else {
+                hir::Look::WordAscii
             }),
-            ast::AssertionKind::StartText => {
-                Hir::anchor(hir::Anchor::StartText)
-            }
-            ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText),
-            ast::AssertionKind::WordBoundary => {
-                Hir::word_boundary(if unicode {
-                    hir::WordBoundary::Unicode
-                } else {
-                    hir::WordBoundary::Ascii
-                })
-            }
             ast::AssertionKind::NotWordBoundary => {
-                Hir::word_boundary(if unicode {
-                    hir::WordBoundary::UnicodeNegate
+                Hir::look(if unicode {
+                    hir::Look::WordUnicodeNegate
                 } else {
                     // It is possible for negated ASCII word boundaries to
                     // match at invalid UTF-8 boundaries, even when searching
@@ -755,7 +751,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
                             self.error(asst.span, ErrorKind::InvalidUtf8)
                         );
                     }
-                    hir::WordBoundary::AsciiNegate
+                    hir::Look::WordAsciiNegate
                 })
             }
         })
@@ -1364,12 +1360,8 @@ mod tests {
         }
     }
 
-    fn hir_anchor(anchor: hir::Anchor) -> Hir {
-        Hir::anchor(anchor)
-    }
-
-    fn hir_word(wb: hir::WordBoundary) -> Hir {
-        Hir::word_boundary(wb)
+    fn hir_look(look: hir::Look) -> Hir {
+        Hir::look(look)
     }
 
     #[test]
@@ -1563,22 +1555,19 @@ mod tests {
 
     #[test]
     fn assertions() {
-        assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText));
-        assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText));
-        assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText));
-        assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText));
-        assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine));
-        assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine));
-        assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText));
-        assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText));
+        assert_eq!(t("^"), hir_look(hir::Look::Start));
+        assert_eq!(t("$"), hir_look(hir::Look::End));
+        assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
+        assert_eq!(t(r"\z"), hir_look(hir::Look::End));
+        assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
+        assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
+        assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
+        assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
 
-        assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
-        assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
-        assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
-        assert_eq!(
-            t_bytes(r"(?-u)\B"),
-            hir_word(hir::WordBoundary::AsciiNegate)
-        );
+        assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
+        assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
+        assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
+        assert_eq!(t_bytes(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
 
         assert_eq!(
             t_err(r"(?-u)\B"),
@@ -1693,17 +1682,17 @@ mod tests {
             t("(?im)a^"),
             hir_cat(vec![
                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
-                hir_anchor(hir::Anchor::StartLine),
+                hir_look(hir::Look::StartLF),
             ])
         );
         #[cfg(feature = "unicode-case")]
         assert_eq!(
             t("(?im)a^(?i-m)a^"),
             hir_cat(vec![
                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
-                hir_anchor(hir::Anchor::StartLine),
+                hir_look(hir::Look::StartLF),
                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
-                hir_anchor(hir::Anchor::StartText),
+                hir_look(hir::Look::Start),
             ])
         );
         assert_eq!(
diff --git a/src/compile.rs b/src/compile.rs