Skip to content

Commit 1f707e7

Browse files
committed
syntax: flatten look-around assertions
Instead of having both 'HirKind::Anchor' and 'HirKind::WordBoundary', this patch flattens them into one 'HirKind::Look'. Why do this? I think they make more sense grouped together. Namely, they are all simplistic look-around assertions and they all tend to be handled with very similar logic.
1 parent 00ea571 commit 1f707e7

File tree

5 files changed

+166
-208
lines changed

5 files changed

+166
-208
lines changed

regex-syntax/src/hir/literal/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ fn prefixes(expr: &Hir, lits: &mut Literals) {
627627
HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
628628
HirKind::Concat(ref es) => {
629629
for e in es {
630-
if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
630+
if let HirKind::Look(hir::Look::Start) = *e.kind() {
631631
if !lits.is_empty() {
632632
lits.cut();
633633
break;
@@ -703,7 +703,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
703703
HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
704704
HirKind::Concat(ref es) => {
705705
for e in es.iter().rev() {
706-
if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
706+
if let HirKind::Look(hir::Look::End) = *e.kind() {
707707
if !lits.is_empty() {
708708
lits.cut();
709709
break;

regex-syntax/src/hir/mod.rs

Lines changed: 39 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,8 @@ pub enum HirKind {
175175
/// class. A class can either consist of Unicode scalar values as
176176
/// characters, or it can use bytes.
177177
Class(Class),
178-
/// An anchor assertion. An anchor assertion match always has zero length.
179-
Anchor(Anchor),
180-
/// A word boundary assertion, which may or may not be Unicode aware. A
181-
/// word boundary assertion match always has zero length.
182-
WordBoundary(WordBoundary),
178+
/// A look-around assertion. A look-around match always has zero length.
179+
Look(Look),
183180
/// A repetition operation applied to a child expression.
184181
Repetition(Repetition),
185182
/// A possibly capturing group, which contains a child expression.
@@ -271,8 +268,8 @@ impl Hir {
271268
Hir { kind: HirKind::Class(class), info }
272269
}
273270

274-
/// Creates an anchor assertion HIR expression.
275-
pub fn anchor(anchor: Anchor) -> Hir {
271+
/// Creates a look-around assertion HIR expression.
272+
pub fn look(look: Look) -> Hir {
276273
let mut info = HirInfo::new();
277274
info.set_always_utf8(true);
278275
info.set_all_assertions(true);
@@ -282,53 +279,34 @@ impl Hir {
282279
info.set_line_anchored_end(false);
283280
info.set_any_anchored_start(false);
284281
info.set_any_anchored_end(false);
282+
// All look-around assertions always produce zero-length or "empty"
283+
// matches. This is true even though not all of them (like \b) match
284+
// the empty string itself. That is, '\b' does not match ''. But it
285+
// does match the empty string between '!' and 'a' in '!a'.
285286
info.set_match_empty(true);
286287
info.set_literal(false);
287288
info.set_alternation_literal(false);
288-
if let Anchor::StartText = anchor {
289+
if let Look::Start = look {
289290
info.set_anchored_start(true);
290291
info.set_line_anchored_start(true);
291292
info.set_any_anchored_start(true);
292293
}
293-
if let Anchor::EndText = anchor {
294+
if let Look::End = look {
294295
info.set_anchored_end(true);
295296
info.set_line_anchored_end(true);
296297
info.set_any_anchored_end(true);
297298
}
298-
if let Anchor::StartLine = anchor {
299+
if let Look::StartLF = look {
299300
info.set_line_anchored_start(true);
300301
}
301-
if let Anchor::EndLine = anchor {
302+
if let Look::EndLF = look {
302303
info.set_line_anchored_end(true);
303304
}
304-
Hir { kind: HirKind::Anchor(anchor), info }
305-
}
306-
307-
/// Creates a word boundary assertion HIR expression.
308-
pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
309-
let mut info = HirInfo::new();
310-
info.set_always_utf8(true);
311-
info.set_all_assertions(true);
312-
info.set_anchored_start(false);
313-
info.set_anchored_end(false);
314-
info.set_line_anchored_start(false);
315-
info.set_line_anchored_end(false);
316-
info.set_any_anchored_start(false);
317-
info.set_any_anchored_end(false);
318-
info.set_literal(false);
319-
info.set_alternation_literal(false);
320-
// A negated word boundary matches '', so that's fine. But \b does not
321-
// match \b, so why do we say it can match the empty string? Well,
322-
// because, if you search for \b against 'a', it will report [0, 0) and
323-
// [1, 1) as matches, and both of those matches correspond to the empty
324-
// string. Thus, only *certain* empty strings match \b, which similarly
325-
// applies to \B.
326-
info.set_match_empty(true);
327-
// Negated ASCII word boundaries can match invalid UTF-8.
328-
if let WordBoundary::AsciiNegate = word_boundary {
305+
if let Look::WordAsciiNegate = look {
306+
// Negated ASCII word boundaries can match invalid UTF-8.
329307
info.set_always_utf8(false);
330308
}
331-
Hir { kind: HirKind::WordBoundary(word_boundary), info }
309+
Hir { kind: HirKind::Look(look), info }
332310
}
333311

334312
/// Creates a repetition HIR expression.
@@ -697,8 +675,7 @@ impl HirKind {
697675
HirKind::Empty
698676
| HirKind::Literal(_)
699677
| HirKind::Class(_)
700-
| HirKind::Anchor(_)
701-
| HirKind::WordBoundary(_) => false,
678+
| HirKind::Look(_) => false,
702679
HirKind::Group(_)
703680
| HirKind::Repetition(_)
704681
| HirKind::Concat(_)
@@ -1313,44 +1290,37 @@ impl core::fmt::Debug for ClassBytesRange {
13131290
}
13141291
}
13151292

1316-
/// The high-level intermediate representation for an anchor assertion.
1293+
/// The high-level intermediate representation for a look-around assertion.
13171294
///
1318-
/// A matching anchor assertion is always zero-length.
1295+
/// An assertion match is always zero-length. Also called an "empty match."
13191296
#[derive(Clone, Debug, Eq, PartialEq)]
1320-
pub enum Anchor {
1321-
/// Match the beginning of a line or the beginning of text. Specifically,
1322-
/// this matches at the starting position of the input, or at the position
1323-
/// immediately following a `\n` character.
1324-
StartLine,
1325-
/// Match the end of a line or the end of text. Specifically,
1326-
/// this matches at the end position of the input, or at the position
1327-
/// immediately preceding a `\n` character.
1328-
EndLine,
1297+
pub enum Look {
13291298
/// Match the beginning of text. Specifically, this matches at the starting
13301299
/// position of the input.
1331-
StartText,
1300+
Start,
13321301
/// Match the end of text. Specifically, this matches at the ending
13331302
/// position of the input.
1334-
EndText,
1335-
}
1336-
1337-
/// The high-level intermediate representation for a word-boundary assertion.
1338-
///
1339-
/// A matching word boundary assertion is always zero-length.
1340-
#[derive(Clone, Debug, Eq, PartialEq)]
1341-
pub enum WordBoundary {
1342-
/// Match a Unicode-aware word boundary. That is, this matches a position
1343-
/// where the left adjacent character and right adjacent character
1344-
/// correspond to a word and non-word or a non-word and word character.
1345-
Unicode,
1346-
/// Match a Unicode-aware negation of a word boundary.
1347-
UnicodeNegate,
1303+
End,
1304+
/// Match the beginning of a line or the beginning of text. Specifically,
1305+
/// this matches at the starting position of the input, or at the position
1306+
/// immediately following a `\n` character.
1307+
StartLF,
1308+
/// Match the end of a line or the end of text. Specifically, this matches
1309+
/// at the end position of the input, or at the position immediately
1310+
/// preceding a `\n` character.
1311+
EndLF,
13481312
/// Match an ASCII-only word boundary. That is, this matches a position
13491313
/// where the left adjacent character and right adjacent character
13501314
/// correspond to a word and non-word or a non-word and word character.
1351-
Ascii,
1315+
WordAscii,
13521316
/// Match an ASCII-only negation of a word boundary.
1353-
AsciiNegate,
1317+
WordAsciiNegate,
1318+
/// Match a Unicode-aware word boundary. That is, this matches a position
1319+
/// where the left adjacent character and right adjacent character
1320+
/// correspond to a word and non-word or a non-word and word character.
1321+
WordUnicode,
1322+
/// Match a Unicode-aware negation of a word boundary.
1323+
WordUnicodeNegate,
13541324
}
13551325

13561326
/// The high-level intermediate representation for a group.
@@ -1461,8 +1431,7 @@ impl Drop for Hir {
14611431
HirKind::Empty
14621432
| HirKind::Literal(_)
14631433
| HirKind::Class(_)
1464-
| HirKind::Anchor(_)
1465-
| HirKind::WordBoundary(_) => return,
1434+
| HirKind::Look(_) => return,
14661435
HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
14671436
HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
14681437
HirKind::Concat(ref x) if x.is_empty() => return,
@@ -1476,8 +1445,7 @@ impl Drop for Hir {
14761445
HirKind::Empty
14771446
| HirKind::Literal(_)
14781447
| HirKind::Class(_)
1479-
| HirKind::Anchor(_)
1480-
| HirKind::WordBoundary(_) => {}
1448+
| HirKind::Look(_) => {}
14811449
HirKind::Group(ref mut x) => {
14821450
stack.push(mem::replace(&mut x.hir, Hir::empty()));
14831451
}

regex-syntax/src/hir/print.rs

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -125,30 +125,32 @@ impl<W: fmt::Write> Visitor for Writer<W> {
125125
}
126126
self.wtr.write_str("])")?;
127127
}
128-
HirKind::Anchor(hir::Anchor::StartLine) => {
129-
self.wtr.write_str("(?m:^)")?;
130-
}
131-
HirKind::Anchor(hir::Anchor::EndLine) => {
132-
self.wtr.write_str("(?m:$)")?;
133-
}
134-
HirKind::Anchor(hir::Anchor::StartText) => {
135-
self.wtr.write_str(r"\A")?;
136-
}
137-
HirKind::Anchor(hir::Anchor::EndText) => {
138-
self.wtr.write_str(r"\z")?;
139-
}
140-
HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
141-
self.wtr.write_str(r"\b")?;
142-
}
143-
HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
144-
self.wtr.write_str(r"\B")?;
145-
}
146-
HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
147-
self.wtr.write_str(r"(?-u:\b)")?;
148-
}
149-
HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
150-
self.wtr.write_str(r"(?-u:\B)")?;
151-
}
128+
HirKind::Look(ref look) => match *look {
129+
hir::Look::Start => {
130+
self.wtr.write_str(r"\A")?;
131+
}
132+
hir::Look::End => {
133+
self.wtr.write_str(r"\z")?;
134+
}
135+
hir::Look::StartLF => {
136+
self.wtr.write_str("(?m:^)")?;
137+
}
138+
hir::Look::EndLF => {
139+
self.wtr.write_str("(?m:$)")?;
140+
}
141+
hir::Look::WordAscii => {
142+
self.wtr.write_str(r"(?-u:\b)")?;
143+
}
144+
hir::Look::WordAsciiNegate => {
145+
self.wtr.write_str(r"(?-u:\B)")?;
146+
}
147+
hir::Look::WordUnicode => {
148+
self.wtr.write_str(r"\b")?;
149+
}
150+
hir::Look::WordUnicodeNegate => {
151+
self.wtr.write_str(r"\B")?;
152+
}
153+
},
152154
HirKind::Group(ref x) => match x.kind {
153155
hir::GroupKind::Capture { ref name, .. } => {
154156
self.wtr.write_str("(")?;
@@ -170,8 +172,7 @@ impl<W: fmt::Write> Visitor for Writer<W> {
170172
HirKind::Empty
171173
| HirKind::Literal(_)
172174
| HirKind::Class(_)
173-
| HirKind::Anchor(_)
174-
| HirKind::WordBoundary(_)
175+
| HirKind::Look(_)
175176
| HirKind::Concat(_)
176177
| HirKind::Alternation(_) => {}
177178
HirKind::Repetition(ref x) => {

regex-syntax/src/hir/translate.rs

Lines changed: 33 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -722,30 +722,26 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
722722
let unicode = self.flags().unicode();
723723
let multi_line = self.flags().multi_line();
724724
Ok(match asst.kind {
725-
ast::AssertionKind::StartLine => Hir::anchor(if multi_line {
726-
hir::Anchor::StartLine
725+
ast::AssertionKind::StartLine => Hir::look(if multi_line {
726+
hir::Look::StartLF
727727
} else {
728-
hir::Anchor::StartText
728+
hir::Look::Start
729729
}),
730-
ast::AssertionKind::EndLine => Hir::anchor(if multi_line {
731-
hir::Anchor::EndLine
730+
ast::AssertionKind::EndLine => Hir::look(if multi_line {
731+
hir::Look::EndLF
732732
} else {
733-
hir::Anchor::EndText
733+
hir::Look::End
734+
}),
735+
ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
736+
ast::AssertionKind::EndText => Hir::look(hir::Look::End),
737+
ast::AssertionKind::WordBoundary => Hir::look(if unicode {
738+
hir::Look::WordUnicode
739+
} else {
740+
hir::Look::WordAscii
734741
}),
735-
ast::AssertionKind::StartText => {
736-
Hir::anchor(hir::Anchor::StartText)
737-
}
738-
ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText),
739-
ast::AssertionKind::WordBoundary => {
740-
Hir::word_boundary(if unicode {
741-
hir::WordBoundary::Unicode
742-
} else {
743-
hir::WordBoundary::Ascii
744-
})
745-
}
746742
ast::AssertionKind::NotWordBoundary => {
747-
Hir::word_boundary(if unicode {
748-
hir::WordBoundary::UnicodeNegate
743+
Hir::look(if unicode {
744+
hir::Look::WordUnicodeNegate
749745
} else {
750746
// It is possible for negated ASCII word boundaries to
751747
// match at invalid UTF-8 boundaries, even when searching
@@ -755,7 +751,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
755751
self.error(asst.span, ErrorKind::InvalidUtf8)
756752
);
757753
}
758-
hir::WordBoundary::AsciiNegate
754+
hir::Look::WordAsciiNegate
759755
})
760756
}
761757
})
@@ -1364,12 +1360,8 @@ mod tests {
13641360
}
13651361
}
13661362

1367-
fn hir_anchor(anchor: hir::Anchor) -> Hir {
1368-
Hir::anchor(anchor)
1369-
}
1370-
1371-
fn hir_word(wb: hir::WordBoundary) -> Hir {
1372-
Hir::word_boundary(wb)
1363+
fn hir_look(look: hir::Look) -> Hir {
1364+
Hir::look(look)
13731365
}
13741366

13751367
#[test]
@@ -1563,22 +1555,19 @@ mod tests {
15631555

15641556
#[test]
15651557
fn assertions() {
1566-
assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText));
1567-
assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText));
1568-
assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText));
1569-
assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText));
1570-
assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine));
1571-
assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine));
1572-
assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText));
1573-
assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText));
1558+
assert_eq!(t("^"), hir_look(hir::Look::Start));
1559+
assert_eq!(t("$"), hir_look(hir::Look::End));
1560+
assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
1561+
assert_eq!(t(r"\z"), hir_look(hir::Look::End));
1562+
assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
1563+
assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
1564+
assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
1565+
assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
15741566

1575-
assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
1576-
assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
1577-
assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
1578-
assert_eq!(
1579-
t_bytes(r"(?-u)\B"),
1580-
hir_word(hir::WordBoundary::AsciiNegate)
1581-
);
1567+
assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
1568+
assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
1569+
assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
1570+
assert_eq!(t_bytes(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
15821571

15831572
assert_eq!(
15841573
t_err(r"(?-u)\B"),
@@ -1693,17 +1682,17 @@ mod tests {
16931682
t("(?im)a^"),
16941683
hir_cat(vec![
16951684
hir_uclass(&[('A', 'A'), ('a', 'a')]),
1696-
hir_anchor(hir::Anchor::StartLine),
1685+
hir_look(hir::Look::StartLF),
16971686
])
16981687
);
16991688
#[cfg(feature = "unicode-case")]
17001689
assert_eq!(
17011690
t("(?im)a^(?i-m)a^"),
17021691
hir_cat(vec![
17031692
hir_uclass(&[('A', 'A'), ('a', 'a')]),
1704-
hir_anchor(hir::Anchor::StartLine),
1693+
hir_look(hir::Look::StartLF),
17051694
hir_uclass(&[('A', 'A'), ('a', 'a')]),
1706-
hir_anchor(hir::Anchor::StartText),
1695+
hir_look(hir::Look::Start),
17071696
])
17081697
);
17091698
assert_eq!(

0 commit comments

Comments
 (0)