Skip to content

Commit d53fe59

Browse files
committed
lite: add special word boundaries to regex-lite
This was substantially easier. Coupling, private abstractions and slow code are so much easier to deal with. Ref #469
1 parent 49d67f0 commit d53fe59

File tree

4 files changed

+162
-28
lines changed

4 files changed

+162
-28
lines changed

regex-lite/src/hir/mod.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,24 @@ pub(crate) enum Look {
592592
Word = 1 << 6,
593593
/// Match an ASCII-only negation of a word boundary.
594594
WordNegate = 1 << 7,
595+
/// Match the start of an ASCII-only word boundary. That is, this matches a
596+
/// position where the following character is a word character and either
597+
/// the position is the beginning of the haystack or the previous character
598+
/// is not a word character.
599+
WordStart = 1 << 8,
600+
/// Match the end of an ASCII-only word boundary. That is, this matches
601+
/// a position where the previous character is a word character and either
602+
/// the position is the end of the haystack or the following character is
603+
/// not a word character.
604+
WordEnd = 1 << 9,
605+
/// Match the start half of an ASCII-only word boundary. That is, this
606+
/// matches a position at either the beginning of the haystack or where the
607+
/// previous character is not a word character.
608+
WordStartHalf = 1 << 10,
609+
/// Match the end half of an ASCII-only word boundary. That is, this
610+
/// matches a position at either the end of the haystack or where the
611+
/// following character is not a word character.
612+
WordEndHalf = 1 << 11,
595613
}
596614

597615
impl Look {
@@ -631,6 +649,30 @@ impl Look {
631649
at < haystack.len() && utf8::is_word_byte(haystack[at]);
632650
word_before == word_after
633651
}
652+
WordStart => {
653+
let word_before =
654+
at > 0 && utf8::is_word_byte(haystack[at - 1]);
655+
let word_after =
656+
at < haystack.len() && utf8::is_word_byte(haystack[at]);
657+
!word_before && word_after
658+
}
659+
WordEnd => {
660+
let word_before =
661+
at > 0 && utf8::is_word_byte(haystack[at - 1]);
662+
let word_after =
663+
at < haystack.len() && utf8::is_word_byte(haystack[at]);
664+
word_before && !word_after
665+
}
666+
WordStartHalf => {
667+
let word_before =
668+
at > 0 && utf8::is_word_byte(haystack[at - 1]);
669+
!word_before
670+
}
671+
WordEndHalf => {
672+
let word_after =
673+
at < haystack.len() && utf8::is_word_byte(haystack[at]);
674+
!word_after
675+
}
634676
}
635677
}
636678
}

regex-lite/src/hir/parse.rs

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111111
"character class difference is not supported";
112112
const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
113113
"character class symmetric difference is not supported";
114+
const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
115+
"special word boundary assertion is unclosed or has an invalid character";
116+
const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
117+
"special word boundary assertion is unrecognized";
118+
const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
119+
"found start of special word boundary or repetition without an end";
114120

115121
/// A regular expression parser.
116122
///
@@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
479485
'v' => special('\x0B'),
480486
'A' => Ok(Hir::look(hir::Look::Start)),
481487
'z' => Ok(Hir::look(hir::Look::End)),
482-
'b' => Ok(Hir::look(hir::Look::Word)),
488+
'b' => {
489+
let mut hir = Hir::look(hir::Look::Word);
490+
if !self.is_done() && self.char() == '{' {
491+
if let Some(special) =
492+
self.maybe_parse_special_word_boundary()?
493+
{
494+
hir = special;
495+
}
496+
}
497+
Ok(hir)
498+
}
483499
'B' => Ok(Hir::look(hir::Look::WordNegate)),
500+
'<' => Ok(Hir::look(hir::Look::WordStart)),
501+
'>' => Ok(Hir::look(hir::Look::WordEnd)),
484502
_ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
485503
}
486504
}
487505

506+
/// Attempt to parse a specialty word boundary. That is, `\b{start}`,
507+
/// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
508+
///
509+
/// This is similar to `maybe_parse_ascii_class` in that, in most cases,
510+
/// if it fails it will just return `None` with no error. This is done
511+
/// because `\b{5}` is a valid expression and we want to let that be parsed
512+
/// by the existing counted repetition parsing code. (I thought about just
513+
/// invoking the counted repetition code from here, but it seemed a little
514+
/// ham-fisted.)
515+
///
516+
/// Unlike `maybe_parse_ascii_class` though, this can return an error.
517+
/// Namely, if we definitely know it isn't a counted repetition, then we
518+
/// return an error specific to the specialty word boundaries.
519+
///
520+
/// This assumes the parser is positioned at a `{` immediately following
521+
/// a `\b`. When `None` is returned, the parser is returned to the position
522+
/// at which it started: pointing at a `{`.
523+
///
524+
/// When `Some` is returned, the parser is positioned immediately after the
/// closing `}` of the special word boundary.
525+
fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
526+
assert_eq!(self.char(), '{');
527+
528+
let is_valid_char = |c| match c {
529+
'A'..='Z' | 'a'..='z' | '-' => true,
530+
_ => false,
531+
};
532+
let start = self.pos();
533+
if !self.bump_and_bump_space() {
534+
return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
535+
}
536+
// This is one of the critical bits: if the first non-whitespace
537+
// character isn't in [-A-Za-z] (i.e., this can't be a special word
538+
// boundary), then we bail and let the counted repetition parser deal
539+
// with this.
540+
if !is_valid_char(self.char()) {
541+
self.pos.set(start);
542+
self.char.set(Some('{'));
543+
return Ok(None);
544+
}
545+
546+
// Now collect up our chars until we see a '}'.
547+
let mut scratch = String::new();
548+
while !self.is_done() && is_valid_char(self.char()) {
549+
scratch.push(self.char());
550+
self.bump_and_bump_space();
551+
}
552+
if self.is_done() || self.char() != '}' {
553+
return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
554+
}
555+
self.bump();
556+
let kind = match scratch.as_str() {
557+
"start" => hir::Look::WordStart,
558+
"end" => hir::Look::WordEnd,
559+
"start-half" => hir::Look::WordStartHalf,
560+
"end-half" => hir::Look::WordEndHalf,
561+
_ => {
562+
return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
563+
}
564+
};
565+
Ok(Some(Hir::look(kind)))
566+
}
567+
488568
/// Parse a hex representation of a Unicode codepoint. This handles both
489569
/// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
490570
/// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
@@ -1948,8 +2028,6 @@ bar
19482028
assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
19492029
assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
19502030
assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
1951-
assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<"));
1952-
assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>"));
19532031
assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
19542032
assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
19552033
assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
@@ -1983,6 +2061,11 @@ bar
19832061
assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
19842062
assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
19852063
assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
2064+
assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
2065+
assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
2066+
assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
2067+
assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
2068+
assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
19862069
}
19872070

19882071
#[test]

regex-lite/src/lib.rs

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -466,12 +466,16 @@ x{n}? exactly n x
466466
### Empty matches
467467
468468
<pre class="rust">
469-
^ the beginning of a haystack (or start-of-line with multi-line mode)
470-
$ the end of a haystack (or end-of-line with multi-line mode)
471-
\A only the beginning of a haystack (even with multi-line mode enabled)
472-
\z only the end of a haystack (even with multi-line mode enabled)
473-
\b an ASCII word boundary (\w on one side and \W, \A, or \z on other)
474-
\B not an ASCII word boundary
469+
^ the beginning of a haystack (or start-of-line with multi-line mode)
470+
$ the end of a haystack (or end-of-line with multi-line mode)
471+
\A only the beginning of a haystack (even with multi-line mode enabled)
472+
\z only the end of a haystack (even with multi-line mode enabled)
473+
\b an ASCII word boundary (\w on one side and \W, \A, or \z on other)
474+
\B not an ASCII word boundary
475+
\b{start} an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
476+
\b{end} an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
477+
\b{start-half} half of an ASCII start-of-word boundary (\W|\A on the left)
478+
\b{end-half} half of an ASCII end-of-word boundary (\W|\z on the right)
475479
</pre>
476480
477481
The empty regex is valid and matches the empty string. For example, the
@@ -581,25 +585,29 @@ Note that this includes all possible escape sequences, even ones that are
581585
documented elsewhere.
582586
583587
<pre class="rust">
584-
\* literal *, applies to all ASCII except [0-9A-Za-z<>]
585-
\a bell (\x07)
586-
\f form feed (\x0C)
587-
\t horizontal tab
588-
\n new line
589-
\r carriage return
590-
\v vertical tab (\x0B)
591-
\A matches at the beginning of a haystack
592-
\z matches at the end of a haystack
593-
\b word boundary assertion
594-
\B negated word boundary assertion
595-
\x7F hex character code (exactly two digits)
596-
\x{10FFFF} any hex character code corresponding to a Unicode code point
597-
\u007F hex character code (exactly four digits)
598-
\u{7F} any hex character code corresponding to a Unicode code point
599-
\U0000007F hex character code (exactly eight digits)
600-
\U{7F} any hex character code corresponding to a Unicode code point
601-
\d, \s, \w Perl character class
602-
\D, \S, \W negated Perl character class
588+
\* literal *, applies to all ASCII except [0-9A-Za-z<>]
589+
\a bell (\x07)
590+
\f form feed (\x0C)
591+
\t horizontal tab
592+
\n new line
593+
\r carriage return
594+
\v vertical tab (\x0B)
595+
\A matches at the beginning of a haystack
596+
\z matches at the end of a haystack
597+
\b word boundary assertion
598+
\B negated word boundary assertion
599+
\b{start}, \< start-of-word boundary assertion
600+
\b{end}, \> end-of-word boundary assertion
601+
\b{start-half} half of a start-of-word boundary assertion
602+
\b{end-half} half of an end-of-word boundary assertion
603+
\x7F hex character code (exactly two digits)
604+
\x{10FFFF} any hex character code corresponding to a Unicode code point
605+
\u007F hex character code (exactly four digits)
606+
\u{7F} any hex character code corresponding to a Unicode code point
607+
\U0000007F hex character code (exactly eight digits)
608+
\U{7F} any hex character code corresponding to a Unicode code point
609+
\d, \s, \w Perl character class
610+
\D, \S, \W negated Perl character class
603611
</pre>
604612
605613
### Perl character classes (ASCII only)

regex-lite/tests/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
3838
load!("unicode");
3939
load!("utf8");
4040
load!("word-boundary");
41+
load!("word-boundary-special");
4142
load!("fowler/basic");
4243
load!("fowler/nullsubexpr");
4344
load!("fowler/repetition");

0 commit comments

Comments
 (0)