Skip to content

Commit f9fc861

Browse files
committed
Optimize case folding.
When `regex-syntax` is compiled in debug mode, case folding can take a significant amount of time. This path is easily triggered by using case-insensitive regexes. This commit speeds up case folding of a character range by skipping the binary search for characters that are known to precede the next entry in the case folding table, although it is still not optimal. It could probably benefit from a fresh approach, but let's leave it alone for now.
1 parent 7a72b1f commit f9fc861

File tree

1 file changed

+28
-14
lines changed

1 file changed

+28
-14
lines changed

regex-syntax/src/lib.rs

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -482,17 +482,24 @@ impl ClassRange {
482482
/// canonical order.
483483
fn case_fold(self) -> Vec<ClassRange> {
484484
let (s, e) = (self.start as u32, self.end as u32 + 1);
485-
let folded = (s..e).filter_map(char::from_u32).map(simple_case_fold);
486-
ClassRange::ranges(folded)
487-
}
488-
489-
/// Turns a non-empty sequence of sorted characters into a sequence of
490-
/// class ranges in canonical format/order.
491-
fn ranges<I: Iterator<Item=char>>(mut chars: I) -> Vec<ClassRange> {
492-
let mut ranges = Vec::with_capacity(100);
493-
let mut start = chars.next().expect("non-empty char iterator");
485+
let mut start = simple_case_fold(self.start);
494486
let mut end = start;
495-
for c in chars {
487+
let mut next_case_fold = self.start;
488+
let mut ranges = Vec::with_capacity(100);
489+
for mut c in (s+1..e).filter_map(char::from_u32) {
490+
if c >= next_case_fold {
491+
c = match simple_case_fold_result(c) {
492+
Ok(i) => case_folding::C_plus_S_table[i].1,
493+
Err(i) => {
494+
if i < case_folding::C_plus_S_table.len() {
495+
next_case_fold = case_folding::C_plus_S_table[i].0;
496+
} else {
497+
next_case_fold = '\u{10FFFF}'
498+
}
499+
c
500+
}
501+
};
502+
}
496503
if c != inc_char(end) {
497504
ranges.push(ClassRange::new(start, end));
498505
start = c;
@@ -886,10 +893,17 @@ impl fmt::Display for ErrorKind {
886893
/// expose it because it is used inside the various Regex engines.
887894
#[doc(hidden)]
888895
pub fn simple_case_fold(c: char) -> char {
889-
match case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c)) {
890-
Ok(i) => case_folding::C_plus_S_table[i].1,
891-
Err(_) => c,
892-
}
896+
simple_case_fold_result(c)
897+
.map(|i| case_folding::C_plus_S_table[i].1)
898+
.unwrap_or(c)
899+
}
900+
901+
/// The result of binary search on the simple case folding table.
902+
///
903+
/// This level of detail is exposed so that we can do case folding on a
904+
/// range of characters efficiently.
905+
fn simple_case_fold_result(c: char) -> ::std::result::Result<usize, usize> {
906+
case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c))
893907
}
894908

895909
/// Escapes all regular expression meta characters in `text`.

0 commit comments

Comments
 (0)