Skip to content

Commit 793e938

Browse files
committed
progress: API and docs refresh
1 parent 52e04f4 commit 793e938

36 files changed

+5348
-1366
lines changed

Cargo.toml

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ std = [
4545
"regex-automata/std",
4646
"regex-syntax/std",
4747
]
48+
# This feature enables the 'log' crate to emit messages. This is usually
49+
# only useful for folks working on the regex crate itself, but can be useful
50+
# if you're trying hard to do some performance hacking on regex patterns
51+
# themselves. Note that you'll need to pair this with a crate like 'env_logger'
52+
# to actually emit the log messages somewhere.
53+
logging = [
54+
"aho-corasick?/logging",
55+
"regex-automata/logging",
56+
]
4857
# The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
4958
# then, it is an alias for the 'std' feature.
5059
use_std = ["std"]
@@ -64,11 +73,6 @@ perf = [
6473
"perf-inline",
6574
"perf-literal",
6675
]
67-
# Enables fast caching. (If disabled, caching is still used, but is slower.)
68-
# Currently, this feature has no effect. It used to remove the thread_local
69-
# dependency and use a slower internal cache, but now the default cache has
70-
# been improved and thread_local is no longer a dependency at all.
71-
perf-cache = []
7276
# Enables use of a lazy DFA when possible.
7377
perf-dfa = ["regex-automata/hybrid"]
7478
# Enables use of a fully compiled DFA when possible.
@@ -86,6 +90,11 @@ perf-literal = [
8690
"dep:memchr",
8791
"regex-automata/perf-literal",
8892
]
93+
# Enables fast caching. (If disabled, caching is still used, but is slower.)
94+
# Currently, this feature has no effect. It used to remove the thread_local
95+
# dependency and use a slower internal cache, but now the default cache has
96+
# been improved and thread_local is no longer a dependency at all.
97+
perf-cache = []
8998

9099

91100
# UNICODE DATA FEATURES
@@ -151,7 +160,7 @@ unstable = ["pattern"]
151160
# by default if the unstable feature is enabled.
152161
pattern = []
153162

154-
# For very fast prefix literal matching.
163+
# For very fast multi-prefix literal matching.
155164
[dependencies.aho-corasick]
156165
version = "1.0.0"
157166
optional = true
@@ -161,22 +170,22 @@ optional = true
161170
version = "2.5.0"
162171
optional = true
163172

164-
# For parsing regular expressions.
165-
[dependencies.regex-syntax]
166-
path = "regex-syntax"
167-
version = "0.7.1"
168-
default-features = false
169-
170173
# For the actual regex engines.
171174
[dependencies.regex-automata]
172175
path = "regex-automata"
173176
version = "0.3.0"
174177
default-features = false
175178
features = ["alloc", "syntax", "meta", "nfa-pikevm"]
176179

180+
# For parsing regular expressions.
181+
[dependencies.regex-syntax]
182+
path = "regex-syntax"
183+
version = "0.7.1"
184+
default-features = false
185+
177186
[dev-dependencies]
178187
# For examples.
179-
lazy_static = "1"
188+
once_cell = "1.17.1"
180189
# For property based tests.
181190
quickcheck = { version = "1.0.3", default-features = false }
182191
# To check README's example

regex-automata/src/hybrid/dfa.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1954,7 +1954,7 @@ impl Cache {
19541954
/// This panics if no search has been started by [`Cache::search_start`].
19551955
#[inline]
19561956
pub fn search_update(&mut self, at: usize) {
1957-
let mut p =
1957+
let p =
19581958
self.progress.as_mut().expect("no in-progress search to update");
19591959
p.at = at;
19601960
}

regex-automata/src/meta/regex.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2819,6 +2819,12 @@ impl Config {
28192819
///
28202820
/// By default, `\n` is the line terminator.
28212821
///
2822+
/// **Warning**: This does not change the behavior of `.`. To do that,
2823+
/// you'll need to configure the syntax option
2824+
/// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator)
2825+
/// in addition to this. Otherwise, `.` will continue to match any
2826+
/// character other than `\n`.
2827+
///
28222828
/// # Example
28232829
///
28242830
/// ```

regex-automata/src/util/syntax.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ pub struct Config {
147147
multi_line: bool,
148148
dot_matches_new_line: bool,
149149
crlf: bool,
150+
line_terminator: u8,
150151
swap_greed: bool,
151152
ignore_whitespace: bool,
152153
unicode: bool,
@@ -164,6 +165,7 @@ impl Config {
164165
multi_line: false,
165166
dot_matches_new_line: false,
166167
crlf: false,
168+
line_terminator: b'\n',
167169
swap_greed: false,
168170
ignore_whitespace: false,
169171
unicode: true,
@@ -239,6 +241,31 @@ impl Config {
239241
self
240242
}
241243

244+
/// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
245+
///
246+
/// Namely, instead of `.` (by default) matching everything except for `\n`,
247+
/// this will cause `.` to match everything except for the byte given.
248+
///
249+
/// If `.` is used in a context where Unicode mode is enabled and this byte
250+
/// isn't ASCII, then an error will be returned. When Unicode mode is
251+
/// disabled, then any byte is permitted, but will return an error if UTF-8
252+
/// mode is enabled and it is a non-ASCII byte.
253+
///
254+
/// In short, any ASCII value for a line terminator is always okay. But a
255+
/// non-ASCII byte might result in an error depending on whether Unicode
256+
/// mode or UTF-8 mode are enabled.
257+
///
258+
/// Note that if `R` mode is enabled then it always takes precedence and
259+
/// the line terminator will be treated as `\r` and `\n` simultaneously.
260+
///
261+
/// Note also that this *doesn't* impact the look-around assertions
262+
/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
263+
/// configuration in the regex engine itself.
264+
pub fn line_terminator(mut self, byte: u8) -> Config {
265+
self.line_terminator = byte;
266+
self
267+
}
268+
242269
/// Enable or disable the "swap greed" flag by default.
243270
///
244271
/// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
@@ -377,6 +404,11 @@ impl Config {
377404
self.crlf
378405
}
379406

407+
/// Returns the line terminator in this syntax configuration.
408+
pub fn get_line_terminator(&self) -> u8 {
409+
self.line_terminator
410+
}
411+
380412
/// Returns whether "swap greed" mode is enabled.
381413
pub fn get_swap_greed(&self) -> bool {
382414
self.swap_greed
@@ -410,6 +442,7 @@ impl Config {
410442
.multi_line(self.multi_line)
411443
.dot_matches_new_line(self.dot_matches_new_line)
412444
.crlf(self.crlf)
445+
.line_terminator(self.line_terminator)
413446
.swap_greed(self.swap_greed)
414447
.ignore_whitespace(self.ignore_whitespace)
415448
.utf8(self.utf8)
@@ -436,6 +469,7 @@ impl Config {
436469
.multi_line(self.multi_line)
437470
.crlf(self.crlf)
438471
.dot_matches_new_line(self.dot_matches_new_line)
472+
.line_terminator(self.line_terminator)
439473
.swap_greed(self.swap_greed)
440474
.utf8(self.utf8);
441475
}

regex-automata/tests/dfa/onepass/suite.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,4 +193,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
193193
.case_insensitive(test.case_insensitive())
194194
.unicode(test.unicode())
195195
.utf8(test.utf8())
196+
.line_terminator(test.line_terminator())
196197
}

regex-automata/tests/dfa/suite.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
391391
.case_insensitive(test.case_insensitive())
392392
.unicode(test.unicode())
393393
.utf8(test.utf8())
394+
.line_terminator(test.line_terminator())
394395
}
395396

396397
/// Execute an overlapping search, and for each match found, also find its

regex-automata/tests/hybrid/suite.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
281281
.case_insensitive(test.case_insensitive())
282282
.unicode(test.unicode())
283283
.utf8(test.utf8())
284+
.line_terminator(test.line_terminator())
284285
}
285286

286287
/// Execute an overlapping search, and for each match found, also find its

regex-automata/tests/meta/suite.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,4 +196,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
196196
.case_insensitive(test.case_insensitive())
197197
.unicode(test.unicode())
198198
.utf8(test.utf8())
199+
.line_terminator(test.line_terminator())
199200
}

regex-automata/tests/nfa/thompson/backtrack/suite.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,4 +209,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
209209
.case_insensitive(test.case_insensitive())
210210
.unicode(test.unicode())
211211
.utf8(test.utf8())
212+
.line_terminator(test.line_terminator())
212213
}

regex-automata/tests/nfa/thompson/pikevm/suite.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,4 +158,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
158158
.case_insensitive(test.case_insensitive())
159159
.unicode(test.unicode())
160160
.utf8(test.utf8())
161+
.line_terminator(test.line_terminator())
161162
}

regex-lite/tests/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ mod fuzz;
22
mod string;
33

44
const BLACKLIST: &[&str] = &[
5-
// CRLF-aware line anchors aren't supported in regex API yet.
6-
"crlf",
75
// Custom line terminators aren't supported in regex-lite. We could add it,
86
// but it didn't seem worth it.
97
"line-terminator",

regex-syntax/src/hir/mod.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ pub enum ErrorKind {
8888
/// This error occurs when translating a pattern that could match a byte
8989
/// sequence that isn't UTF-8 and `utf8` was enabled.
9090
InvalidUtf8,
91+
/// This error occurs when one uses a non-ASCII byte for a line terminator,
92+
/// but where Unicode mode is enabled and UTF-8 mode is disabled.
93+
InvalidLineTerminator,
9194
/// This occurs when an unrecognized Unicode property name could not
9295
/// be found.
9396
UnicodePropertyNotFound,
@@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind {
120123
let msg = match *self {
121124
UnicodeNotAllowed => "Unicode not allowed here",
122125
InvalidUtf8 => "pattern can match invalid UTF-8",
126+
InvalidLineTerminator => "invalid line terminator, must be ASCII",
123127
UnicodePropertyNotFound => "Unicode property not found",
124128
UnicodePropertyValueNotFound => "Unicode property value not found",
125129
UnicodePerlClassNotFound => {
@@ -648,6 +652,12 @@ impl Hir {
648652
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
649653
Hir::class(Class::Bytes(cls))
650654
}
655+
Dot::AnyCharExcept(ch) => {
656+
let mut cls =
657+
ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
658+
cls.negate();
659+
Hir::class(Class::Unicode(cls))
660+
}
651661
Dot::AnyCharExceptLF => {
652662
let mut cls = ClassUnicode::empty();
653663
cls.push(ClassUnicodeRange::new('\0', '\x09'));
@@ -661,6 +671,12 @@ impl Hir {
661671
cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
662672
Hir::class(Class::Unicode(cls))
663673
}
674+
Dot::AnyByteExcept(byte) => {
675+
let mut cls =
676+
ClassBytes::new([ClassBytesRange::new(byte, byte)]);
677+
cls.negate();
678+
Hir::class(Class::Bytes(cls))
679+
}
664680
Dot::AnyByteExceptLF => {
665681
let mut cls = ClassBytes::empty();
666682
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
@@ -1772,6 +1788,18 @@ pub enum Dot {
17721788
///
17731789
/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
17741790
AnyByte,
1791+
/// Matches the UTF-8 encoding of any Unicode scalar value except for the
1792+
/// `char` given.
1793+
///
1794+
/// This is equivalent to using `(?u-s:.)` with the line terminator set
1795+
/// to a particular ASCII byte. (Because of peculiarities in the regex
1796+
/// engines, a line terminator must be a single byte. It follows that when
1797+
/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1798+
/// value. That is, it must be ASCII.)
1799+
///
1800+
/// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1801+
/// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1802+
AnyCharExcept(char),
17751803
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
17761804
///
17771805
/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
@@ -1781,6 +1809,17 @@ pub enum Dot {
17811809
///
17821810
/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
17831811
AnyCharExceptCRLF,
1812+
/// Matches any byte value except for the `u8` given.
1813+
///
1814+
/// This is equivalent to using `(?-us:.)` with the line terminator set
1815+
/// to a particular ASCII byte. (Because of peculiarities in the regex
1816+
/// engines, a line terminator must be a single byte. It follows that when
1817+
/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1818+
/// value. That is, it must be ASCII.)
1819+
///
1820+
/// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1821+
/// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1822+
AnyByteExcept(u8),
17841823
/// Matches any byte value except for `\n`.
17851824
///
17861825
/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.

0 commit comments

Comments
 (0)