Skip to content

Commit bbf274e

Browse files
committed
allow starting DFA in noncontiguous bytes
regex-automata already supports traversing the DFA one byte at a time with `next_state`. This is potentially very useful when scanning noncontiguous data such as a network stream or a rope data structure, as commonly used in editors. However, starting the DFA with `start_state_forward`/`start_state_reverse` currently requires an `Input` and looks one byte behind/ahead of the span boundaries. A streaming use case cannot easily provide such a haystack, especially when using prefilters/literal optimizations (it can be worked around with a temporary array and copying one byte over, but that is extremely brittle/hacky). This commit adds the `start_state_forward_with`/`start_state_reverse_with` functions, which allow passing the information extracted from the `Input` directly.
1 parent 7c3463d commit bbf274e

File tree

5 files changed

+272
-71
lines changed

5 files changed

+272
-71
lines changed

regex-automata/src/dfa/automaton.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::{
88
primitives::{PatternID, StateID},
99
search::{Anchored, HalfMatch, Input, MatchError},
1010
},
11+
Span,
1112
};
1213

1314
/// A trait describing the interface of a deterministic finite automaton (DFA).
@@ -253,6 +254,14 @@ pub unsafe trait Automaton {
253254
input: &Input<'_>,
254255
) -> Result<StateID, MatchError>;
255256

257+
/// TODO
258+
fn start_state_forward_with(
259+
&self,
260+
mode: Anchored,
261+
look_behind: Option<u8>,
262+
span: Span,
263+
) -> Result<StateID, MatchError>;
264+
256265
/// Return the ID of the start state for this lazy DFA when executing a
257266
/// reverse search.
258267
///
@@ -280,6 +289,14 @@ pub unsafe trait Automaton {
280289
input: &Input<'_>,
281290
) -> Result<StateID, MatchError>;
282291

292+
/// TODO
293+
fn start_state_reverse_with(
294+
&self,
295+
mode: Anchored,
296+
look_ahead: Option<u8>,
297+
span: Span,
298+
) -> Result<StateID, MatchError>;
299+
283300
/// If this DFA has a universal starting state for the given anchor mode
284301
/// and the DFA supports universal starting states, then this returns that
285302
/// state's identifier.
@@ -1806,6 +1823,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
18061823
(**self).start_state_forward(input)
18071824
}
18081825

1826+
#[inline]
1827+
fn start_state_forward_with(
1828+
&self,
1829+
mode: Anchored,
1830+
look_behind: Option<u8>,
1831+
span: Span,
1832+
) -> Result<StateID, MatchError> {
1833+
(**self).start_state_forward_with(mode, look_behind, span)
1834+
}
1835+
18091836
#[inline]
18101837
fn start_state_reverse(
18111838
&self,
@@ -1814,6 +1841,16 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
18141841
(**self).start_state_reverse(input)
18151842
}
18161843

1844+
#[inline]
1845+
fn start_state_reverse_with(
1846+
&self,
1847+
mode: Anchored,
1848+
look_behind: Option<u8>,
1849+
span: Span,
1850+
) -> Result<StateID, MatchError> {
1851+
(**self).start_state_reverse_with(mode, look_behind, span)
1852+
}
1853+
18171854
#[inline]
18181855
fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
18191856
(**self).universal_start_state(mode)

regex-automata/src/dfa/dense.rs

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ use crate::{
4444
start::{Start, StartByteMap},
4545
wire::{self, DeserializeError, Endian, SerializeError},
4646
},
47+
Span,
4748
};
4849

4950
/// The label that is pre-pended to a serialized DFA.
@@ -2883,7 +2884,9 @@ impl OwnedDFA {
28832884
let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| {
28842885
// This OK because we only call 'start' under conditions
28852886
// in which we know it will succeed.
2886-
dfa.st.start(inp, start).expect("valid Input configuration")
2887+
dfa.st
2888+
.start(inp.get_anchored(), start)
2889+
.expect("valid Input configuration")
28872890
};
28882891
if self.start_kind().has_unanchored() {
28892892
let inp = Input::new("").anchored(Anchored::No);
@@ -3214,31 +3217,63 @@ unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
32143217
&self,
32153218
input: &Input<'_>,
32163219
) -> Result<StateID, MatchError> {
3217-
if !self.quitset.is_empty() && input.start() > 0 {
3218-
let offset = input.start() - 1;
3219-
let byte = input.haystack()[offset];
3220-
if self.quitset.contains(byte) {
3221-
return Err(MatchError::quit(byte, offset));
3220+
self.start_state_forward_with(
3221+
input.get_anchored(),
3222+
input.start().checked_sub(1).map(|i| input.haystack()[i]),
3223+
input.get_span(),
3224+
)
3225+
}
3226+
#[cfg_attr(feature = "perf-inline", inline(always))]
3227+
fn start_state_forward_with(
3228+
&self,
3229+
mode: Anchored,
3230+
look_behind: Option<u8>,
3231+
span: Span,
3232+
) -> Result<StateID, MatchError> {
3233+
debug_assert_eq!(
3234+
span.start != 0,
3235+
look_behind.is_some(),
3236+
"look_behind should be provided if and only if the DFA starts at an offset"
3237+
);
3238+
if !self.quitset.is_empty() {
3239+
if let Some(byte) = look_behind {
3240+
if self.quitset.contains(byte) {
3241+
return Err(MatchError::quit(byte, span.start - 1));
3242+
}
32223243
}
32233244
}
3224-
let start = self.st.start_map.fwd(&input);
3225-
self.st.start(input, start)
3245+
let start = self.st.start_map.fwd_with(look_behind);
3246+
self.st.start(mode, start)
32263247
}
32273248

32283249
#[cfg_attr(feature = "perf-inline", inline(always))]
32293250
fn start_state_reverse(
32303251
&self,
32313252
input: &Input<'_>,
32323253
) -> Result<StateID, MatchError> {
3233-
if !self.quitset.is_empty() && input.end() < input.haystack().len() {
3234-
let offset = input.end();
3235-
let byte = input.haystack()[offset];
3236-
if self.quitset.contains(byte) {
3237-
return Err(MatchError::quit(byte, offset));
3254+
self.start_state_reverse_with(
3255+
input.get_anchored(),
3256+
input.haystack().get(input.end()).copied(),
3257+
input.get_span(),
3258+
)
3259+
}
3260+
3261+
#[cfg_attr(feature = "perf-inline", inline(always))]
3262+
fn start_state_reverse_with(
3263+
&self,
3264+
mode: Anchored,
3265+
look_ahead: Option<u8>,
3266+
span: Span,
3267+
) -> Result<StateID, MatchError> {
3268+
if !self.quitset.is_empty() {
3269+
if let Some(byte) = look_ahead {
3270+
if self.quitset.contains(byte) {
3271+
return Err(MatchError::quit(byte, span.end));
3272+
}
32383273
}
32393274
}
3240-
let start = self.st.start_map.rev(&input);
3241-
self.st.start(input, start)
3275+
let start = self.st.start_map.rev_with(look_ahead);
3276+
self.st.start(mode, start)
32423277
}
32433278

32443279
#[cfg_attr(feature = "perf-inline", inline(always))]
@@ -4174,11 +4209,10 @@ impl<T: AsRef<[u32]>> StartTable<T> {
41744209
#[cfg_attr(feature = "perf-inline", inline(always))]
41754210
fn start(
41764211
&self,
4177-
input: &Input<'_>,
4212+
mode: Anchored,
41784213
start: Start,
41794214
) -> Result<StateID, MatchError> {
41804215
let start_index = start.as_usize();
4181-
let mode = input.get_anchored();
41824216
let index = match mode {
41834217
Anchored::No => {
41844218
if !self.kind.has_unanchored() {

regex-automata/src/dfa/sparse.rs

Lines changed: 51 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ use crate::{
6767
start::{Start, StartByteMap},
6868
wire::{self, DeserializeError, Endian, SerializeError},
6969
},
70+
Span,
7071
};
7172

7273
const LABEL: &str = "rust-regex-automata-dfa-sparse";
@@ -1206,36 +1207,69 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
12061207
self.flags.is_always_start_anchored
12071208
}
12081209

1209-
#[inline]
1210+
#[cfg_attr(feature = "perf-inline", inline(always))]
12101211
fn start_state_forward(
12111212
&self,
12121213
input: &Input<'_>,
12131214
) -> Result<StateID, MatchError> {
1214-
if !self.quitset.is_empty() && input.start() > 0 {
1215-
let offset = input.start() - 1;
1216-
let byte = input.haystack()[offset];
1217-
if self.quitset.contains(byte) {
1218-
return Err(MatchError::quit(byte, offset));
1215+
self.start_state_forward_with(
1216+
input.get_anchored(),
1217+
input.start().checked_sub(1).map(|i| input.haystack()[i]),
1218+
input.get_span(),
1219+
)
1220+
}
1221+
1222+
#[cfg_attr(feature = "perf-inline", inline(always))]
1223+
fn start_state_forward_with(
1224+
&self,
1225+
mode: Anchored,
1226+
look_behind: Option<u8>,
1227+
span: Span,
1228+
) -> Result<StateID, MatchError> {
1229+
debug_assert_eq!(
1230+
span.start != 0,
1231+
look_behind.is_some(),
1232+
"look_behind should be provided if and only if the DFA starts at an offset"
1233+
);
1234+
if !self.quitset.is_empty() {
1235+
if let Some(byte) = look_behind {
1236+
if self.quitset.contains(byte) {
1237+
return Err(MatchError::quit(byte, span.start - 1));
1238+
}
12191239
}
12201240
}
1221-
let start = self.st.start_map.fwd(&input);
1222-
self.st.start(input, start)
1241+
let start = self.st.start_map.fwd_with(look_behind);
1242+
self.st.start(mode, start)
12231243
}
12241244

1225-
#[inline]
1245+
#[cfg_attr(feature = "perf-inline", inline(always))]
12261246
fn start_state_reverse(
12271247
&self,
12281248
input: &Input<'_>,
12291249
) -> Result<StateID, MatchError> {
1230-
if !self.quitset.is_empty() && input.end() < input.haystack().len() {
1231-
let offset = input.end();
1232-
let byte = input.haystack()[offset];
1233-
if self.quitset.contains(byte) {
1234-
return Err(MatchError::quit(byte, offset));
1250+
self.start_state_reverse_with(
1251+
input.get_anchored(),
1252+
input.haystack().get(input.end()).copied(),
1253+
input.get_span(),
1254+
)
1255+
}
1256+
1257+
#[cfg_attr(feature = "perf-inline", inline(always))]
1258+
fn start_state_reverse_with(
1259+
&self,
1260+
mode: Anchored,
1261+
look_ahead: Option<u8>,
1262+
span: Span,
1263+
) -> Result<StateID, MatchError> {
1264+
if !self.quitset.is_empty() {
1265+
if let Some(byte) = look_ahead {
1266+
if self.quitset.contains(byte) {
1267+
return Err(MatchError::quit(byte, span.end));
1268+
}
12351269
}
12361270
}
1237-
let start = self.st.start_map.rev(&input);
1238-
self.st.start(input, start)
1271+
let start = self.st.start_map.rev_with(look_ahead);
1272+
self.st.start(mode, start)
12391273
}
12401274

12411275
#[inline]
@@ -2145,11 +2179,10 @@ impl<T: AsRef<[u8]>> StartTable<T> {
21452179
/// panics.
21462180
fn start(
21472181
&self,
2148-
input: &Input<'_>,
2182+
mode: Anchored,
21492183
start: Start,
21502184
) -> Result<StateID, MatchError> {
21512185
let start_index = start.as_usize();
2152-
let mode = input.get_anchored();
21532186
let index = match mode {
21542187
Anchored::No => {
21552188
if !self.kind.has_unanchored() {

0 commit comments

Comments
 (0)