From 5d562458378568abf234978dc7ddf3c167cad4ad Mon Sep 17 00:00:00 2001 From: shilangyu Date: Mon, 3 Nov 2025 20:25:52 +0100 Subject: [PATCH 1/5] PrefixAcc: add pikevm option for prefix acc kind --- regex-automata/src/nfa/thompson/pikevm.rs | 30 +++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index a5cd7086f5..548fe6f330 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -66,6 +66,21 @@ std::thread_local! { pub struct Config { match_kind: Option<MatchKind>, pre: Option<Option<Prefilter>>, + pre_strategy: Option<PrefilterStrategy>, +} + +/// The strategy for using a prefilter during PikeVM execution. +#[derive(Clone, Copy, Debug, Default)] +pub enum PrefilterStrategy { + /// Use the prefilter whenever the PikeVM runs out of states to explore. + #[default] + OnEmptyStates, + /// Use the prefilter in advance to know which positions to skip exploring. + /// This leads to matching doing less work in total, but it requires doing + /// some work upfront which might go to waste if we find a match before + /// reaching the precomputed position from the prefilter. This makes + /// matching non-streaming, but on average faster. + OneAhead, } impl Config { @@ -161,6 +176,15 @@ impl Config { self } + /// Set the strategy for using a prefilter during PikeVM execution. + pub fn prefilter_strategy( + mut self, + strategy: PrefilterStrategy, + ) -> Config { + self.pre_strategy = Some(strategy); + self + } + /// Returns the match semantics set in this configuration. pub fn get_match_kind(&self) -> MatchKind { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) @@ -171,6 +195,11 @@ impl Config { self.pre.as_ref().unwrap_or(&None).as_ref() } + /// Returns the prefilter strategy set in this configuration, if one at all. 
+ pub fn get_prefilter_strategy(&self) -> Option<PrefilterStrategy> { + self.pre_strategy + } + /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it @@ -179,6 +208,7 @@ impl Config { Config { match_kind: o.match_kind.or(self.match_kind), pre: o.pre.or_else(|| self.pre.clone()), + pre_strategy: o.pre_strategy.or(self.pre_strategy), } } } From de0c7897f5bbd4b37a7bcc4d1cf955b552da23d4 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 11 Mar 2026 00:03:57 +0100 Subject: [PATCH 2/5] PrefixAcc: implement OneAhead prefilter strategy --- regex-automata/src/nfa/thompson/pikevm.rs | 70 +++++++++++++++++++++-- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 548fe6f330..7e9793fd19 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -70,7 +70,7 @@ pub struct Config { } /// The strategy for using a prefilter during PikeVM execution. -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub enum PrefilterStrategy { /// Use the prefilter whenever the PikeVM runs out of states to explore. #[default] @@ -1282,6 +1282,16 @@ impl PikeVM { Some(config) => config, }; + #[derive(Copy, Clone)] + enum NextMatchingPre { + At(usize), + Nowhere, + } + + let pre_strategy = + self.config.get_prefilter_strategy().unwrap_or_default(); + let mut next_matching_prefix = None; + let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; @@ -1299,6 +1309,29 @@ impl PikeVM { // match state.) 
let mut at = input.start(); while at <= input.end() { + if pre_strategy == PrefilterStrategy::OneAhead { + if let Some(pre) = pre { + // If the position which we have computed is in the past, + // we recompute a new value. Otherwise, we leave it untouched. + // When we are out of states to explore, in the `OneAhead` + // strategy we will use this position to accelerate to. + match next_matching_prefix { + Some(NextMatchingPre::Nowhere) => {} + Some(NextMatchingPre::At(pos)) if pos >= at => {} + Some(NextMatchingPre::At(_)) | None => { + let span = Span::from(at..input.end()); + next_matching_prefix = + Some(match pre.find(input.haystack(), span) { + None => NextMatchingPre::Nowhere, + Some(ref span) => { + NextMatchingPre::At(span.start) + } + }); + } + } + } + } + // If we have no states left to visit, then there are some cases // where we know we can quit early or even skip ahead. if curr.set.is_empty() { @@ -1321,13 +1354,39 @@ impl PikeVM { // ahead until we find something that we know might advance us // forward. if let Some(pre) = pre { - let span = Span::from(at..input.end()); - match pre.find(input.haystack(), span) { - None => break, - Some(ref span) => at = span.start, + match pre_strategy { + PrefilterStrategy::OnEmptyStates => { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => break, + Some(ref span) => { + at = span.start; + } + } + } + PrefilterStrategy::OneAhead => { + let next_pos = next_matching_prefix.expect("in OneAhead strategy the next matching should be Some"); + + match next_pos { + NextMatchingPre::Nowhere => break, + NextMatchingPre::At(pos) => at = pos, + } + } } } } + + // If we precomputed the next position returned by the prefilter, + // we know if a match can potentially start here. If not, we skip the + // epsilon closure computation which follows. This can potentially save + // save us from exploring this position completely. 
+ let match_can_start_here = if let Some(next_matching_pre) = + next_matching_prefix + { + matches!(next_matching_pre, NextMatchingPre::At(pos) if pos == at) + } else { + true + }; // Instead of using the NFA's unanchored start state, we actually // always use its anchored starting state. As a result, when doing // an unanchored search, we need to simulate our own '(?s-u:.)*?' @@ -1376,6 +1435,7 @@ impl PikeVM { // an anchored search. if (hm.is_none() || allmatches) && (!anchored || at == input.start()) + && match_can_start_here { // Since we are adding to the 'curr' active states and since // this is for the start ID, we use a slots slice that is From f9ff8a08d1b79bc02b0ca608cde28c3749e828d4 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 11 Mar 2026 00:23:26 +0100 Subject: [PATCH 3/5] PrefixAcc: test both PikeVM's prefiltering strategies --- .../tests/nfa/thompson/pikevm/suite.rs | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/regex-automata/tests/nfa/thompson/pikevm/suite.rs b/regex-automata/tests/nfa/thompson/pikevm/suite.rs index 1fb3fec9f2..0e5351cac6 100644 --- a/regex-automata/tests/nfa/thompson/pikevm/suite.rs +++ b/regex-automata/tests/nfa/thompson/pikevm/suite.rs @@ -3,7 +3,7 @@ use { regex_automata::{ nfa::thompson::{ self, - pikevm::{self, PikeVM}, + pikevm::{self, PikeVM, PrefilterStrategy}, }, util::{prefilter::Prefilter, syntax}, PatternSet, @@ -29,24 +29,34 @@ fn default() -> Result<()> { /// Tests the PikeVM with prefilters enabled. #[test] fn prefilter() -> Result<()> { - let my_compiler = |test: &RegexTest, regexes: &[String]| { - // Parse regexes as HIRs so we can get literals to build a prefilter. 
- let mut hirs = vec![]; - for pattern in regexes.iter() { - hirs.push(syntax::parse_with(pattern, &config_syntax(test))?); - } - let kind = match untestify_kind(test.match_kind()) { - None => return Ok(CompiledRegex::skip()), - Some(kind) => kind, + let my_compiler = + |test: &RegexTest, regexes: &[String], strategy: PrefilterStrategy| { + // Parse regexes as HIRs so we can get literals to build a prefilter. + let mut hirs = vec![]; + for pattern in regexes.iter() { + hirs.push(syntax::parse_with(pattern, &config_syntax(test))?); + } + let kind = match untestify_kind(test.match_kind()) { + None => return Ok(CompiledRegex::skip()), + Some(kind) => kind, + }; + let pre = Prefilter::from_hirs_prefix(kind, &hirs); + let mut builder = PikeVM::builder(); + builder.configure( + PikeVM::config().prefilter(pre).prefilter_strategy(strategy), + ); + compiler(builder)(test, regexes) }; - let pre = Prefilter::from_hirs_prefix(kind, &hirs); - let mut builder = PikeVM::builder(); - builder.configure(PikeVM::config().prefilter(pre)); - compiler(builder)(test, regexes) - }; let mut runner = TestRunner::new()?; runner.expand(&["is_match", "find", "captures"], |test| test.compiles()); - runner.test_iter(suite()?.iter(), my_compiler).assert(); + runner + .test_iter(suite()?.iter(), |t, r| { + my_compiler(t, r, PrefilterStrategy::OnEmptyStates) + }) + .test_iter(suite()?.iter(), |t, r| { + my_compiler(t, r, PrefilterStrategy::OneAhead) + }) + .assert(); Ok(()) } From e6f222e2aef8e76a1cb66113e7caa75001b7099e Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 11 Mar 2026 11:56:51 +0100 Subject: [PATCH 4/5] PrefixAcc: cache the NextMatchingPre between searches --- regex-automata/src/nfa/thompson/pikevm.rs | 136 +++++++++++++++++----- 1 file changed, 105 insertions(+), 31 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 7e9793fd19..8cdabd36f9 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ 
b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1257,7 +1257,7 @@ impl PikeVM { input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch> { - cache.setup_search(slots.len()); + cache.setup_search(slots.len(), input); if input.is_done() { return None; } @@ -1282,19 +1282,17 @@ impl PikeVM { Some(config) => config, }; - #[derive(Copy, Clone)] - enum NextMatchingPre { - At(usize), - Nowhere, - } - let pre_strategy = self.config.get_prefilter_strategy().unwrap_or_default(); - let mut next_matching_prefix = None; let pre = if anchored { None } else { self.get_config().get_prefilter() }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + ref mut next_matching_pre, + } = cache; let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like @@ -1315,18 +1313,31 @@ impl PikeVM { // we recompute a new value. Otherwise, we leave it untouched. // When we are out of states to explore, in the `OneAhead` // strategy we will use this position to accelerate to. - match next_matching_prefix { - Some(NextMatchingPre::Nowhere) => {} - Some(NextMatchingPre::At(pos)) if pos >= at => {} - Some(NextMatchingPre::At(_)) | None => { + match *next_matching_pre { + Some(NextMatchingPre { + pos: NextMatchingPrePos::Nowhere, + .. + }) => {} + Some(NextMatchingPre { + pos: NextMatchingPrePos::At(pos), + .. + }) if pos >= at => {} + Some(NextMatchingPre { + pos: NextMatchingPrePos::At(_), + .. 
+ }) + | None => { let span = Span::from(at..input.end()); - next_matching_prefix = - Some(match pre.find(input.haystack(), span) { - None => NextMatchingPre::Nowhere, + *next_matching_pre = Some(NextMatchingPre { + found_at: at, + found_for: input.haystack().as_ptr() as _, + pos: match pre.find(input.haystack(), span) { + None => NextMatchingPrePos::Nowhere, Some(ref span) => { - NextMatchingPre::At(span.start) + NextMatchingPrePos::At(span.start) } - }); + }, + }); } } } @@ -1365,11 +1376,17 @@ impl PikeVM { } } PrefilterStrategy::OneAhead => { - let next_pos = next_matching_prefix.expect("in OneAhead strategy the next matching should be Some"); + let next_pos = next_matching_pre.expect("in OneAhead strategy the next matching should be Some"); match next_pos { - NextMatchingPre::Nowhere => break, - NextMatchingPre::At(pos) => at = pos, + NextMatchingPre { + pos: NextMatchingPrePos::Nowhere, + .. + } => break, + NextMatchingPre { + pos: NextMatchingPrePos::At(pos), + .. + } => at = pos, } } } @@ -1380,13 +1397,22 @@ impl PikeVM { // we know if a match can potentially start here. If not, we skip the // epsilon closure computation which follows. This can potentially save // save us from exploring this position completely. - let match_can_start_here = if let Some(next_matching_pre) = - next_matching_prefix - { - matches!(next_matching_pre, NextMatchingPre::At(pos) if pos == at) - } else { - true - }; + let match_can_start_here = + if pre_strategy == PrefilterStrategy::OneAhead { + if let Some(next_matching_pre) = *next_matching_pre { + matches!( + next_matching_pre, + NextMatchingPre { + pos: NextMatchingPrePos::At(pos), + .. + } if pos == at, + ) + } else { + true + } + } else { + true + }; // Instead of using the NFA's unanchored start state, we actually // always use its anchored starting state. As a result, when doing // an unanchored search, we need to simulate our own '(?s-u:.)*?' 
@@ -1498,7 +1524,7 @@ impl PikeVM { // and composition, so it seems like good sense to have the PikeVM // match that behavior. - cache.setup_search(0); + cache.setup_search(0, input); if input.is_done() { return; } @@ -1515,7 +1541,12 @@ impl PikeVM { Some(config) => config, }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + next_matching_pre: _, + } = cache; for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1953,6 +1984,39 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { } } +/// The next input position where the prefilter matched. +/// Used for the `[PrefilterStrategy::OneAhead]` strategy. +#[derive(Copy, Clone, Debug)] +enum NextMatchingPrePos { + /// The prefilter matched at this position. + At(usize), + /// The prefilter did not match anywhere in the haystack after the current + /// position. + Nowhere, +} + +/// Extra metadata about the next position where the prefilter matched. +/// The extra metadata is used to determine the validity of the computed +/// position for a given search. +#[derive(Copy, Clone, Debug)] +struct NextMatchingPre { + /// At what position in the haystack the prefilter search was performed. + found_at: usize, + /// The pointer value of the haystack `&[u8]` for which the prefilter search was performed. + found_for: usize, + /// The next position where the prefilter matched. + pos: NextMatchingPrePos, +} + +impl NextMatchingPre { + /// Whether the computed position is valid for a search with the given input. + /// It is valid if the haystack pointers are the same and the position is in the past. + fn is_valid_for(&self, input: &Input<'_>) -> bool { + self.found_for == input.haystack().as_ptr() as _ + && self.found_at <= input.start() + } +} + /// A cache represents mutable state that a [`PikeVM`] requires during a /// search. 
/// @@ -1976,6 +2040,9 @@ pub struct Cache { /// The next set of states we're building that will be explored for the /// next byte in the haystack. next: ActiveStates, + /// The next input position where the prefilter matched. + /// Used for the `[PrefilterStrategy::OneAhead]` strategy. + next_matching_pre: Option<NextMatchingPre>, } impl Cache { @@ -1992,6 +2059,7 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), + next_matching_pre: None, } } @@ -2035,6 +2103,7 @@ impl Cache { pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); + self.next_matching_pre = None; } /// Returns the heap memory usage, in bytes, of this cache. @@ -2058,10 +2127,15 @@ impl Cache { /// of possible slots, e.g., when one only wants to track overall match /// offsets. This in turn permits less copying of capturing group spans /// in the PikeVM. - fn setup_search(&mut self, captures_slot_len: usize) { + fn setup_search(&mut self, captures_slot_len: usize, input: &Input<'_>) { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); + if let Some(next_matching_pre) = self.next_matching_pre { + if !next_matching_pre.is_valid_for(input) { + self.next_matching_pre = None; + } + } } } From 2657cdbc65dcb1500f3591c746fffdf907606125 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 20 Mar 2026 09:33:52 +0100 Subject: [PATCH 5/5] PrefixAcc: specialize prefilter strategy --- regex-automata/src/nfa/thompson/pikevm.rs | 90 +++++++++++++---------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 8cdabd36f9..b7ee4db3e0 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1256,6 +1256,25 @@ impl PikeVM { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + match 
self.config.get_prefilter_strategy().unwrap_or_default() { + 
PrefilterStrategy::OnEmptyStates => { + self.search_imp_strategy::<false>(cache, input, slots) + } + PrefilterStrategy::OneAhead => { + self.search_imp_strategy::<true>(cache, input, slots) + } + } + } + + /// [`search_imp`] with the specialization to the [`PrefilterStrategy`]. We do + /// so to allow the compiler to specialize the function for a particular strategy + /// reducing any overhead of checking the strategy on every iteration of the search loop. + fn search_imp_strategy<const ONE_AHEAD: bool>( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch> { cache.setup_search(slots.len(), input); if input.is_done() { @@ -1282,9 +1301,6 @@ impl PikeVM { Some(config) => config, }; - let pre_strategy = - self.config.get_prefilter_strategy().unwrap_or_default(); - let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { @@ -1307,7 +1323,7 @@ impl PikeVM { // match state.) let mut at = input.start(); while at <= input.end() { - if pre_strategy == PrefilterStrategy::OneAhead { + if ONE_AHEAD { if let Some(pre) = pre { // If the position which we have computed is in the past, // we recompute a new value. Otherwise, we leave it untouched. @@ -1365,28 +1381,25 @@ impl PikeVM { // ahead until we find something that we know might advance us // forward. if let Some(pre) = pre { - match pre_strategy { - PrefilterStrategy::OnEmptyStates => { - let span = Span::from(at..input.end()); - match pre.find(input.haystack(), span) { - None => break, - Some(ref span) => { - at = span.start; - } - } + if ONE_AHEAD { + let next_pos = next_matching_pre.expect("in OneAhead strategy the next matching should be Some"); + + match next_pos { + NextMatchingPre { + pos: NextMatchingPrePos::Nowhere, + .. + } => break, + NextMatchingPre { + pos: NextMatchingPrePos::At(pos), + .. 
+ } => at = pos, } - PrefilterStrategy::OneAhead => { - let next_pos = next_matching_pre.expect("in OneAhead strategy the next matching should be Some"); - - match next_pos { - NextMatchingPre { - pos: NextMatchingPrePos::Nowhere, - .. - } => break, - NextMatchingPre { - pos: NextMatchingPrePos::At(pos), - .. - } => at = pos, + } else { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => break, + Some(ref span) => { + at = span.start; } } } @@ -1397,22 +1410,21 @@ impl PikeVM { // we know if a match can potentially start here. If not, we skip the // epsilon closure computation which follows. This can potentially save // save us from exploring this position completely. - let match_can_start_here = - if pre_strategy == PrefilterStrategy::OneAhead { - if let Some(next_matching_pre) = *next_matching_pre { - matches!( - next_matching_pre, - NextMatchingPre { - pos: NextMatchingPrePos::At(pos), - .. - } if pos == at, - ) - } else { - true - } + let match_can_start_here = if ONE_AHEAD { + if let Some(next_matching_pre) = *next_matching_pre { + matches!( + next_matching_pre, + NextMatchingPre { + pos: NextMatchingPrePos::At(pos), + .. + } if pos == at, + ) } else { true - }; + } + } else { + true + }; // Instead of using the NFA's unanchored start state, we actually // always use its anchored starting state. As a result, when doing // an unanchored search, we need to simulate our own '(?s-u:.)*?'