Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 185 additions & 9 deletions regex-automata/src/nfa/thompson/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@ std::thread_local! {
pub struct Config {
/// The match semantics. When unset, `get_match_kind` falls back to
/// `MatchKind::LeftmostFirst`.
match_kind: Option<MatchKind>,
/// The prefilter choice. The outer `Option` records whether a choice was
/// made at all (when two configurations are combined, an unset value
/// defers to the other configuration); the inner `Option` is the choice
/// itself, where `None` means that no prefilter is used.
pre: Option<Option<Prefilter>>,
/// How the prefilter is applied during a search. When unset, the search
/// uses `PrefilterStrategy::default()`.
pre_strategy: Option<PrefilterStrategy>,
}

/// Selects when a PikeVM search consults its prefilter.
///
/// The default strategy is [`PrefilterStrategy::OnEmptyStates`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum PrefilterStrategy {
    /// Consult the prefilter only once the PikeVM has no states left to
    /// explore.
    #[default]
    OnEmptyStates,
    /// Consult the prefilter ahead of time, so that the search knows in
    /// advance which positions it can skip exploring.
    ///
    /// In total this leads to less work during matching, but the work is
    /// done upfront and goes to waste when a match is found before the
    /// search reaches the position precomputed by the prefilter. Matching
    /// becomes non-streaming, but is faster on average.
    OneAhead,
}

impl Config {
Expand Down Expand Up @@ -161,6 +176,15 @@ impl Config {
self
}

/// Set the strategy for using a prefilter during PikeVM execution.
pub fn prefilter_strategy(
mut self,
strategy: PrefilterStrategy,
) -> Config {
self.pre_strategy = Some(strategy);
self
}

/// Returns the match semantics set in this configuration.
pub fn get_match_kind(&self) -> MatchKind {
self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
Expand All @@ -171,6 +195,11 @@ impl Config {
self.pre.as_ref().unwrap_or(&None).as_ref()
}

/// Returns the prefilter strategy set in this configuration, or `None` if
/// no strategy was explicitly set.
///
/// A search treats `None` as the default strategy,
/// [`PrefilterStrategy::OnEmptyStates`].
pub fn get_prefilter_strategy(&self) -> Option<PrefilterStrategy> {
self.pre_strategy
}

/// Overwrite the default configuration such that the options in `o` are
/// always used. If an option in `o` is not set, then the corresponding
/// option in `self` is used. If it's not set in `self` either, then it
Expand All @@ -179,6 +208,7 @@ impl Config {
Config {
match_kind: o.match_kind.or(self.match_kind),
pre: o.pre.or_else(|| self.pre.clone()),
pre_strategy: o.pre_strategy.or(self.pre_strategy),
}
}
}
Expand Down Expand Up @@ -1227,7 +1257,26 @@ impl PikeVM {
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<HalfMatch> {
cache.setup_search(slots.len());
match self.config.get_prefilter_strategy().unwrap_or_default() {
PrefilterStrategy::OnEmptyStates => {
self.search_imp_strategy::<false>(cache, input, slots)
}
PrefilterStrategy::OneAhead => {
self.search_imp_strategy::<true>(cache, input, slots)
}
}
}

/// The implementation of `search_imp`, specialized on the
/// [`PrefilterStrategy`] through the `ONE_AHEAD` const parameter. Making the
/// strategy a compile-time constant lets the compiler generate a separate
/// copy of this function for each strategy, which avoids the overhead of
/// checking the strategy on every iteration of the search loop.
fn search_imp_strategy<const ONE_AHEAD: bool>(
&self,
cache: &mut Cache,
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<HalfMatch> {
cache.setup_search(slots.len(), input);
if input.is_done() {
return None;
}
Expand All @@ -1254,7 +1303,12 @@ impl PikeVM {

let pre =
if anchored { None } else { self.get_config().get_prefilter() };
let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
let Cache {
ref mut stack,
ref mut curr,
ref mut next,
ref mut next_matching_pre,
} = cache;
let mut hm = None;
// Yes, our search doesn't end at input.end(), but includes it. This
// is necessary because matches are delayed by one byte, just like
Expand All @@ -1269,6 +1323,42 @@ impl PikeVM {
// match state.)
let mut at = input.start();
while at <= input.end() {
if ONE_AHEAD {
if let Some(pre) = pre {
// If the position which we have computed is in the past,
// we recompute a new value. Otherwise, we leave it untouched.
// When we are out of states to explore, in the `OneAhead`
// strategy we will use this position to accelerate to.
match *next_matching_pre {
Some(NextMatchingPre {
pos: NextMatchingPrePos::Nowhere,
..
}) => {}
Some(NextMatchingPre {
pos: NextMatchingPrePos::At(pos),
..
}) if pos >= at => {}
Some(NextMatchingPre {
pos: NextMatchingPrePos::At(_),
..
})
| None => {
let span = Span::from(at..input.end());
*next_matching_pre = Some(NextMatchingPre {
found_at: at,
found_for: input.haystack().as_ptr() as _,
pos: match pre.find(input.haystack(), span) {
None => NextMatchingPrePos::Nowhere,
Some(ref span) => {
NextMatchingPrePos::At(span.start)
}
},
});
}
}
}
}

// If we have no states left to visit, then there are some cases
// where we know we can quit early or even skip ahead.
if curr.set.is_empty() {
Expand All @@ -1291,13 +1381,50 @@ impl PikeVM {
// ahead until we find something that we know might advance us
// forward.
if let Some(pre) = pre {
let span = Span::from(at..input.end());
match pre.find(input.haystack(), span) {
None => break,
Some(ref span) => at = span.start,
if ONE_AHEAD {
let next_pos = next_matching_pre.expect("in OneAhead strategy the next matching should be Some");

match next_pos {
NextMatchingPre {
pos: NextMatchingPrePos::Nowhere,
..
} => break,
NextMatchingPre {
pos: NextMatchingPrePos::At(pos),
..
} => at = pos,
}
} else {
let span = Span::from(at..input.end());
match pre.find(input.haystack(), span) {
None => break,
Some(ref span) => {
at = span.start;
}
}
}
}
}

// If we precomputed the next position returned by the prefilter, we
// know whether a match can potentially start here. If not, we skip the
// epsilon closure computation which follows. This can potentially save
// us from exploring this position completely.
let match_can_start_here = if ONE_AHEAD {
if let Some(next_matching_pre) = *next_matching_pre {
matches!(
next_matching_pre,
NextMatchingPre {
pos: NextMatchingPrePos::At(pos),
..
} if pos == at,
)
} else {
true
}
} else {
true
};
// Instead of using the NFA's unanchored start state, we actually
// always use its anchored starting state. As a result, when doing
// an unanchored search, we need to simulate our own '(?s-u:.)*?'
Expand Down Expand Up @@ -1346,6 +1473,7 @@ impl PikeVM {
// an anchored search.
if (hm.is_none() || allmatches)
&& (!anchored || at == input.start())
&& match_can_start_here
{
// Since we are adding to the 'curr' active states and since
// this is for the start ID, we use a slots slice that is
Expand Down Expand Up @@ -1408,7 +1536,7 @@ impl PikeVM {
// and composition, so it seems like good sense to have the PikeVM
// match that behavior.

cache.setup_search(0);
cache.setup_search(0, input);
if input.is_done() {
return;
}
Expand All @@ -1425,7 +1553,12 @@ impl PikeVM {
Some(config) => config,
};

let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
let Cache {
ref mut stack,
ref mut curr,
ref mut next,
next_matching_pre: _,
} = cache;
for at in input.start()..=input.end() {
let any_matches = !patset.is_empty();
if curr.set.is_empty() {
Expand Down Expand Up @@ -1863,6 +1996,39 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
}
}

/// The outcome of running the prefilter ahead of the search position.
///
/// Used for the [`PrefilterStrategy::OneAhead`] strategy.
#[derive(Copy, Clone, Debug)]
enum NextMatchingPrePos {
/// The prefilter matched; the value is the start offset of the span it
/// reported.
At(usize),
/// The prefilter did not match anywhere between the position it was run
/// from and the end of the search.
Nowhere,
}

/// A precomputed prefilter result together with the metadata needed to
/// decide whether it can be reused.
///
/// The metadata is used to determine the validity of the computed position
/// for a given search.
#[derive(Copy, Clone, Debug)]
struct NextMatchingPre {
/// The haystack offset from which the prefilter search was started.
found_at: usize,
/// The address of the start of the haystack (`&[u8]`) on which the
/// prefilter search was performed, stored as an integer. It is only ever
/// compared against other haystack addresses.
found_for: usize,
/// The next position where the prefilter matched, if any.
pos: NextMatchingPrePos,
}

impl NextMatchingPre {
    /// Reports whether this precomputed prefilter result may be reused for a
    /// search over `input`.
    ///
    /// Reuse is allowed when `input`'s haystack begins at the same address
    /// as the haystack the prefilter was run on, and the search starts at or
    /// after the offset the prefilter was run from.
    ///
    /// NOTE(review): only the haystack address and the start offset are
    /// compared. Neither the haystack's length or contents nor the end of
    /// the searched span is recorded, so a buffer that is refilled in place,
    /// or a later search with a different `input.end()`, would presumably
    /// still be accepted here. Confirm that this cannot yield a stale
    /// position.
    fn is_valid_for(&self, input: &Input<'_>) -> bool {
        let haystack_addr = input.haystack().as_ptr() as usize;
        haystack_addr == self.found_for && input.start() >= self.found_at
    }
}

/// A cache represents mutable state that a [`PikeVM`] requires during a
/// search.
///
Expand All @@ -1886,6 +2052,9 @@ pub struct Cache {
/// The next set of states we're building that will be explored for the
/// next byte in the haystack.
next: ActiveStates,
/// The next input position where the prefilter matched.
/// Used for the `[PrefilterStrategy::OneAhead]` strategy.
next_matching_pre: Option<NextMatchingPre>,
}

impl Cache {
Expand All @@ -1902,6 +2071,7 @@ impl Cache {
stack: vec![],
curr: ActiveStates::new(re),
next: ActiveStates::new(re),
next_matching_pre: None,
}
}

Expand Down Expand Up @@ -1945,6 +2115,7 @@ impl Cache {
pub fn reset(&mut self, re: &PikeVM) {
    // Any prefilter position remembered from an earlier search is
    // discarded along with the state sets.
    self.next_matching_pre = None;
    for states in [&mut self.curr, &mut self.next] {
        states.reset(re);
    }
}

/// Returns the heap memory usage, in bytes, of this cache.
Expand All @@ -1968,10 +2139,15 @@ impl Cache {
/// of possible slots, e.g., when one only wants to track overall match
/// offsets. This in turn permits less copying of capturing group spans
/// in the PikeVM.
fn setup_search(&mut self, captures_slot_len: usize) {
fn setup_search(&mut self, captures_slot_len: usize, input: &Input<'_>) {
    self.stack.clear();
    self.curr.setup_search(captures_slot_len);
    self.next.setup_search(captures_slot_len);
    // Carry a precomputed prefilter position over from an earlier search
    // only if it still applies to `input`; otherwise drop it.
    self.next_matching_pre = self
        .next_matching_pre
        .filter(|cached| cached.is_valid_for(input));
}
}

Expand Down
42 changes: 26 additions & 16 deletions regex-automata/tests/nfa/thompson/pikevm/suite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use {
regex_automata::{
nfa::thompson::{
self,
pikevm::{self, PikeVM},
pikevm::{self, PikeVM, PrefilterStrategy},
},
util::{prefilter::Prefilter, syntax},
PatternSet,
Expand All @@ -29,24 +29,34 @@ fn default() -> Result<()> {
/// Tests the PikeVM with prefilters enabled.
#[test]
fn prefilter() -> Result<()> {
let my_compiler = |test: &RegexTest, regexes: &[String]| {
// Parse regexes as HIRs so we can get literals to build a prefilter.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
let kind = match untestify_kind(test.match_kind()) {
None => return Ok(CompiledRegex::skip()),
Some(kind) => kind,
let my_compiler =
|test: &RegexTest, regexes: &[String], strategy: PrefilterStrategy| {
// Parse regexes as HIRs so we can get literals to build a prefilter.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
let kind = match untestify_kind(test.match_kind()) {
None => return Ok(CompiledRegex::skip()),
Some(kind) => kind,
};
let pre = Prefilter::from_hirs_prefix(kind, &hirs);
let mut builder = PikeVM::builder();
builder.configure(
PikeVM::config().prefilter(pre).prefilter_strategy(strategy),
);
compiler(builder)(test, regexes)
};
let pre = Prefilter::from_hirs_prefix(kind, &hirs);
let mut builder = PikeVM::builder();
builder.configure(PikeVM::config().prefilter(pre));
compiler(builder)(test, regexes)
};
let mut runner = TestRunner::new()?;
runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
runner.test_iter(suite()?.iter(), my_compiler).assert();
runner
.test_iter(suite()?.iter(), |t, r| {
my_compiler(t, r, PrefilterStrategy::OnEmptyStates)
})
.test_iter(suite()?.iter(), |t, r| {
my_compiler(t, r, PrefilterStrategy::OneAhead)
})
.assert();
Ok(())
}

Expand Down