diff --git a/CHANGELOG.md b/CHANGELOG.md index 38c7532..91b97c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,36 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.18] - 2026-03-24 + +### Performance +- **Flat DFA transition table** (Rust approach) — replaced double pointer chase + (`stateList[id].transitions[class]`) with flat array (`flatTrans[sid*stride+class]`). + Hot loop works with state ID only — no `*State` pointer in fast path. Applied to + all 6 DFA search functions. Inspired by Rust `Cache.trans` flat layout. + +- **4x loop unrolling** in `searchFirstAt` — process 4 bytes per iteration when + all transitions are in the flat table. Falls back to the single-byte slow path on special states. + +- **DFA integrated prefilter skip-ahead** (Rust approach) — when DFA returns to + start state with no match in progress, uses `prefilter.Find()` to skip ahead + instead of byte-by-byte scanning. Applied to `searchFirstAt` and `searchAt`. + Reference: Rust `hybrid/search.rs:232-258`. + `peak_hours`: 197ms → **90ms** (gap vs Rust: 9x → 4x). + +- **PikeVM integrated prefilter skip-ahead** — prefilter integrated inside PikeVM + search loop (`pikevm.rs:1293`). When NFA has no active threads, PikeVM jumps to + next candidate. Safe for partial-coverage prefilters. + +### Fixed +- **NFA candidate loop guard** — replaced `IsComplete()` with `partialCoverage` + flag. `IsComplete()` blocked ALL incomplete prefilters including prefix-only ones. + `errors` pattern: 1984ms → **80ms**. + +- **DFA prefilter skip for incomplete prefilters** — `IsComplete()` guard blocked + DFA prefilter skip-ahead for memmem/Teddy prefix-only prefilters. But DFA verifies + full pattern — skip is always safe. `sessions`: 229ms → **30ms**. 
+ ## [0.12.17] - 2026-03-23 ### Fixed @@ -39,6 +69,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Now allows UseTeddy when anchors are only `(?m)^` (no \b, $, etc). `http_methods` on macOS ARM64: 89ms → **<1ms** (restored to v0.12.14 level). +- **Fix NFA candidate loop guard** — `IsComplete()` guard blocked prefilter + candidate loop for ALL incomplete prefilters, including prefix-only ones + where all alternation branches are represented. Now uses `partialCoverage` + flag (set only on overflow truncation) instead of `IsComplete()`. Pattern + ` [5][0-9]{2} | [4][0-9]{2} ` (Kostya's `errors`): 1984ms → **109ms**. + Rust handles this by integrating prefilter as skip-ahead inside PikeVM + (not as an external correctness gate) — see `pikevm.rs:1293-1299`. + ## [0.12.16] - 2026-03-21 ### Performance diff --git a/README.md b/README.md index 355e187..5d3fd70 100644 --- a/README.md +++ b/README.md @@ -64,16 +64,16 @@ Cross-language benchmarks on 6MB input, AMD EPYC ([source](https://github.com/ko | Pattern | Go stdlib | coregex | Rust regex | vs stdlib | vs Rust | |---------|-----------|---------|------------|-----------|---------| -| Literal alternation | 475 ms | 4.4 ms | 0.6 ms | **108x** | 7.1x slower | -| Multi-literal | 1412 ms | 12.8 ms | 4.7 ms | **110x** | 2.7x slower | -| Inner `.*keyword.*` | 232 ms | 0.30 ms | 0.27 ms | **774x** | 1.1x slower | -| Suffix `.*\.txt` | 236 ms | 1.82 ms | 1.13 ms | **129x** | 1.6x slower | -| Multiline `(?m)^/.*\.php` | 103 ms | 0.50 ms | 0.67 ms | **206x** | **1.3x faster** | -| Email validation | 265 ms | 0.62 ms | 0.27 ms | **428x** | 2.2x slower | -| URL extraction | 353 ms | 0.65 ms | 0.35 ms | **543x** | 1.8x slower | -| IP address | 496 ms | 2.1 ms | 12.1 ms | **231x** | **5.6x faster** | -| Char class `[\w]+` | 581 ms | 51.2 ms | 50.2 ms | **11x** | ~parity | -| Word repeat `(\w{2,8})+` | 712 ms | 186 ms | 48.7 ms | **3x** | 3.8x slower | +| Literal alternation | 475 ms | 4.4 ms | 0.7 
ms | **109x** | 6.3x slower | +| Multi-literal | 1391 ms | 12.6 ms | 4.7 ms | **110x** | 2.6x slower | +| Inner `.*keyword.*` | 231 ms | 0.29 ms | 0.29 ms | **797x** | **~parity** | +| Suffix `.*\.txt` | 234 ms | 1.83 ms | 1.07 ms | **128x** | 1.7x slower | +| Multiline `(?m)^/.*\.php` | 103 ms | 0.66 ms | 0.66 ms | **156x** | **~parity** | +| Email validation | 261 ms | 0.54 ms | 0.31 ms | **482x** | 1.7x slower | +| URL extraction | 262 ms | 0.84 ms | 0.35 ms | **311x** | 2.4x slower | +| IP address | 498 ms | 2.1 ms | 12.0 ms | **237x** | **5.6x faster** | +| Char class `[\w]+` | 554 ms | 48.0 ms | 50.1 ms | **11x** | **1.0x faster** | +| Word repeat `(\w{2,8})+` | 641 ms | 185 ms | 48.7 ms | **3x** | 3.7x slower | **Where coregex excels:** - Multiline patterns (`(?m)^/.*\.php`) — near Rust parity, 100x+ vs stdlib diff --git a/ROADMAP.md b/ROADMAP.md index b1ba046..e9b34c2 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ > **Strategic Focus**: Production-grade regex engine with RE2/rust-regex level optimizations -**Last Updated**: 2026-03-20 | **Current Version**: v0.12.15 | **Target**: v1.0.0 stable +**Last Updated**: 2026-03-24 | **Current Version**: v0.12.18 | **Target**: v1.0.0 stable --- @@ -87,7 +87,13 @@ v0.12.13 ✅ → FatTeddy fix, prefilter acceleration, AC v0.2.1 ↓ v0.12.14 ✅ → Concurrent safety fix for isMatchDFA prefilter (#137) ↓ -v0.12.15 (Current) ✅ → Per-goroutine DFA cache, word boundary 30%→0.3% CPU, AC prefilter +v0.12.15 ✅ → Per-goroutine DFA cache, word boundary 30%→0.3% CPU, AC prefilter + ↓ +v0.12.16 ✅ → WrapLineAnchor for (?m)^ patterns + ↓ +v0.12.17 ✅ → Fix LogParser ARM64 regression, restore DFA/Teddy for (?m)^ + ↓ +v0.12.18 (Current) ✅ → Flat DFA transition table, integrated prefilter, PikeVM skip-ahead ↓ v1.0.0-rc → Feature freeze, API locked ↓ @@ -130,7 +136,10 @@ v1.0.0 STABLE → Production release with API stability guarantee - ✅ **v0.12.12**: Prefix trimming for case-fold literals - ✅ **v0.12.13**: FatTeddy fix (ANDL→ORL, 
VPTEST), prefilter acceleration, AC v0.2.1 - ✅ **v0.12.14**: Concurrent safety fix for isMatchDFA prefilter (#137) -- ✅ **v0.12.15**: Per-goroutine DFA cache (Rust approach), word boundary 30%→0.3% CPU, AC DFA prefilter for >32 literals (7-13x faster) +- ✅ **v0.12.15**: Per-goroutine DFA cache (Rust approach), word boundary 30%→0.3% CPU, 7 correctness fixes +- ✅ **v0.12.16**: WrapLineAnchor for (?m)^ patterns +- ✅ **v0.12.17**: Fix LogParser ARM64 regression — restore DFA/Teddy for (?m)^, partial prefilter +- ✅ **v0.12.18**: Flat DFA transition table (Rust approach), integrated prefilter skip-ahead in DFA+PikeVM, 4x unrolling — **35% faster than v0.12.14, 3x from Rust** --- diff --git a/dfa/lazy/cache.go b/dfa/lazy/cache.go index 2b4a10b..4044111 100644 --- a/dfa/lazy/cache.go +++ b/dfa/lazy/cache.go @@ -27,36 +27,50 @@ import ( // - After too many clears, falls back to NFA // - Clearing keeps allocated memory to avoid re-allocation type DFACache struct { - // states maps StateKey -> DFA State + // states maps StateKey -> DFA State (used only in determinize slow path) states map[StateKey]*State - // stateList provides O(1) lookup of states by ID via direct indexing. - // StateIDs are sequential (0, 1, 2...), so slice indexing is faster than map. - // This was previously DFA.states — moved here because it grows during search. + // stateList provides O(1) lookup of State structs by ID. + // Used only in slow path (determinize, word boundary, acceleration). + // Hot loop uses flatTrans + matchFlags instead. stateList []*State + // --- Flat transition table (Rust approach) --- + // Hot loop uses ONLY these fields — no *State pointer chase. + // + // Rust: cache.trans[sid + class] — single flat array, premultiplied ID. + // We use: flatTrans[int(sid)*stride + class] — same layout. + // + // This replaces per-state State.transitions[] in the hot loop: + // ONE slice access instead of TWO pointer chases (stateList → State → transitions). 
+ + // flatTrans is the flat transition table. + // Layout: [state0_c0, state0_c1, ..., state0_cN, state1_c0, ...] + // InvalidState (0xFFFFFFFF) = unknown transition (needs determinize). + flatTrans []StateID + + // matchFlags[stateID] = true if state is a match/accepting state. + // Replaces State.IsMatch() in hot loop — no pointer chase needed. + matchFlags []bool + + // stride is the number of byte equivalence classes (alphabet size). + stride int + // startTable caches start states for different look-behind contexts. - // This enables correct handling of assertions (^, \b, etc.) and - // avoids recomputing epsilon closures on every search. - // Previously lived on DFA — moved here because it is populated lazily. startTable StartTable // maxStates is the capacity limit maxStates uint32 // nextID is the next available state ID. - // Start at 1 (0 is reserved for StartState). nextID StateID - // clearCount tracks how many times the cache has been cleared during - // the current search. This is used to detect pathological cache thrashing - // and trigger NFA fallback when clears exceed the configured limit. - // Inspired by Rust regex-automata's hybrid DFA cache clearing strategy. + // clearCount tracks cache clear count for NFA fallback threshold. clearCount int - // Statistics for cache performance tuning - hits uint64 // Number of cache hits - misses uint64 // Number of cache misses + // Statistics + hits uint64 + misses uint64 } // Get retrieves a state by its key. @@ -95,9 +109,67 @@ func (c *DFACache) Insert(key StateKey, state *State) (StateID, error) { c.states[key] = state c.misses++ + // Grow flat transition table for this state's row (all InvalidState initially). 
+ if c.stride > 0 { + sid := int(state.id) + needed := (sid + 1) * c.stride + if needed > len(c.flatTrans) { + growth := needed - len(c.flatTrans) + for i := 0; i < growth; i++ { + c.flatTrans = append(c.flatTrans, InvalidState) + } + } + // Grow matchFlags + for len(c.matchFlags) <= sid { + c.matchFlags = append(c.matchFlags, false) + } + c.matchFlags[sid] = state.isMatch + } + return state.ID(), nil } +// safeOffset computes flat table offset, safe on 386 where int is 32-bit. +// StateID is uint32; on 386 int(0xFFFFFFFF) = -1 and uint multiply overflows. +// Returns MaxInt for special state IDs (DeadState, InvalidState) so bounds +// check (offset < ftLen) always fails safely. +func safeOffset(sid StateID, stride int, classIdx int) int { + if sid >= DeadState { + return int(^uint(0) >> 1) // MaxInt — always >= ftLen + } + return int(sid)*stride + classIdx +} + +// SetFlatTransition records a transition in the flat table. +// Called from determinize when a transition is computed. +func (c *DFACache) SetFlatTransition(fromID StateID, classIdx int, toID StateID) { + offset := safeOffset(fromID, c.stride, classIdx) + if offset < len(c.flatTrans) { + c.flatTrans[offset] = toID + } +} + +// FlatNext returns the next state ID from the flat table. +// Returns InvalidState if the transition hasn't been computed yet. +// This is the hot-path function — should be inlined by the compiler. +func (c *DFACache) FlatNext(sid StateID, classIdx int) StateID { + offset := int(sid)*c.stride + classIdx + return c.flatTrans[offset] +} + +// IsMatchState returns whether the given state ID is a match state. +// Uses compact matchFlags slice — no pointer chase. +func (c *DFACache) IsMatchState(sid StateID) bool { + if sid >= DeadState { + return false + } + id := int(sid) + if id >= len(c.matchFlags) { + return false + } + return c.matchFlags[id] +} + // GetOrInsert retrieves a state from cache or inserts it if not present. // This is the primary method used during DFA construction. 
// @@ -220,6 +292,11 @@ func (c *DFACache) getState(id StateID) *State { return nil } + // Guard against special state IDs (DeadState=0xFFFFFFFE, InvalidState=0xFFFFFFFF). + // On 386, int(uint32(0xFFFFFFFF)) = -1, causing negative index panic. + if id >= DeadState { + return nil + } idx := int(id) if idx >= len(c.stateList) { return nil diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go index 4c4a8c8..07672cd 100644 --- a/dfa/lazy/lazy.go +++ b/dfa/lazy/lazy.go @@ -106,9 +106,13 @@ func (d *DFA) NewCache() *DFACache { // Start small — grow on demand. Pre-allocating MaxStates (10,000) wastes // ~400KB per cache and dominates cold-start cost for pooled caches. const initCap = 64 + stride := d.AlphabetLen() return &DFACache{ states: make(map[StateKey]*State, initCap), stateList: make([]*State, 0, initCap), + flatTrans: make([]StateID, 0, initCap*stride), + matchFlags: make([]bool, 0, initCap), + stride: stride, startTable: newStartTableFromByteMap(&d.startByteMap), maxStates: d.config.MaxStates, nextID: StartState + 1, @@ -264,20 +268,36 @@ func (d *DFA) SearchAtAnchored(cache *DFACache, haystack []byte, at int) int { lastMatch = at } + sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + for pos := at; pos < len(haystack); pos++ { b := haystack[pos] - // O(1) word boundary match check using pre-computed flags (was 30% CPU). - // matchAtWordBoundary/matchAtNonWordBoundary computed during determinize. 
- if d.hasWordBoundary && currentState.checkWordBoundaryFast(b) { - return pos + if d.hasWordBoundary { + st := cache.getState(sid) + if st != nil && st.checkWordBoundaryFast(b) { + return pos + } + } + + classIdx := int(d.byteToClass(b)) + offset := safeOffset(sid, stride, classIdx) + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState } - // Convert byte to equivalence class for transition lookup - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: + switch nextID { + case InvalidState: + currentState = cache.getState(sid) + if currentState == nil { + return d.nfaFallback(haystack, at) + } nextState, err := d.determinize(cache, currentState, b) if err != nil { if isCacheCleared(err) { @@ -285,7 +305,10 @@ func (d *DFA) SearchAtAnchored(cache *DFACache, haystack []byte, at int) int { if currentState == nil { return d.nfaFallback(haystack, at) } - pos-- // Will be incremented by for-loop + sid = currentState.id + ft = cache.flatTrans + ftLen = len(ft) + pos-- continue } return d.nfaFallback(haystack, at) @@ -293,24 +316,24 @@ func (d *DFA) SearchAtAnchored(cache *DFACache, haystack []byte, at int) int { if nextState == nil { return lastMatch } - currentState = nextState + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) - case nextID == DeadState: + case DeadState: return lastMatch default: - currentState = cache.getState(nextID) - if currentState == nil { - return d.nfaFallback(haystack, at) - } + sid = nextID } - if currentState.IsMatch() { + if cache.IsMatchState(sid) { lastMatch = pos + 1 } } - if d.checkEOIMatch(currentState) { + eoi := cache.getState(sid) + if eoi != nil && d.checkEOIMatch(eoi) { return len(haystack) } @@ -348,17 +371,17 @@ func (d *DFA) SearchFirstAt(cache *DFACache, haystack []byte, at int) int { // searchFirstAt is the core DFA search with early termination after first match. 
// Returns the end of the first match found, without extending for longest match. -func (d *DFA) searchFirstAt(cache *DFACache, haystack []byte, startPos int) int { +func (d *DFA) searchFirstAt(cache *DFACache, haystack []byte, startPos int) int { //nolint:funlen,maintidx // 4x unrolled hot loop with integrated prefilter if d.isAlwaysAnchored && startPos > 0 { return -1 } - currentState := d.getStartStateForUnanchored(cache, haystack, startPos) - if currentState == nil { + startState := d.getStartStateForUnanchored(cache, haystack, startPos) + if startState == nil { return d.nfaFallback(haystack, startPos) } - if currentState.IsMatch() { + if startState.IsMatch() { return startPos } @@ -367,47 +390,182 @@ func (d *DFA) searchFirstAt(cache *DFACache, haystack []byte, startPos int) int committed := false lastMatch := -1 + // Hot loop: flat transition table (Rust approach). + // Work with state ID only — no *State pointer chase in fast path. + // State struct needed only for: determinize (slow), word boundary (guarded). + sid := startState.id + ft := cache.flatTrans + stride := cache.stride + + // Bounds hint for compiler — eliminates repeated len checks in loop. + if len(ft) > 0 { + _ = ft[len(ft)-1] + } + + // 4x unrolled hot loop (Rust approach: hybrid/search.rs:195-221). + canUnroll := !d.hasWordBoundary + ftLen := len(ft) + startSID := startState.id + hasPre := d.prefilter != nil + for pos < end { - b := haystack[pos] + // Prefilter skip-ahead: when DFA is at start state with no match + // in progress, use prefilter to jump to next candidate position. + // This is the Rust approach (hybrid/search.rs:232-258). + // Eliminates byte-by-byte scanning between matches. 
+ if hasPre && sid == startSID && !committed && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return lastMatch // No more candidates + } + if candidate > pos { + pos = candidate + // Re-obtain start state at new position (context may differ) + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, startPos) + } + sid = newStart.id + startSID = sid + ft = cache.flatTrans + ftLen = len(ft) + } + } - if d.hasWordBoundary && d.checkWordBoundaryMatch(currentState, b) { - return pos + // === 4x UNROLLED FAST PATH === + if canUnroll && pos+3 < end { + // Transition 1 + o1 := safeOffset(sid, stride, int(d.byteToClass(haystack[pos]))) + if o1 >= ftLen { + goto searchFirstSlowPath + } + n1 := ft[o1] + if n1 >= DeadState { // DeadState or InvalidState + goto searchFirstSlowPath + } + pos++ + if cache.matchFlags[int(n1)] { + lastMatch = pos + committed = true + } else if committed { + return lastMatch + } + + // Transition 2 + o2 := safeOffset(n1, stride, int(d.byteToClass(haystack[pos]))) + if o2 >= ftLen { + sid = n1 + goto searchFirstSlowPath + } + n2 := ft[o2] + if n2 >= DeadState { + sid = n1 + goto searchFirstSlowPath + } + pos++ + if cache.matchFlags[int(n2)] { + lastMatch = pos + committed = true + } else if committed { + return lastMatch + } + + // Transition 3 + o3 := safeOffset(n2, stride, int(d.byteToClass(haystack[pos]))) + if o3 >= ftLen { + sid = n2 + goto searchFirstSlowPath + } + n3 := ft[o3] + if n3 >= DeadState { + sid = n2 + goto searchFirstSlowPath + } + pos++ + if cache.matchFlags[int(n3)] { + lastMatch = pos + committed = true + } else if committed { + return lastMatch + } + + // Transition 4 + o4 := safeOffset(n3, stride, int(d.byteToClass(haystack[pos]))) + if o4 >= ftLen { + sid = n3 + goto searchFirstSlowPath + } + n4 := ft[o4] + if n4 >= DeadState { + sid = n3 + goto searchFirstSlowPath + } + pos++ + sid = n4 + if cache.matchFlags[int(n4)] { + 
lastMatch = pos + committed = true + } else if committed { + return lastMatch + } + + continue } - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: - nextState, err := d.determinize(cache, currentState, b) + searchFirstSlowPath: + // === SINGLE-BYTE SLOW PATH === + if d.hasWordBoundary { + st := cache.getState(sid) + if st != nil && st.checkWordBoundaryFast(haystack[pos]) { + return pos + } + } + + classIdx := int(d.byteToClass(haystack[pos])) + offset := safeOffset(sid, stride, classIdx) + + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState + } + + switch nextID { + case InvalidState: + currentState := cache.getState(sid) + if currentState == nil { + return d.nfaFallback(haystack, startPos) + } + nextState, err := d.determinize(cache, currentState, haystack[pos]) if err != nil { return d.nfaFallback(haystack, startPos) } if nextState == nil { return lastMatch } - currentState = nextState - case nextID == DeadState: + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) + case DeadState: return lastMatch default: - currentState = cache.getState(nextID) - if currentState == nil { - return d.nfaFallback(haystack, startPos) - } + sid = nextID } pos++ - if currentState.IsMatch() { + if cache.IsMatchState(sid) { lastMatch = pos committed = true } else if committed { - // First match is committed and we left the match state. - // Return immediately — don't extend for longest match. 
return lastMatch } } - if d.checkEOIMatch(currentState) { + // EOI match check (needs State struct — slow path) + eoi := cache.getState(sid) + if eoi != nil && d.checkEOIMatch(eoi) { return len(haystack) } @@ -485,48 +643,59 @@ func (d *DFA) isMatchWithPrefilter(cache *DFACache, haystack []byte) bool { return true } - // Integrated prefilter+DFA loop: single scan, prefilter on dead state + // Integrated prefilter+DFA loop with flat table (Rust approach) endPos := len(haystack) + sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + for pos < endPos { - b := haystack[pos] + // Word boundary check (slow path) + if d.hasWordBoundary { + st := cache.getState(sid) + if st != nil && st.checkWordBoundaryFast(haystack[pos]) { + return true + } + } - // Word boundary check - if d.hasWordBoundary && currentState.checkWordBoundaryFast(b) { - return true + classIdx := int(d.byteToClass(haystack[pos])) + offset := safeOffset(sid, stride, classIdx) + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState } - // Get next state - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: - // Determinize on demand - nextState, err := d.determinize(cache, currentState, b) + switch nextID { + case InvalidState: + currentState = cache.getState(sid) + if currentState == nil { + start, end, matched := d.pikevm.SearchAt(haystack, pos) + return matched && start >= 0 && end >= start + } + nextState, err := d.determinize(cache, currentState, haystack[pos]) if err != nil { - // Cache error — NFA fallback from current position start, end, matched := d.pikevm.SearchAt(haystack, pos) return matched && start >= 0 && end >= start } if nextState == nil { - // Dead state — skip ahead with prefilter goto pfSkip } - currentState = nextState + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) - case nextID == DeadState: - // Dead state — skip ahead with prefilter + 
case DeadState: goto pfSkip default: - currentState = cache.getState(nextID) - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, pos) - return matched && start >= 0 && end >= start - } + sid = nextID } pos++ - if currentState.IsMatch() { + if cache.IsMatchState(sid) { return true } continue @@ -541,16 +710,23 @@ func (d *DFA) isMatchWithPrefilter(cache *DFACache, haystack []byte) bool { pos = candidate // Restart DFA at new candidate with anchored start state - currentState = d.getStartState(cache, haystack, pos, true) - if currentState == nil { + newStart := d.getStartState(cache, haystack, pos, true) + if newStart == nil { return d.isMatchWithPrefilterFallback(cache, haystack, pos) } - if currentState.IsMatch() { + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + if newStart.IsMatch() { return true } } - return d.checkEOIMatch(currentState) + eoi := cache.getState(sid) + if eoi != nil { + return d.checkEOIMatch(eoi) + } + return false } // isMatchWithPrefilterFallback is the old two-pass approach used when @@ -610,77 +786,100 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int endPos := len(haystack) pos := startPos + // Hot loop: flat transition table (Rust approach). + // Work with state ID only — no *State pointer chase in fast path. + sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + + // Bounds hint for compiler — eliminates repeated len checks in loop. + if ftLen > 0 { + _ = ft[ftLen-1] + } + for pos < endPos { // === 4x UNROLLED FAST PATH (earliest match) === // For IsMatch(), we return true on ANY match, so no leftmost-longest tracking. // This is even simpler than searchAt: just check isMatch after each transition. - if canUnroll && !currentState.IsAccelerable() && pos+3 < endPos { + if canUnroll && pos+3 < endPos { + // Check acceleration on slow→fast transition (once per entry). 
+ accelState := cache.getState(sid) + if accelState != nil && accelState.IsAccelerable() { + goto earliestSlowPath + } + // Transition 1 - nextID := currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o1 := safeOffset(sid, stride, int(d.byteToClass(haystack[pos]))) + if o1 >= ftLen { goto earliestSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, startPos) - return matched && start >= 0 && end >= start + n1 := ft[o1] + if n1 >= DeadState { + goto earliestSlowPath } pos++ - if currentState.isMatch { + if cache.matchFlags[int(n1)] { return true } // Check remaining bounds for subsequent transitions if pos+2 >= endPos { + sid = n1 goto earliestSlowPath } // Transition 2 - nextID = currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o2 := safeOffset(n1, stride, int(d.byteToClass(haystack[pos]))) + if o2 >= ftLen { + sid = n1 goto earliestSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, startPos) - return matched && start >= 0 && end >= start + n2 := ft[o2] + if n2 >= DeadState { + sid = n1 + goto earliestSlowPath } pos++ - if currentState.isMatch { + if cache.matchFlags[int(n2)] { return true } if pos+1 >= endPos { + sid = n2 goto earliestSlowPath } // Transition 3 - nextID = currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o3 := safeOffset(n2, stride, int(d.byteToClass(haystack[pos]))) + if o3 >= ftLen { + sid = n2 goto earliestSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, startPos) - return matched && start >= 0 && end >= start + n3 := ft[o3] + if n3 >= DeadState { + sid = n2 + goto earliestSlowPath } pos++ - if currentState.isMatch { + if cache.matchFlags[int(n3)] { return true } // Transition 
4 - nextID = currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o4 := safeOffset(n3, stride, int(d.byteToClass(haystack[pos]))) + if o4 >= ftLen { + sid = n3 goto earliestSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, startPos) - return matched && start >= 0 && end >= start + n4 := ft[o4] + if n4 >= DeadState { + sid = n3 + goto earliestSlowPath } pos++ - if currentState.isMatch { + sid = n4 + if cache.matchFlags[int(n4)] { return true } @@ -694,6 +893,11 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int } // Try lazy acceleration detection if not yet checked + currentState = cache.getState(sid) + if currentState == nil { + start, end, matched := d.pikevm.SearchAt(haystack, startPos) + return matched && start >= 0 && end >= start + } d.tryDetectAcceleration(currentState) // State acceleration: if current state is accelerable, use SIMD to skip ahead @@ -716,17 +920,23 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int return true } - // Get next state (convert byte to class for transition lookup) - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: + // Flat table lookup for transition + classIdx := int(d.byteToClass(b)) + offset := safeOffset(sid, stride, classIdx) + + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState + } + + switch nextID { + case InvalidState: // Determinize on demand nextState, err := d.determinize(cache, currentState, b) if err != nil { // Cache cleared or full — fall back to NFA from original start position. - // After cache clear, DFA state context is lost and restarting mid-search - // can miss matches for unanchored patterns. NFA fallback is correct and fast. 
start, end, matched := d.pikevm.SearchAt(haystack, startPos) return matched && start >= 0 && end >= start } @@ -734,25 +944,22 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int // Dead state - no match possible from here return false } - currentState = nextState + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) - case nextID == DeadState: + case DeadState: // Dead state - no match possible from here return false default: - currentState = cache.getState(nextID) - if currentState == nil { - // State not in cache - fallback to NFA - start, end, matched := d.pikevm.SearchAt(haystack, pos) - return matched && start >= 0 && end >= start - } + sid = nextID } pos++ // Early termination: return true immediately on any match - if currentState.IsMatch() { + if cache.IsMatchState(sid) { return true } } @@ -762,7 +969,8 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int // that are satisfied at end-of-input. // Example: pattern `test\b` matching "test" - the \b is satisfied at EOI // because prev='t'(word), next=none(non-word) → word boundary. - return d.checkEOIMatch(currentState) + eoi := cache.getState(sid) + return eoi != nil && d.checkEOIMatch(eoi) } // searchEarliestMatchAnchored performs ANCHORED DFA search with early termination. @@ -791,33 +999,57 @@ func (d *DFA) searchEarliestMatchAnchored(cache *DFACache, haystack []byte, star return true } + // Hot loop: flat transition table (Rust approach). + // Work with state ID only — no *State pointer chase in fast path. + sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + // Scan input byte by byte with early termination for pos := startPos; pos < len(haystack); pos++ { b := haystack[pos] // O(1) word boundary match check using pre-computed flags (was 30% CPU). // matchAtWordBoundary/matchAtNonWordBoundary computed during determinize. 
- if d.hasWordBoundary && currentState.checkWordBoundaryFast(b) { - return true + if d.hasWordBoundary { + st := cache.getState(sid) + if st != nil && st.checkWordBoundaryFast(b) { + return true + } } - // Get next state - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: + // Flat table lookup for transition + classIdx := int(d.byteToClass(b)) + offset := safeOffset(sid, stride, classIdx) + + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState + } + + switch nextID { + case InvalidState: + currentState = cache.getState(sid) + if currentState == nil { + start, end, matched := d.pikevm.SearchAt(haystack, startPos) + return matched && start == startPos && end >= start + } nextState, err := d.determinize(cache, currentState, b) if err != nil { if isCacheCleared(err) { // Cache was cleared. For anchored search, re-obtain // the anchored start state at current position. - // Note: this is imprecise for anchored search since we lose - // DFA context, but it's still correct because we'll rebuild. 
currentState = d.getStartState(cache, haystack, pos, true) if currentState == nil { start, end, matched := d.pikevm.SearchAt(haystack, startPos) return matched && start == startPos && end >= start } + sid = currentState.id + ft = cache.flatTrans + ftLen = len(ft) // Re-process this byte with the new state (pos not incremented by for-loop yet) pos-- // Will be incremented by for-loop continue @@ -828,25 +1060,24 @@ func (d *DFA) searchEarliestMatchAnchored(cache *DFACache, haystack []byte, star if nextState == nil { return false } - currentState = nextState + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) - case nextID == DeadState: + case DeadState: return false default: - currentState = cache.getState(nextID) - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, startPos) - return matched && start == startPos && end >= start - } + sid = nextID } - if currentState.IsMatch() { + if cache.IsMatchState(sid) { return true } } - return d.checkEOIMatch(currentState) + eoi := cache.getState(sid) + return eoi != nil && d.checkEOIMatch(eoi) } // findWithPrefilterAt searches using prefilter to accelerate unanchored search. @@ -874,121 +1105,134 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) lastMatch := -1 committed := false // True once we've entered a match state + sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + startSID := sid + if currentState.IsMatch() { - lastMatch = pos // Empty match at start + lastMatch = pos committed = true } for pos < len(haystack) { - b := haystack[pos] + if d.hasWordBoundary { + st := cache.getState(sid) + if st != nil && d.checkWordBoundaryMatch(st, haystack[pos]) { + return pos + } + } - // Check if word boundary would result in a match BEFORE consuming the byte. - // This handles patterns like `test\b` where after matching "test", - // the next byte '!' creates a word boundary that satisfies \b. 
- // Skip this expensive check for patterns without word boundaries. - if d.hasWordBoundary && d.checkWordBoundaryMatch(currentState, b) { - return pos // Return current position as match end + classIdx := int(d.byteToClass(haystack[pos])) + offset := safeOffset(sid, stride, classIdx) + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState } - // Get next state (convert byte to class for transition lookup) - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - var nextState *State - switch { - case !ok: - // Determinize on demand - var err error - nextState, err = d.determinize(cache, currentState, b) + switch nextID { + case InvalidState: + currentState = cache.getState(sid) + if currentState == nil { + return d.nfaFallback(haystack, 0) + } + nextState, err := d.determinize(cache, currentState, haystack[pos]) if err != nil { if isCacheCleared(err) { - // Cache was cleared. Re-obtain start state and continue. - currentState = d.getStartStateForUnanchored(cache, haystack, pos) - if currentState == nil { + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { return d.nfaFallback(haystack, 0) } + sid = newStart.id + startSID = sid + ft = cache.flatTrans + ftLen = len(ft) committed = lastMatch >= 0 continue } return d.nfaFallback(haystack, 0) } if nextState == nil { - // Dead state - return last match if we had one + // Dead state — prefilter skip if lastMatch != -1 { return lastMatch } - // No match yet - find next candidate pos++ candidate = d.prefilter.Find(haystack, pos) if candidate == -1 { return -1 } pos = candidate - // Get context-aware start state based on look-behind at new position - currentState = d.getStartStateForUnanchored(cache, haystack, pos) - if currentState == nil { + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { return d.nfaFallback(haystack, 0) } + sid = newStart.id + startSID = sid + ft = 
cache.flatTrans + ftLen = len(ft) lastMatch = -1 committed = false - if currentState.IsMatch() { + if newStart.IsMatch() { lastMatch = pos committed = true } continue } - case nextID == DeadState: - // Dead state - return last match if we had one + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) + + case DeadState: if lastMatch != -1 { return lastMatch } - // No match yet - find next candidate pos++ candidate = d.prefilter.Find(haystack, pos) if candidate == -1 { return -1 } pos = candidate - // Get context-aware start state based on look-behind at new position - currentState = d.getStartStateForUnanchored(cache, haystack, pos) - if currentState == nil { + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { return d.nfaFallback(haystack, 0) } + sid = newStart.id + startSID = sid + ft = cache.flatTrans + ftLen = len(ft) lastMatch = -1 committed = false - if currentState.IsMatch() { + if newStart.IsMatch() { lastMatch = pos committed = true } continue + default: - nextState = cache.getState(nextID) - if nextState == nil { - return d.nfaFallback(haystack, 0) - } + sid = nextID } pos++ - currentState = nextState - // Track match state and enforce leftmost semantics - if currentState.IsMatch() { + if cache.IsMatchState(sid) { lastMatch = pos committed = true } else if committed { - // We were in a match but now we're not - return leftmost match return lastMatch } - // If back in start state (unanchored prefix self-loop), use prefilter to skip - // Only do this if we haven't committed to a match yet - if !committed && currentState.ID() == StartState && pos < len(haystack) { + // Start state prefilter skip-ahead + if !committed && sid == startSID && pos < len(haystack) { candidate = d.prefilter.Find(haystack, pos) if candidate == -1 { return -1 } if candidate > pos { pos = candidate - // Stay in start state } } } @@ -996,25 +1240,14 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) // Reached 
end of input. // Check if there's a match at EOI due to pending word boundary assertions. // Example: pattern `test\b` matching "test" - the \b is satisfied at EOI. - if d.checkEOIMatch(currentState) { + eoi := cache.getState(sid) + if eoi != nil && d.checkEOIMatch(eoi) { return len(haystack) } - // Return last match position (if any) return lastMatch } -// isSpecialStateID returns true if the given state ID requires special handling. -// A state ID is "special" if it's not a normal cached transition target: -// it's either InvalidState (needs determinization), DeadState, or missing from the cache. -// This is used by the 4x unrolled search loop to batch transitions and only -// check for special cases every 4 bytes. -func isSpecialStateID(id StateID) bool { - // DeadState (0xFFFFFFFE) and InvalidState (0xFFFFFFFF) are both in the high range. - // Normal states are sequential from 0, so any ID >= DeadState is special. - return id >= DeadState -} - // isCacheCleared checks if an error from determinize() is the cache-cleared signal. // When true, the search loop must re-obtain the current state from the start state // at the current position and continue searching. @@ -1082,36 +1315,65 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n end := len(haystack) pos := startPos + // Hot loop: flat transition table (Rust approach). + // Work with state ID only — no *State pointer chase in fast path. + // State struct needed only for: determinize (slow), word boundary (guarded), acceleration. + sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + + // Bounds hint for compiler — eliminates repeated len checks in loop. 
+ if ftLen > 0 { + _ = ft[ftLen-1] + } + + startSID := currentState.id + hasPre := d.prefilter != nil + for pos < end { + // Prefilter skip-ahead at start state (Rust hybrid/search.rs:232-258) + if hasPre && sid == startSID && !committed && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return lastMatch + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, startPos) + } + sid = newStart.id + startSID = sid + ft = cache.flatTrans + ftLen = len(ft) + } + } + // === 4x UNROLLED FAST PATH === // Process 4 transitions per iteration when conditions allow. - // This reduces branch mispredictions and enables better instruction pipelining. - // We only enter this path when: - // 1. Pattern has no word boundaries (no per-byte boundary checks needed) - // 2. Not yet committed to a match (no per-byte leftmost-longest tracking needed) - // 3. State is not accelerable (acceleration is a better optimization) - // 4. Enough bytes remain for a full 4-byte batch - if canUnroll && !committed && !currentState.IsAccelerable() && pos+3 < end { + if canUnroll && !committed && pos+3 < end { + // Check acceleration on slow→fast transition (once per entry). + accelState := cache.getState(sid) + if accelState != nil && accelState.IsAccelerable() { + goto slowPath + } + // Transition 1 - // Direct field access to transitions[] avoids method call overhead. - // isSpecialStateID check covers both InvalidState (needs determinize) - // and DeadState. If special, currentState and pos are unchanged, - // so the slow path re-processes this byte correctly. 
- nextID := currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o1 := safeOffset(sid, stride, int(d.byteToClass(haystack[pos]))) + if o1 >= ftLen { goto slowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallback(haystack, startPos) + n1 := ft[o1] + if n1 >= DeadState { + goto slowPath } pos++ - // After each transition, check for match state. - // If match found, record it and exit to slow path for leftmost-longest tracking. - // Also exit if not enough bytes remain for the remaining transitions. - if currentState.isMatch || pos+2 >= end { - if currentState.isMatch { + if cache.matchFlags[int(n1)] || pos+2 >= end { + sid = n1 + if cache.matchFlags[int(n1)] { lastMatch = pos committed = true } @@ -1119,18 +1381,21 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n } // Transition 2 - nextID = currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o2 := safeOffset(n1, stride, int(d.byteToClass(haystack[pos]))) + if o2 >= ftLen { + sid = n1 goto slowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallback(haystack, startPos) + n2 := ft[o2] + if n2 >= DeadState { + sid = n1 + goto slowPath } pos++ - if currentState.isMatch || pos+1 >= end { - if currentState.isMatch { + if cache.matchFlags[int(n2)] || pos+1 >= end { + sid = n2 + if cache.matchFlags[int(n2)] { lastMatch = pos committed = true } @@ -1138,134 +1403,120 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n } // Transition 3 - nextID = currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o3 := safeOffset(n2, stride, int(d.byteToClass(haystack[pos]))) + if o3 >= ftLen { + sid = n2 goto slowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallback(haystack, startPos) + n3 := ft[o3] + if n3 >= DeadState { + 
sid = n2 + goto slowPath } pos++ - if currentState.isMatch { + if cache.matchFlags[int(n3)] { + sid = n3 lastMatch = pos committed = true goto slowPath } // Transition 4 - nextID = currentState.transitions[d.byteToClass(haystack[pos])] - if isSpecialStateID(nextID) { + o4 := safeOffset(n3, stride, int(d.byteToClass(haystack[pos]))) + if o4 >= ftLen { + sid = n3 goto slowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallback(haystack, startPos) + n4 := ft[o4] + if n4 >= DeadState { + sid = n3 + goto slowPath } pos++ + sid = n4 - // After all 4 transitions: check for match - if currentState.isMatch { + if cache.matchFlags[int(n4)] { lastMatch = pos committed = true } - // Loop back to try another batch of 4 continue } slowPath: - // === SINGLE-BYTE SLOW PATH === - // Handles all edge cases: word boundaries, acceleration, determinization, - // dead states, committed match tracking. if pos >= end { break } - // Try lazy acceleration detection if not yet checked + // Resolve State for slow path (acceleration, word boundary, determinize). + currentState = cache.getState(sid) + if currentState == nil { + return d.nfaFallback(haystack, startPos) + } d.tryDetectAcceleration(currentState) - // State acceleration: if current state is accelerable, use SIMD to skip ahead if exitBytes := currentState.AccelExitBytes(); len(exitBytes) > 0 { nextPos := d.accelerate(haystack, pos, exitBytes) if nextPos == -1 { - // No exit byte found in remainder - no match possible from here return lastMatch } - // Skip to the exit byte position pos = nextPos } b := haystack[pos] - // Check if word boundary would result in a match BEFORE consuming the byte. - // This handles patterns like `test\b` where after matching "test", - // the next byte '!' creates a word boundary that satisfies \b. - // Skip this expensive check for patterns without word boundaries. 
if d.hasWordBoundary && d.checkWordBoundaryMatch(currentState, b) { - return pos // Return current position as match end + return pos } - // Check if current state has a transition for this byte - // Convert byte to equivalence class for transition lookup - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: - // No cached transition: determinize on-demand + // Flat table lookup for transition + classIdx := int(d.byteToClass(b)) + offset := safeOffset(sid, stride, classIdx) + + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState + } + + switch nextID { + case InvalidState: nextState, err := d.determinize(cache, currentState, b) if err != nil { - // Cache cleared or full — fall back to NFA from original start position. - // After cache clear, DFA state context is lost and restarting mid-search - // can miss matches for unanchored patterns. NFA fallback is always correct. return d.nfaFallback(haystack, startPos) } - - // Check for dead state (no possible transitions) if nextState == nil { - // Dead state: return last match (if any) return lastMatch } - - currentState = nextState - case nextID == DeadState: - // Cached dead state: return last match (if any) + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) + case DeadState: return lastMatch default: - // Cached transition: follow it - currentState = cache.getState(nextID) - if currentState == nil { - // State not in cache? Shouldn't happen, fall back - return d.nfaFallback(haystack, startPos) - } + sid = nextID } pos++ - // Track match state for leftmost-longest semantics - if currentState.IsMatch() { + if cache.IsMatchState(sid) { lastMatch = pos committed = true } else if committed { - // We were in a match but now we're not. - // Check if any pattern threads are still active (could extend the match). - // If only fresh starts or unanchored machinery remain, return the committed match. 
- if !d.hasInProgressPattern(currentState) { + currentState = cache.getState(sid) + if currentState == nil || !d.hasInProgressPattern(currentState) { return lastMatch } - // Pattern threads still active - continue to find potential longer match } } - // Reached end of input. - // Check if there's a match at EOI due to pending word boundary assertions. - // Example: pattern `test\b` matching "test" - the \b is satisfied at EOI. - if d.checkEOIMatch(currentState) { + eoi := cache.getState(sid) + if eoi != nil && d.checkEOIMatch(eoi) { return len(haystack) } - // Return last match position (if any) return lastMatch } @@ -1308,9 +1559,7 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // Cache the dead state transition to avoid re-computation // Use classIdx for transition storage (compressed alphabet) current.AddTransition(classIdx, DeadState) - // Return nil state with NO error - dead state is NOT an error condition. - // This follows the documented behavior: (nil, nil) for dead state. - // Returning an error here would incorrectly trigger NFA fallback. 
+ cache.SetFlatTransition(current.id, int(classIdx), DeadState) return nil, nil //nolint:nilnil // dead state is valid, not an error } @@ -1337,6 +1586,7 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // Cache hit: reuse existing state // Use classIdx for transition storage (compressed alphabet) current.AddTransition(classIdx, existing.ID()) + cache.SetFlatTransition(current.id, int(classIdx), existing.ID()) return existing, nil } @@ -1375,6 +1625,7 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // Add transition from current state to new state // Use classIdx for transition storage (compressed alphabet) current.AddTransition(classIdx, newState.ID()) + cache.SetFlatTransition(current.id, int(classIdx), newState.ID()) return newState, nil } @@ -1675,141 +1926,156 @@ func (d *DFA) byteToClass(b byte) byte { // Returns the position where a match ends (scanning backward), or -1 if no match. // For reverse search, a "match" means the reverse DFA reached a match state, // which corresponds to finding the START of a match in the original direction. -func (d *DFA) SearchReverse(cache *DFACache, haystack []byte, start, end int) int { // Reverse DFA search with 4x unrolling +func (d *DFA) SearchReverse(cache *DFACache, haystack []byte, start, end int) int { //nolint:funlen // 4x unrolled reverse DFA search if end <= start || end > len(haystack) { return -1 } // Get start state for reverse search - // For reverse DFA, we start from what would be "end of match" in forward direction currentState := d.getStartStateForReverse(cache, haystack, end) if currentState == nil { return d.nfaFallbackReverse(haystack, start, end) } - // Track last match position (in reverse, this is the START of match) lastMatch := -1 - // Check if start state is already a match (empty match case) if currentState.IsMatch() { lastMatch = end } at := end - 1 + // Hot loop: flat transition table (Rust approach). 
+ sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + + if ftLen > 0 { + _ = ft[ftLen-1] + } + // === 4x UNROLLED REVERSE LOOP === - // Process 4 transitions per iteration going backward. - // The reverse search has no word boundary or acceleration concerns, - // so the unrolled loop is simpler than the forward search. + // offset/nextSID declared before loop to avoid goto-over-declaration. + var revOff int + var nextSID StateID for at >= start+3 { // Transition 1 (from at, going backward) - nextID := currentState.transitions[d.byteToClass(haystack[at])] - if isSpecialStateID(nextID) { + revOff = safeOffset(sid, stride, int(d.byteToClass(haystack[at]))) + if revOff >= ftLen { goto reverseSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallbackReverse(haystack, start, end) + nextSID = ft[revOff] + if nextSID >= DeadState { + goto reverseSlowPath } - if currentState.isMatch { + if cache.matchFlags[int(nextSID)] { lastMatch = at } + sid = nextSID at-- // Transition 2 - nextID = currentState.transitions[d.byteToClass(haystack[at])] - if isSpecialStateID(nextID) { + revOff = safeOffset(sid, stride, int(d.byteToClass(haystack[at]))) + if revOff >= ftLen { goto reverseSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallbackReverse(haystack, start, end) + nextSID = ft[revOff] + if nextSID >= DeadState { + goto reverseSlowPath } - if currentState.isMatch { + if cache.matchFlags[int(nextSID)] { lastMatch = at } + sid = nextSID at-- // Transition 3 - nextID = currentState.transitions[d.byteToClass(haystack[at])] - if isSpecialStateID(nextID) { + revOff = safeOffset(sid, stride, int(d.byteToClass(haystack[at]))) + if revOff >= ftLen { goto reverseSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallbackReverse(haystack, start, end) + nextSID = ft[revOff] + if nextSID >= DeadState { + goto 
reverseSlowPath } - if currentState.isMatch { + if cache.matchFlags[int(nextSID)] { lastMatch = at } + sid = nextSID at-- // Transition 4 - nextID = currentState.transitions[d.byteToClass(haystack[at])] - if isSpecialStateID(nextID) { + revOff = safeOffset(sid, stride, int(d.byteToClass(haystack[at]))) + if revOff >= ftLen { goto reverseSlowPath } - currentState = cache.stateList[int(nextID)] - if currentState == nil { - return d.nfaFallbackReverse(haystack, start, end) + nextSID = ft[revOff] + if nextSID >= DeadState { + goto reverseSlowPath } - if currentState.isMatch { + if cache.matchFlags[int(nextSID)] { lastMatch = at } + sid = nextSID at-- continue reverseSlowPath: - // A special state ID was encountered in the unrolled loop. - // Fall through to the single-byte loop below for proper handling. break } // === SINGLE-BYTE REVERSE TAIL LOOP === - // Handles remaining bytes (0-3) after the unrolled loop, plus any bytes - // that need determinization or hit dead states. for at >= start { b := haystack[at] - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: - // Determinize on demand + classIdx := int(d.byteToClass(b)) + offset := safeOffset(sid, stride, classIdx) + + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState + } + + switch nextID { + case InvalidState: + currentState = cache.getState(sid) + if currentState == nil { + return d.nfaFallbackReverse(haystack, start, end) + } nextState, err := d.determinize(cache, currentState, b) if err != nil { if isCacheCleared(err) { - // Cache was cleared. Re-obtain start state for reverse search. 
currentState = d.getStartStateForReverse(cache, haystack, at+1) if currentState == nil { return d.nfaFallbackReverse(haystack, start, end) } - // Re-process this byte with the new state + sid = currentState.id + ft = cache.flatTrans + ftLen = len(ft) continue } return d.nfaFallbackReverse(haystack, start, end) } if nextState == nil { - // Dead state - return last match if we had one return lastMatch } - currentState = nextState + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) - case nextID == DeadState: - // Dead state - return last match if we had one + case DeadState: return lastMatch default: - currentState = cache.getState(nextID) - if currentState == nil { - return d.nfaFallbackReverse(haystack, start, end) - } + sid = nextID } - // Track match state - if currentState.IsMatch() { - lastMatch = at // Position where match starts (in forward direction) + if cache.IsMatchState(sid) { + lastMatch = at } at-- @@ -1848,34 +2114,47 @@ func (d *DFA) SearchReverseLimited(cache *DFACache, haystack []byte, start, end, return -1 } - // Get start state for reverse search currentState := d.getStartStateForReverse(cache, haystack, end) if currentState == nil { return d.nfaFallbackReverse(haystack, start, end) } - // Track last match position (in reverse, this is the START of match) lastMatch := -1 - // Check if start state is already a match (empty match case) if currentState.IsMatch() { lastMatch = end } - // Effective lower bound: max(start, minStart) lowerBound := start if minStart > lowerBound { lowerBound = minStart } - // Scan BACKWARD from end-1 to lowerBound + // Hot loop: flat transition table (Rust approach). 
+ sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + for at := end - 1; at >= lowerBound; at-- { b := haystack[at] - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: + classIdx := int(d.byteToClass(b)) + offset := safeOffset(sid, stride, classIdx) + + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState + } + + switch nextID { + case InvalidState: + currentState = cache.getState(sid) + if currentState == nil { + return d.nfaFallbackReverse(haystack, start, end) + } nextState, err := d.determinize(cache, currentState, b) if err != nil { if isCacheCleared(err) { @@ -1883,35 +2162,33 @@ func (d *DFA) SearchReverseLimited(cache *DFACache, haystack []byte, start, end, if currentState == nil { return d.nfaFallbackReverse(haystack, start, end) } + sid = currentState.id + ft = cache.flatTrans + ftLen = len(ft) at++ // Will be decremented by for-loop continue } return d.nfaFallbackReverse(haystack, start, end) } if nextState == nil { - // Dead state - definitively no match return lastMatch } - currentState = nextState + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) - case nextID == DeadState: - // Dead state - definitively no match + case DeadState: return lastMatch default: - currentState = cache.getState(nextID) - if currentState == nil { - return d.nfaFallbackReverse(haystack, start, end) - } + sid = nextID } - if currentState.IsMatch() { + if cache.IsMatchState(sid) { lastMatch = at } } - // If we stopped at lowerBound > start and the DFA hasn't reached a dead state, - // the search was limited by minStart. Signal potential quadratic behavior. 
if lowerBound > start && lastMatch < 0 { return SearchReverseLimitedQuadratic } @@ -1928,27 +2205,42 @@ func (d *DFA) IsMatchReverse(cache *DFACache, haystack []byte, start, end int) b return false } - // Get start state for reverse search currentState := d.getStartStateForReverse(cache, haystack, end) if currentState == nil { _, _, matched := d.pikevm.Search(haystack[start:end]) return matched } - // Check if start state is already a match if currentState.IsMatch() { return true } - // Scan BACKWARD from end-1 to start with early termination + // Hot loop: flat transition table (Rust approach). + sid := currentState.id + ft := cache.flatTrans + stride := cache.stride + ftLen := len(ft) + for at := end - 1; at >= start; at-- { b := haystack[at] - // Convert byte to equivalence class for transition lookup - classIdx := d.byteToClass(b) - nextID, ok := currentState.Transition(classIdx) - switch { - case !ok: + classIdx := int(d.byteToClass(b)) + offset := safeOffset(sid, stride, classIdx) + + var nextID StateID + if offset < ftLen { + nextID = ft[offset] + } else { + nextID = InvalidState + } + + switch nextID { + case InvalidState: + currentState = cache.getState(sid) + if currentState == nil { + _, _, matched := d.pikevm.Search(haystack[start:end]) + return matched + } nextState, err := d.determinize(cache, currentState, b) if err != nil { if isCacheCleared(err) { @@ -1957,6 +2249,9 @@ func (d *DFA) IsMatchReverse(cache *DFACache, haystack []byte, start, end int) b _, _, matched := d.pikevm.Search(haystack[start:end]) return matched } + sid = currentState.id + ft = cache.flatTrans + ftLen = len(ft) at++ // Will be decremented by for-loop continue } @@ -1966,30 +2261,23 @@ func (d *DFA) IsMatchReverse(cache *DFACache, haystack []byte, start, end int) b if nextState == nil { return false } - currentState = nextState + sid = nextState.id + ft = cache.flatTrans + ftLen = len(ft) - case nextID == DeadState: + case DeadState: return false default: - currentState = 
cache.getState(nextID) - if currentState == nil { - _, _, matched := d.pikevm.Search(haystack[start:end]) - return matched - } + sid = nextID } - // Early termination on any match - if currentState.IsMatch() { + if cache.IsMatchState(sid) { return true } } - // After processing all bytes, check if final state is a match. - // This handles patterns with optional elements at the start (reverse end), - // e.g., pattern "0?0" on input "0" - after processing "0" we're in a state - // where the optional "0?" already matched (zero times). - return currentState.IsMatch() + return cache.IsMatchState(sid) } // getStartStateForReverse returns the appropriate start state for reverse search. diff --git a/literal/extractor.go b/literal/extractor.go index f3d0cad..6b1d974 100644 --- a/literal/extractor.go +++ b/literal/extractor.go @@ -267,10 +267,9 @@ func (e *Extractor) extractPrefixesAlternate(re *syntax.Regexp, depth int) *Seq result := NewSeq(allLits...) if overflowed || result.Len() > e.config.MaxLiterals { - // Either not all branches are represented (overflow) or too many literals. // Trim to 3-byte prefixes + dedup to fit prefilter capacity. // Mark ALL as inexact — prefilter is used for skip-ahead only, - // DFA/NFA verifies each candidate (safe with partial coverage). + // DFA/NFA verifies each candidate. // // Rust does the same: optimize_for_prefix_by_preference trims and deduplicates. // A partial prefilter is much better than no prefilter — DFA with skip-ahead @@ -281,6 +280,13 @@ func (e *Extractor) extractPrefixesAlternate(re *syntax.Regexp, depth int) *Seq if result.Len() > e.config.MaxLiterals { result.literals = result.literals[:e.config.MaxLiterals] } + // Mark partial coverage when overflow truncated branches. + // Prefilter with partial coverage CANNOT be used in candidate loops + // (would miss unrepresented branches). Only safe as skip-ahead + // inside NFA/DFA engine (Rust approach: PikeVM integrates prefilter). 
+ if overflowed { + result.partialCoverage = true + } } return result diff --git a/literal/seq.go b/literal/seq.go index 50e6dd3..a51ecca 100644 --- a/literal/seq.go +++ b/literal/seq.go @@ -91,7 +91,21 @@ func (l Literal) String() string { // ) // fmt.Printf("Sequence has %d literals\n", seq.Len()) // Output: Sequence has 2 literals type Seq struct { - literals []Literal + literals []Literal + partialCoverage bool // True when alternation overflow truncated branches +} + +// IsPartialCoverage returns true if the literal set doesn't cover all +// alternation branches (due to overflow truncation). A partial-coverage +// prefilter cannot be used as a correctness gate in candidate loops — +// it would miss branches whose literals were not extracted. +// Rust avoids this by integrating prefilter as skip-ahead inside PikeVM, +// not as an external candidate loop. +func (s *Seq) IsPartialCoverage() bool { + if s == nil { + return false + } + return s.partialCoverage } // NewSeq creates a new sequence from the given literals. diff --git a/meta/compile.go b/meta/compile.go index 921866f..0ff9384 100644 --- a/meta/compile.go +++ b/meta/compile.go @@ -492,6 +492,11 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { } pikevm := nfa.NewPikeVM(pikevmNFA) + // Set prefilter as skip-ahead inside PikeVM (Rust approach: pikevm.rs:1293). + // When NFA has no active threads, PikeVM skips to next candidate position. + // Safe for partial-coverage prefilters — NFA processes all branches. 
+ configurePikeVMSkipAhead(pikevm, pf, isStartAnchored) + // Build OnePass DFA for anchored patterns with captures (optional optimization) onePassRes := buildOnePassDFA(re, nfaEngine, config) @@ -617,6 +622,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { ahoCorasick: engines.ahoCorasick, anchoredLiteralInfo: anchoredLiteralInfo, prefilter: pf, + prefilterPartialCoverage: literals != nil && literals.IsPartialCoverage(), strategy: strategy, config: config, onepass: onePassRes, @@ -681,6 +687,13 @@ func hasNonLineAnchors(re *syntax.Regexp) bool { return false } +// configurePikeVMSkipAhead sets prefilter as skip-ahead inside PikeVM. +func configurePikeVMSkipAhead(pikevm *nfa.PikeVM, pf prefilter.Prefilter, isStartAnchored bool) { + if pf != nil && !isStartAnchored { + pikevm.SetSkipAhead(pf) + } +} + // buildSearchStateConfig extracts all DFA references needed for per-search caches. // Strategy-specific DFAs come from reverse searchers (which have their own DFAs). 
func buildSearchStateConfig(nfaEngine *nfa.NFA, numCaptures int, engines strategyEngines, strategy Strategy) searchStateConfig { diff --git a/meta/engine.go b/meta/engine.go index 15f5655..d2bcc54 100644 --- a/meta/engine.go +++ b/meta/engine.go @@ -96,6 +96,7 @@ type Engine struct { ahoCorasick *ahocorasick.Automaton // For large literal alternations (>32 patterns) anchoredLiteralInfo *AnchoredLiteralInfo // For ^prefix.*suffix$ patterns (Issue #79) prefilter prefilter.Prefilter + prefilterPartialCoverage bool // True when prefilter doesn't cover all alternation branches strategy Strategy config Config diff --git a/meta/find_indices.go b/meta/find_indices.go index 80ff8c9..4ba5f36 100644 --- a/meta/find_indices.go +++ b/meta/find_indices.go @@ -119,11 +119,16 @@ func (e *Engine) findIndicesNFA(haystack []byte) (int, int, bool) { state := e.getSearchState() defer e.putSearchState(state) - // Use prefilter candidate loop for skip-ahead — but ONLY when prefilter - // covers all possible match positions (IsComplete or all branches represented). - // Incomplete prefilters (partial case-fold coverage) cannot be used as - // correctness gates — they'd miss branches whose literals were truncated. - if e.prefilter != nil && e.prefilter.IsComplete() { + // Use prefilter for candidate skip-ahead if available. + // Prefilter finds PREFIX positions → NFA/BT verifies full match from there. + // Safe for both complete and incomplete prefilters — as long as all + // alternation branches are represented in the literal set. + // + // NOT safe for partial-coverage prefilters (overflow truncated branches): + // candidate loop would miss unrepresented branches entirely. + // Rust avoids this by integrating prefilter inside PikeVM as skip-ahead + // (not as an external correctness gate). See pikevm.rs:1293-1299. 
+ if e.prefilter != nil && !e.prefilterPartialCoverage { at := 0 for at < len(haystack) { // Find next candidate position via prefilter @@ -175,8 +180,8 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) { state := e.getSearchState() defer e.putSearchState(state) - // Use prefilter candidate loop — only safe with complete prefilter - if e.prefilter != nil && e.prefilter.IsComplete() { + // Use prefilter candidate loop — safe unless partial coverage (overflow) + if e.prefilter != nil && !e.prefilterPartialCoverage { for at < len(haystack) { pos := e.prefilter.Find(haystack, at) if pos == -1 { @@ -211,10 +216,10 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) { } // findIndicesDFA searches using DFA with prefilter - zero alloc. -func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { +func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { //nolint:cyclop // DFA with prefilter paths atomic.AddUint64(&e.stats.DFASearches, 1) - // Literal fast path + // Literal fast path — complete prefilter returns match directly if e.prefilter != nil && e.prefilter.IsComplete() { pos := e.prefilter.Find(haystack, 0) if pos == -1 { @@ -228,6 +233,20 @@ func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { return e.pikevm.Search(haystack) } + // Prefilter skip-ahead for DFA — safe even with incomplete prefilter. + // DFA verifies full pattern at candidate position; prefilter just skips. + if e.prefilter != nil && !e.prefilter.IsComplete() { + pos := e.prefilter.Find(haystack, 0) + if pos == -1 { + return -1, -1, false + } + atomic.AddUint64(&e.stats.PrefilterHits, 1) + if e.reverseDFA != nil { + return e.findIndicesBidirectionalDFA(haystack, pos) + } + return e.pikevm.SearchAt(haystack, pos) + } + // Prefilter-accelerated search: find candidate, verify with anchored DFA. // For large NFAs (e.g., 181 states for (?i) patterns), bidirectional DFA // cache-thrashes. 
Anchored verification at candidate position is O(pattern_len). @@ -280,7 +299,8 @@ func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { } // Prefilter with non-greedy: use prefilter for rejection only, PikeVM for match. - if e.prefilter != nil { + // Not safe with partial coverage — would miss unrepresented branches. + if e.prefilter != nil && !e.prefilterPartialCoverage { pos := e.prefilter.Find(haystack, 0) if pos == -1 { return -1, -1, false @@ -308,21 +328,7 @@ func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { func (e *Engine) findIndicesDFAAt(haystack []byte, at int) (int, int, bool) { atomic.AddUint64(&e.stats.DFASearches, 1) - // Literal fast path - if e.prefilter != nil && e.prefilter.IsComplete() { - pos := e.prefilter.Find(haystack, at) - if pos == -1 { - return -1, -1, false - } - atomic.AddUint64(&e.stats.PrefilterHits, 1) - literalLen := e.prefilter.LiteralLen() - if literalLen > 0 { - return pos, pos + literalLen, true - } - return e.pikevm.SearchAt(haystack, at) - } - - // Prefilter skip: use prefix prefilter to jump to candidate position. + // Prefilter skip-ahead — safe for all prefilters, DFA verifies. if e.prefilter != nil { pos := e.prefilter.Find(haystack, at) if pos == -1 { @@ -1028,9 +1034,9 @@ func (e *Engine) findIndicesNFAAtWithState(haystack []byte, at int, state *Searc // BoundedBacktracker can be used for Find operations only when safe useBT := e.boundedBacktracker != nil && !e.canMatchEmpty - // Use prefilter candidate loop — only safe with complete prefilter. - // Incomplete prefilters (partial case-fold coverage) would miss branches. - if e.prefilter != nil && e.prefilter.IsComplete() { + // Use prefilter candidate loop — safe unless partial coverage (overflow). + // Partial-coverage prefilters would miss unrepresented branches. 
+ if e.prefilter != nil && !e.prefilterPartialCoverage { for at < len(haystack) { pos := e.prefilter.Find(haystack, at) if pos == -1 { diff --git a/nfa/pikevm.go b/nfa/pikevm.go index 9fcd7ae..22f6b9e 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -67,8 +67,17 @@ type searchThread struct { // Thread safety: PikeVM configuration (nfa) is immutable after creation. // For thread-safe concurrent usage, use *WithState methods with external PikeVMState. // The legacy methods without state use internal state and are NOT thread-safe. +// SkipAhead is a prefilter interface for PikeVM skip-ahead optimization. +// When NFA has no active threads, Find skips to the next candidate position +// instead of scanning byte-by-byte. This is the Rust approach (pikevm.rs:1293). +// Safe for both complete and partial-coverage prefilters. +type SkipAhead interface { + Find(haystack []byte, start int) int +} + type PikeVM struct { - nfa *NFA + nfa *NFA + skipAhead SkipAhead // Optional prefilter for skip-ahead (nil = disabled) // internalState is used by legacy non-thread-safe methods. // For concurrent usage, use *WithState methods with external PikeVMState. @@ -266,6 +275,13 @@ func (p *PikeVM) initState(state *PikeVMState) { state.SlotTable = NewSlotTable(p.nfa.States(), slotsPerState) } +// SetSkipAhead sets the prefilter for skip-ahead optimization. +// When set, PikeVM uses it to skip positions where no match can start +// (when there are no active NFA threads). Safe for partial-coverage prefilters. +func (p *PikeVM) SetSkipAhead(sa SkipAhead) { + p.skipAhead = sa +} + // NewPikeVMState creates a new mutable state for use with PikeVM. // The state must be initialized by calling PikeVM.InitState before use. // This should be pooled via sync.Pool for concurrent usage. @@ -717,6 +733,15 @@ func (p *PikeVM) searchUnanchoredAt(haystack []byte, startAt int) (int, int, boo // Add new start thread at current position (simulates .*? 
prefix) // Stop adding new starts once we've found a match. if bestStart == -1 && (!isAnchored || pos == startAt) { + // Skip-ahead: when no active threads, use prefilter to jump forward. + // Rust approach (pikevm.rs:1293). Safe for partial-coverage prefilters. + if len(p.internalState.Queue) == 0 && p.skipAhead != nil && pos > startAt { + candidate := p.skipAhead.Find(haystack, pos) + if candidate == -1 { + break + } + pos = candidate + } p.internalState.Visited.Clear() p.addThread(thread{state: p.nfa.StartAnchored(), startPos: pos}, haystack, pos) } @@ -1656,6 +1681,14 @@ func (p *PikeVM) searchWithSlotTableUnanchored(haystack []byte, startAt int) (in for pos := startAt; pos <= len(haystack); pos++ { if bestStart == -1 && (!isAnchored || pos == 0) { + // Skip-ahead (Rust pikevm.rs:1293) + if len(p.internalState.SearchQueue) == 0 && p.skipAhead != nil && pos > startAt { + candidate := p.skipAhead.Find(haystack, pos) + if candidate == -1 { + break + } + pos = candidate + } p.internalState.Visited.Clear() p.addSearchThread(searchThread{state: p.nfa.StartAnchored(), startPos: pos}, haystack, pos) }