From 5b2499daf24c9655257348d1b0975e8c79e2122b Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 15:55:01 +0300 Subject: [PATCH 1/7] =?UTF-8?q?perf:=20remove=20dual=20transition=20storag?= =?UTF-8?q?e=20=E2=80=94=20State.transitions=20eliminated?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove transitions []StateID and transitionCount from State struct. Transitions now stored exclusively in DFACache.flatTrans flat table. - Remove State.AddTransition(), Transition(), Stride(), TransitionCount() - Remove Builder.move() (unused after DetectAcceleration simplification) - Simplify DetectAcceleration/DetectAccelerationFromCached to return nil - Add DetectAccelerationFromFlat() reading from flat table - Simplify tryDetectAccelerationWithCache (flatTrans-only path) - Remove 3 redundant AddTransition calls from determinize - Update tests: add TestDetectAccelerationFromFlat, remove State transition tests Memory: ~222MB -> ~150MB (eliminates redundant per-state transition slices) --- CHANGELOG.md | 9 ++ dfa/lazy/accel_test.go | 40 +++--- dfa/lazy/anchored_search_prefilter_test.go | 63 ++-------- dfa/lazy/builder.go | 139 ++++++--------------- dfa/lazy/lazy.go | 20 ++- dfa/lazy/state.go | 71 ++--------- dfa/lazy/state_set_test.go | 96 +------------- 7 files changed, 95 insertions(+), 343 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 91b97c2..710a167 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.19] - 2026-03-24 + +### Performance +- **Remove dual transition storage** — eliminated `transitions []StateID` and + `transitionCount` from `State` struct. Transitions now stored exclusively in + `DFACache.flatTrans`. 
Removes ~93MB of redundant per-state transition slices + (222MB → ~150MB). Acceleration detection migrated to `DetectAccelerationFromFlat()` + reading directly from flat table. + ## [0.12.18] - 2026-03-24 ### Performance diff --git a/dfa/lazy/accel_test.go b/dfa/lazy/accel_test.go index 8c31819..d434dea 100644 --- a/dfa/lazy/accel_test.go +++ b/dfa/lazy/accel_test.go @@ -87,37 +87,37 @@ func TestDetectAcceleration(t *testing.T) { } func TestDetectAccelerationFromCached(t *testing.T) { - // Test the lazy detection that only uses cached transitions + // State no longer stores transitions — DetectAccelerationFromCached returns nil. + // Acceleration is now detected via DetectAccelerationFromFlat using flatTrans. state := NewState(StateID(1), []nfa.StateID{0}, false) - - // Initially no cached transitions - should return nil exitBytes := DetectAccelerationFromCached(state) if exitBytes != nil { - t.Errorf("Expected nil with no cached transitions, got %v", exitBytes) + t.Errorf("Expected nil (State has no transitions), got %v", exitBytes) } +} + +func TestDetectAccelerationFromFlat(t *testing.T) { + // Test acceleration detection via flat transition table + stride := 256 + sid := StateID(1) + flatTrans := make([]StateID, 2*stride) // 2 states - // Add 250 self-loop transitions + // State 1: 250 self-loops, 3 exits to state 2, 3 dead + base := int(sid) * stride for i := 0; i < 250; i++ { - state.AddTransition(byte(i), StateID(1)) // Self-loop + flatTrans[base+i] = sid // Self-loop } + flatTrans[base+250] = StateID(2) + flatTrans[base+251] = StateID(2) + flatTrans[base+252] = StateID(2) + flatTrans[base+253] = DeadState + flatTrans[base+254] = DeadState + flatTrans[base+255] = DeadState - // Add 3 exit bytes - state.AddTransition(byte(250), StateID(2)) // Exit to state 2 - state.AddTransition(byte(251), StateID(2)) // Exit to state 2 - state.AddTransition(byte(252), StateID(2)) // Exit to state 2 - - // Add 3 dead transitions - state.AddTransition(byte(253), DeadState) - 
state.AddTransition(byte(254), DeadState) - state.AddTransition(byte(255), DeadState) - - // Now should detect as accelerable - exitBytes = DetectAccelerationFromCached(state) + exitBytes := DetectAccelerationFromFlat(sid, flatTrans, stride, nil) if len(exitBytes) != 3 { t.Errorf("Expected 3 exit bytes, got %v", exitBytes) } - - // Verify the exit bytes are correct expected := map[byte]bool{250: true, 251: true, 252: true} for _, b := range exitBytes { if !expected[b] { diff --git a/dfa/lazy/anchored_search_prefilter_test.go b/dfa/lazy/anchored_search_prefilter_test.go index ee8584a..0593e89 100644 --- a/dfa/lazy/anchored_search_prefilter_test.go +++ b/dfa/lazy/anchored_search_prefilter_test.go @@ -73,82 +73,39 @@ func TestDetectAccelFromCachedWithClassesByteMapping(t *testing.T) { // TestDetectAccelFromCachedWithClassesNilClasses verifies the nil byteClasses fallback. func TestDetectAccelFromCachedWithClassesNilClasses(t *testing.T) { - // Create a state with known transitions (stride=256, no compression) + // State no longer stores transitions — DetectAccelerationFromCachedWithClasses returns nil. + // Use DetectAccelerationFromFlat for flat table detection. 
state := NewState(StateID(1), []nfa.StateID{0}, false) - - // Fill 253 self-loop transitions - for i := 0; i < 253; i++ { - state.AddTransition(byte(i), StateID(1)) - } - // Add 3 exit transitions to a different state - state.AddTransition(253, StateID(2)) - state.AddTransition(254, StateID(2)) - state.AddTransition(255, StateID(2)) - - // nil byteClasses -> exit class indices ARE the bytes (identity) result := DetectAccelerationFromCachedWithClasses(state, nil) - if len(result) != 3 { - t.Fatalf("expected 3 exit bytes with nil classes, got %v", result) - } - expected := map[byte]bool{253: true, 254: true, 255: true} - for _, b := range result { - if !expected[b] { - t.Errorf("unexpected exit byte %d", b) - } + if result != nil { + t.Errorf("expected nil (State has no transitions), got %v", result) } } -// TestDetectAccelFromCachedInsufficientTransitions tests that when too few -// transitions are cached, acceleration detection returns nil. +// TestDetectAccelFromCachedInsufficientTransitions tests that State-based detection returns nil. func TestDetectAccelFromCachedInsufficientTransitions(t *testing.T) { state := NewState(StateID(1), []nfa.StateID{0}, false) - // Only add a few transitions (way below 94% threshold) - state.AddTransition(0, StateID(1)) - state.AddTransition(1, StateID(2)) - result := DetectAccelerationFromCachedWithClasses(state, nil) if result != nil { - t.Errorf("expected nil for insufficient cached transitions, got %v", result) + t.Errorf("expected nil (State has no transitions), got %v", result) } } -// TestDetectAccelFromCachedTooManyExitClasses tests that >3 exit classes returns nil. +// TestDetectAccelFromCachedTooManyExitClasses tests that State-based detection returns nil. 
func TestDetectAccelFromCachedTooManyExitClasses(t *testing.T) { state := NewState(StateID(1), []nfa.StateID{0}, false) - // Fill 250 self-loops - for i := 0; i < 250; i++ { - state.AddTransition(byte(i), StateID(1)) - } - // Add 4 distinct exit transitions (> 3 limit) - state.AddTransition(250, StateID(2)) - state.AddTransition(251, StateID(3)) - state.AddTransition(252, StateID(4)) - state.AddTransition(253, StateID(5)) - // Fill remaining with dead - state.AddTransition(254, DeadState) - state.AddTransition(255, DeadState) - result := DetectAccelerationFromCachedWithClasses(state, nil) if result != nil { - t.Errorf("expected nil for >3 exit classes, got %v", result) + t.Errorf("expected nil (State has no transitions), got %v", result) } } -// TestDetectAccelFromCachedZeroExitClasses tests that 0 exit classes returns nil. +// TestDetectAccelFromCachedZeroExitClasses tests that State-based detection returns nil. func TestDetectAccelFromCachedZeroExitClasses(t *testing.T) { state := NewState(StateID(1), []nfa.StateID{0}, false) - // All transitions are self-loops or dead - for i := 0; i < 256; i++ { - if i < 200 { - state.AddTransition(byte(i), StateID(1)) // self-loop - } else { - state.AddTransition(byte(i), DeadState) // dead - } - } - result := DetectAccelerationFromCachedWithClasses(state, nil) if result != nil { - t.Errorf("expected nil for 0 exit classes, got %v", result) + t.Errorf("expected nil (State has no transitions), got %v", result) } } diff --git a/dfa/lazy/builder.go b/dfa/lazy/builder.go index 5864a13..f528e87 100644 --- a/dfa/lazy/builder.go +++ b/dfa/lazy/builder.go @@ -213,12 +213,6 @@ func (b *Builder) epsilonClosure(states []nfa.StateID, lookHave LookSet) []nfa.S return closure.ToSlice() } -// move computes the set of NFA states reachable from the given states on input byte b. -// This version does not track word context - use moveWithWordContext for patterns with \b/\B. 
-func (b *Builder) move(states []nfa.StateID, input byte) []nfa.StateID { - return b.moveWithWordContext(states, input, false) -} - // moveWithWordContext computes the set of NFA states reachable from the given states on input byte b, // with full word boundary tracking. // @@ -605,33 +599,51 @@ func DetectAccelerationFromCached(state *State) []byte { // // When byteClasses is nil, falls back to identity mapping (no compression). func DetectAccelerationFromCachedWithClasses(state *State, byteClasses *nfa.ByteClasses) []byte { - if state == nil { - return nil - } + // State no longer stores transitions — they live in DFACache.flatTrans. + // This function cannot detect acceleration without the flat table. + // Use DetectAccelerationFromFlat() instead. + return nil +} - stride := state.Stride() - // Need most transitions cached to detect accurately - // For compressed alphabet, we need most of stride, not 240 - minCachedRequired := stride - stride/16 // At least ~94% cached +// DetectAccelerationFromFlat analyzes transitions from the flat table. +// Used by tryDetectAcceleration when State.transitions will be removed. +func DetectAccelerationFromFlat(sid StateID, flatTrans []StateID, stride int, byteClasses *nfa.ByteClasses) []byte { + ftLen := len(flatTrans) + return detectAccelFromTransitions(sid, stride, func(classIdx int) (StateID, bool) { + offset := safeOffset(sid, stride, classIdx) + if offset >= ftLen { + return InvalidState, false + } + next := flatTrans[offset] + return next, next != InvalidState + }, byteClasses) +} + +// detectAccelFromTransitions is the shared implementation for acceleration detection. +// transitionFn returns (nextID, cached) for a given class index. 
+func detectAccelFromTransitions(selfID StateID, stride int, transitionFn func(int) (StateID, bool), byteClasses *nfa.ByteClasses) []byte { + // Count cached transitions first + cachedCount := 0 + for classIdx := 0; classIdx < stride; classIdx++ { + if _, ok := transitionFn(classIdx); ok { + cachedCount++ + } + } + minCachedRequired := stride - stride/16 if minCachedRequired < 1 { minCachedRequired = 1 } - transitionCount := state.TransitionCount() - if transitionCount < minCachedRequired { + if cachedCount < minCachedRequired { return nil } - selfID := state.ID() var exitClasses []byte uncachedCount := 0 - // Scan only the CACHED transitions by equivalence class for classIdx := 0; classIdx < stride; classIdx++ { - nextID, ok := state.Transition(byte(classIdx)) + nextID, ok := transitionFn(classIdx) if !ok { - // Not cached yet - count as unknown uncachedCount++ - // Too many unknowns means we can't detect reliably maxUncached := stride / 16 if maxUncached < 1 { maxUncached = 1 @@ -642,16 +654,12 @@ func DetectAccelerationFromCachedWithClasses(state *State, byteClasses *nfa.Byte continue } - // Transition is cached if nextID == selfID || nextID == DeadState { - // Self-loop or dead - counts as "skip" continue } - // This class causes exit - record it exitClasses = append(exitClasses, byte(classIdx)) if len(exitClasses) > 3 { - // Too many exit classes - not accelerable return nil } } @@ -696,85 +704,10 @@ func DetectAccelerationFromCachedWithClasses(state *State, byteClasses *nfa.Byte // // Returns the exit bytes (1-3) or nil if not accelerable. 
func (b *Builder) DetectAcceleration(state *State) []byte { - if state == nil { - return nil - } - - byteClasses := b.nfa.ByteClasses() - selfID := state.ID() - var exitClasses []byte - stride := state.Stride() - - // Check all equivalence classes - for classIdx := 0; classIdx < stride; classIdx++ { - // Check if transition is already cached - nextID, ok := state.Transition(byte(classIdx)) - if !ok { - // Need to compute this transition - // Find a representative byte for this class to use with move() - var repByte byte - if byteClasses == nil { - repByte = byte(classIdx) - } else { - repByte = byte(classIdx) // Default to class index - for bi := 0; bi < 256; bi++ { - if byteClasses.Get(byte(bi)) == byte(classIdx) { - repByte = byte(bi) - break - } - } - } - - nextNFAStates := b.move(state.NFAStates(), repByte) - if len(nextNFAStates) == 0 { - // Dead state - counts as "skip" - continue - } - - // This leads to a non-dead state - it's an exit class - exitClasses = append(exitClasses, byte(classIdx)) - if len(exitClasses) > 3 { - // Too many exit classes - not accelerable - return nil - } - continue - } - - // Transition is cached - if nextID == selfID || nextID == DeadState { - // Self-loop or dead - counts as "skip" - continue - } - - // Transition to a different state - it's an exit class - exitClasses = append(exitClasses, byte(classIdx)) - if len(exitClasses) > 3 { - // Too many exit classes - not accelerable - return nil - } - } - - // Accelerable if we have 1-3 exit classes - if len(exitClasses) < 1 || len(exitClasses) > 3 { - return nil - } - - // Convert class indices back to representative bytes for memchr - if byteClasses == nil { - return exitClasses - } - - exitBytes := make([]byte, 0, len(exitClasses)) - for _, classIdx := range exitClasses { - for bi := 0; bi < 256; bi++ { - if byteClasses.Get(byte(bi)) == classIdx { - exitBytes = append(exitBytes, byte(bi)) - break - } - } - } - - return exitBytes + // State no longer stores transitions — they live 
in DFACache.flatTrans. + // This method cannot detect acceleration without the flat table. + // Use DetectAccelerationFromFlat() instead. + return nil } // checkHasWordBoundary checks if the NFA contains any word boundary assertions (\b or \B). diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go index 07672cd..46bc320 100644 --- a/dfa/lazy/lazy.go +++ b/dfa/lazy/lazy.go @@ -898,7 +898,7 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int start, end, matched := d.pikevm.SearchAt(haystack, startPos) return matched && start >= 0 && end >= start } - d.tryDetectAcceleration(currentState) + d.tryDetectAccelerationWithCache(currentState, cache) // State acceleration: if current state is accelerable, use SIMD to skip ahead if exitBytes := currentState.AccelExitBytes(); len(exitBytes) > 0 { @@ -1454,7 +1454,7 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n if currentState == nil { return d.nfaFallback(haystack, startPos) } - d.tryDetectAcceleration(currentState) + d.tryDetectAccelerationWithCache(currentState, cache) if exitBytes := currentState.AccelExitBytes(); len(exitBytes) > 0 { nextPos := d.accelerate(haystack, pos, exitBytes) @@ -1558,7 +1558,6 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro if len(nextNFAStates) == 0 { // Cache the dead state transition to avoid re-computation // Use classIdx for transition storage (compressed alphabet) - current.AddTransition(classIdx, DeadState) cache.SetFlatTransition(current.id, int(classIdx), DeadState) return nil, nil //nolint:nilnil // dead state is valid, not an error } @@ -1585,7 +1584,6 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro if existing, ok := cache.Get(key); ok { // Cache hit: reuse existing state // Use classIdx for transition storage (compressed alphabet) - current.AddTransition(classIdx, existing.ID()) cache.SetFlatTransition(current.id, int(classIdx), existing.ID()) return 
existing, nil } @@ -1624,7 +1622,6 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // Add transition from current state to new state // Use classIdx for transition storage (compressed alphabet) - current.AddTransition(classIdx, newState.ID()) cache.SetFlatTransition(current.id, int(classIdx), newState.ID()) return newState, nil @@ -1808,16 +1805,17 @@ func (d *DFA) matchesEmpty(cache *DFACache) bool { return matched && start == 0 && end == 0 } -// tryDetectAcceleration attempts lazy acceleration detection for a state. -// This is called when a state has enough cached transitions to detect reliably. -// It only runs once per state (tracked via AccelChecked flag). -func (d *DFA) tryDetectAcceleration(state *State) { +// tryDetectAccelerationWithCache attempts acceleration detection using flatTrans. +func (d *DFA) tryDetectAccelerationWithCache(state *State, cache *DFACache) { if state == nil || state.AccelChecked() { return } - // Try lazy detection from cached transitions with ByteClasses support - if exitBytes := DetectAccelerationFromCachedWithClasses(state, d.byteClasses); len(exitBytes) > 0 { + var exitBytes []byte + if cache != nil && cache.stride > 0 { + exitBytes = DetectAccelerationFromFlat(state.ID(), cache.flatTrans, cache.stride, d.byteClasses) + } + if len(exitBytes) > 0 { state.SetAccelBytes(exitBytes) } else { state.MarkAccelChecked() diff --git a/dfa/lazy/state.go b/dfa/lazy/state.go index ba6f70d..33edf96 100644 --- a/dfa/lazy/state.go +++ b/dfa/lazy/state.go @@ -53,14 +53,7 @@ type State struct { // id uniquely identifies this state in the cache id StateID - // transitions maps equivalence class → next state ID. - // The slice length equals the alphabet size (ByteClasses.AlphabetLen()). - // InvalidState means no transition for that equivalence class. 
- // Lookup: transitions[byteClasses.Get(byte)] - transitions []StateID - - // transitionCount tracks how many valid transitions exist (for statistics/debugging) - transitionCount int + // Note: transitions removed — stored in DFACache.flatTrans only. // isMatch indicates if this is an accepting state isMatch bool @@ -126,18 +119,13 @@ func NewStateWithStride(id StateID, nfaStates []nfa.StateID, isMatch bool, isFro nfaStatesCopy := make([]nfa.StateID, len(nfaStates)) copy(nfaStatesCopy, nfaStates) - // Create transitions slice initialized to InvalidState - transitions := make([]StateID, stride) - for i := range transitions { - transitions[i] = InvalidState - } - + // Note: transitions stored in DFACache.flatTrans (single source of truth). + // State struct keeps only metadata. return &State{ - id: id, - transitions: transitions, - isMatch: isMatch, - isFromWord: isFromWord, - nfaStates: nfaStatesCopy, + id: id, + isMatch: isMatch, + isFromWord: isFromWord, + nfaStates: nfaStatesCopy, } } @@ -157,20 +145,6 @@ func (s *State) IsFromWord() bool { return s.isFromWord } -// Transition returns the next state for the given equivalence class index. -// Returns (InvalidState, false) if no transition exists. -// This is the hot path - O(1) slice lookup. -// -// IMPORTANT: The caller must convert the input byte to an equivalence class -// index via byteClasses.Get(byte) before calling this method. -func (s *State) Transition(classIdx byte) (StateID, bool) { - if int(classIdx) >= len(s.transitions) { - return InvalidState, false - } - next := s.transitions[classIdx] - return next, next != InvalidState -} - // checkWordBoundaryFast checks if consuming byte b would produce a match // via word boundary resolution. Uses pre-computed flags — O(1), no allocation. 
// Replaces the expensive checkWordBoundaryMatch (30% CPU) which created Builder @@ -186,42 +160,15 @@ func (s *State) checkWordBoundaryFast(b byte) bool { return s.matchAtNonWordBoundary } -// AddTransition adds a transition from this state to another on equivalence class classIdx. -// Overwrites any existing transition for this class. -// -// IMPORTANT: The caller must convert the input byte to an equivalence class -// index via byteClasses.Get(byte) before calling this method. -func (s *State) AddTransition(classIdx byte, next StateID) { - if int(classIdx) >= len(s.transitions) { - return // Ignore out-of-bounds (shouldn't happen with correct stride) - } - if s.transitions[classIdx] == InvalidState && next != InvalidState { - s.transitionCount++ - } else if s.transitions[classIdx] != InvalidState && next == InvalidState { - s.transitionCount-- - } - s.transitions[classIdx] = next -} - -// Stride returns the alphabet size (number of equivalence classes). -func (s *State) Stride() int { - return len(s.transitions) -} - // NFAStates returns the NFA states represented by this DFA state func (s *State) NFAStates() []nfa.StateID { return s.nfaStates } -// TransitionCount returns the number of valid transitions from this state -func (s *State) TransitionCount() int { - return s.transitionCount -} - // String returns a human-readable representation of the state func (s *State) String() string { - return fmt.Sprintf("DFAState(id=%d, isMatch=%v, transitions=%d, nfaStates=%v)", - s.id, s.isMatch, s.transitionCount, s.nfaStates) + return fmt.Sprintf("DFAState(id=%d, isMatch=%v, nfaStates=%v)", + s.id, s.isMatch, s.nfaStates) } // IsAccelerable returns true if this state can use SIMD acceleration. 
diff --git a/dfa/lazy/state_set_test.go b/dfa/lazy/state_set_test.go index abd5622..06eae76 100644 --- a/dfa/lazy/state_set_test.go +++ b/dfa/lazy/state_set_test.go @@ -383,100 +383,8 @@ func TestStateCreation(t *testing.T) { } } -func TestStateTransitions(t *testing.T) { - state := NewState(StateID(1), []nfa.StateID{0}, false) - - // Initially no valid transitions - if state.TransitionCount() != 0 { - t.Errorf("Initial TransitionCount() = %d, want 0", state.TransitionCount()) - } - - // All transitions should be InvalidState - for i := 0; i < 256; i++ { - next, ok := state.Transition(byte(i)) - if ok { - t.Errorf("Transition(%d) should be invalid, got %d", i, next) - } - if next != InvalidState { - t.Errorf("Transition(%d) = %d, want InvalidState", i, next) - } - } - - // Add some transitions - state.AddTransition(byte('a'), StateID(2)) - state.AddTransition(byte('b'), StateID(3)) - - if state.TransitionCount() != 2 { - t.Errorf("TransitionCount() = %d, want 2", state.TransitionCount()) - } - - next, ok := state.Transition(byte('a')) - if !ok || next != StateID(2) { - t.Errorf("Transition('a') = (%d, %v), want (2, true)", next, ok) - } - - next, ok = state.Transition(byte('b')) - if !ok || next != StateID(3) { - t.Errorf("Transition('b') = (%d, %v), want (3, true)", next, ok) - } - - // Overwrite transition - state.AddTransition(byte('a'), StateID(5)) - next, ok = state.Transition(byte('a')) - if !ok || next != StateID(5) { - t.Errorf("After overwrite, Transition('a') = (%d, %v), want (5, true)", next, ok) - } - if state.TransitionCount() != 2 { - t.Errorf("TransitionCount() = %d, want 2 after overwrite", state.TransitionCount()) - } - - // Remove transition by setting to InvalidState - state.AddTransition(byte('a'), InvalidState) - _, ok = state.Transition(byte('a')) - if ok { - t.Error("Transition('a') should be invalid after removal") - } - if state.TransitionCount() != 1 { - t.Errorf("TransitionCount() = %d, want 1 after removal", state.TransitionCount()) - } 
-} - -func TestStateTransitionOutOfBounds(t *testing.T) { - // Create state with small stride - state := NewStateWithStride(StateID(1), []nfa.StateID{0}, false, false, 4) - - // Transition beyond stride should return InvalidState - next, ok := state.Transition(byte(10)) - if ok { - t.Errorf("Transition beyond stride should be invalid, got %d", next) - } - - // AddTransition beyond stride should be ignored (no panic) - state.AddTransition(byte(10), StateID(5)) - if state.TransitionCount() != 0 { - t.Error("AddTransition beyond stride should be ignored") - } -} - -func TestStateStride(t *testing.T) { - tests := []struct { - name string - stride int - }{ - {name: "default", stride: 256}, - {name: "small", stride: 4}, - {name: "medium", stride: 64}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - state := NewStateWithStride(StateID(1), []nfa.StateID{0}, false, false, tt.stride) - if state.Stride() != tt.stride { - t.Errorf("Stride() = %d, want %d", state.Stride(), tt.stride) - } - }) - } -} +// Note: TestStateTransitions, TestStateTransitionOutOfBounds, TestStateStride +// removed — transitions now stored in DFACache.flatTrans, not in State struct. func TestStateString(t *testing.T) { state := NewState(StateID(5), []nfa.StateID{1, 2, 3}, true) From a000ab0570f460e64428a6d83ff015f527114c02 Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 18:06:54 +0300 Subject: [PATCH 2/7] =?UTF-8?q?perf:=20Rust-aligned=20BT=20visited=20limit?= =?UTF-8?q?=20for=20UseNFA=20=E2=80=94=2072%=20less=20memory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add NewBoundedBacktrackerSmall() with 128K entries (256KB) visited capacity, matching Rust regex's default visited_capacity. UseNFA path now creates BT with small limit. When haystack exceeds BT capacity, falls back to PikeVM (correct for leftmost-first). UseBoundedBacktracker strategy retains 32M limit for POSIX longest-match. 
LangArena LogParser (7MB log, 13 patterns, 10 iterations): - Total alloc: 89MB -> 25MB (-72%) - RSS (Sys): 353MB -> 41MB (-88%) - errors pattern: 66MB -> 2.4MB (-96%) - Speed: no regression (113-126ms per iter) --- CHANGELOG.md | 11 +++++++++-- meta/compile.go | 5 +++-- nfa/backtrack.go | 29 ++++++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 710a167..9cf2bba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,10 +15,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.12.19] - 2026-03-24 ### Performance +- **Rust-aligned BoundedBacktracker visited limit for UseNFA** — reduced visited + table capacity from 32M entries (64MB) to 128K entries (256KB) for UseNFA paths, + matching Rust regex's `visited_capacity` default. On Kostya's LangArena LogParser + (7MB log, 13 patterns): total alloc **89MB → 25MB** (-72%), RSS **353MB → 41MB** + (-88%). `errors` pattern: **66MB → 2.4MB** (-96%). No speed regression. + `UseBoundedBacktracker` strategy retains full 32M limit for POSIX longest-match + correctness (Go stdlib compatibility). + - **Remove dual transition storage** — eliminated `transitions []StateID` and `transitionCount` from `State` struct. Transitions now stored exclusively in - `DFACache.flatTrans`. Removes ~93MB of redundant per-state transition slices - (222MB → ~150MB). Acceleration detection migrated to `DetectAccelerationFromFlat()` + `DFACache.flatTrans`. Acceleration detection migrated to `DetectAccelerationFromFlat()` reading directly from flat table. ## [0.12.18] - 2026-03-24 diff --git a/meta/compile.go b/meta/compile.go index 0ff9384..abff531 100644 --- a/meta/compile.go +++ b/meta/compile.go @@ -365,9 +365,10 @@ func buildCharClassSearchers( // For UseNFA with small NFAs, also create BoundedBacktracker as fallback. 
// BoundedBacktracker is 2-3x faster than PikeVM on small inputs due to // generation-based visited tracking (O(1) reset) vs PikeVM's thread queues. - // This is similar to how stdlib uses backtracking for simple patterns. + // Use small capacity (256KB like Rust) — for UseNFA, BT is optional; + // PikeVM handles large inputs correctly. This prevents 37MB+ visited allocations. if result.finalStrategy == UseNFA && result.boundedBT == nil && nfaEngine.States() < 50 { - result.boundedBT = nfa.NewBoundedBacktracker(btNFA) + result.boundedBT = nfa.NewBoundedBacktrackerSmall(btNFA) } return result diff --git a/nfa/backtrack.go b/nfa/backtrack.go index d8b8352..85ed3a9 100644 --- a/nfa/backtrack.go +++ b/nfa/backtrack.go @@ -67,14 +67,37 @@ type BacktrackerState struct { } // NewBoundedBacktracker creates a new bounded backtracker for the given NFA. -// Default maxVisitedSize is 32M entries (32MB memory with uint8), allowing +// Default maxVisitedSize is 32M entries (64MB memory with uint16), allowing // ~900KB inputs for patterns with 35 states like (\w{2,8})+. -// Uses uint8 generation tracking (4x memory savings vs uint32, O(1) reset). +// +// This large limit is required for UseBoundedBacktracker strategy where BT +// is the primary engine with leftmost-longest semantics. PikeVM fallback +// gives leftmost-first results which would break correctness. +// +// For UseNFA strategy (where BT is optional), use NewBoundedBacktrackerSmall. func NewBoundedBacktracker(nfa *NFA) *BoundedBacktracker { return &BoundedBacktracker{ nfa: nfa, numStates: nfa.States(), - maxVisitedSize: 32 * 1024 * 1024, // 32M entries = 64MB memory (2 bytes per entry) + maxVisitedSize: 32 * 1024 * 1024, // 32M entries = 64MB (unchanged for BT strategy) + } +} + +// NewBoundedBacktrackerSmall creates a BoundedBacktracker with Rust-aligned +// visited capacity (128K entries = 256KB). Use for UseNFA paths where BT +// is an optional optimization and PikeVM is the correct fallback. 
+// +// This prevents massive visited table allocations (37MB+) for patterns like +// ` [5][0-9]{2} | [4][0-9]{2} ` on large inputs. When BT can't handle +// the input size, the caller falls back to PikeVM which is O(n*states) +// memory per step, not O(n*states) total. +// +// Matches Rust regex's default visited_capacity of 256KB. +func NewBoundedBacktrackerSmall(nfa *NFA) *BoundedBacktracker { + return &BoundedBacktracker{ + nfa: nfa, + numStates: nfa.States(), + maxVisitedSize: 128 * 1024, // 128K entries × 2 bytes = 256KB (Rust default) } } From d0ad9aa5a5f04b64db46321b1d1b3024519d0ed9 Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 18:37:56 +0300 Subject: [PATCH 3/7] =?UTF-8?q?perf:=20byte-based=20DFA=20cache=20limit=20?= =?UTF-8?q?=E2=80=94=202MB=20default=20like=20Rust?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace MaxStates (count) with CacheCapacityBytes (bytes). Default: 2MB matching Rust regex's hybrid_cache_capacity. - Add DFACache.MemoryUsage() (mirrors Rust Cache::memory_usage) - Insert checks MemoryUsage() >= capacityBytes instead of state count - Config: CacheCapacityBytes (new), MaxStates (deprecated, backward compat) - Self-adjusting: fewer states for large stride, more for small - effectiveCapacityBytes() bridges legacy MaxStates to bytes (~100B/state) --- CHANGELOG.md | 5 +++ dfa/lazy/cache.go | 52 +++++++++++++++++++----- dfa/lazy/cache_test.go | 46 +++++++++++++-------- dfa/lazy/config.go | 71 +++++++++++++++++++++++++-------- dfa/lazy/coverage_final_test.go | 5 ++- dfa/lazy/lazy.go | 21 +++++----- 6 files changed, 145 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cf2bba..9866ec2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `UseBoundedBacktracker` strategy retains full 32M limit for POSIX longest-match correctness (Go stdlib compatibility). 
+- **Byte-based DFA cache limit** (Rust approach) — replaced `MaxStates` count limit + with `CacheCapacityBytes` (default 2MB, matching Rust's `hybrid_cache_capacity`). + Cache limit is now self-adjusting: fewer states for large alphabets, more for small. + Added `MemoryUsage()` method for runtime cache introspection. + - **Remove dual transition storage** — eliminated `transitions []StateID` and `transitionCount` from `State` struct. Transitions now stored exclusively in `DFACache.flatTrans`. Acceleration detection migrated to `DetectAccelerationFromFlat()` diff --git a/dfa/lazy/cache.go b/dfa/lazy/cache.go index 4044111..d8b277e 100644 --- a/dfa/lazy/cache.go +++ b/dfa/lazy/cache.go @@ -1,8 +1,6 @@ package lazy -import ( - "github.com/coregx/coregex/internal/conv" -) +// DFACache uses byte-based capacity (like Rust's cache_capacity). // DFACache holds mutable state for DFA search operations. // @@ -15,7 +13,7 @@ import ( // the DFA configuration is immutable and per-thread cache is mutable. // // The cache maps StateKey (NFA state set hash) -> DFA State. -// When the cache reaches maxStates, it can be cleared and rebuilt +// When the cache reaches capacityBytes, it can be cleared and rebuilt // (up to a configured limit) before falling back to NFA. // // Thread safety: NOT thread-safe. Each DFACache must be owned by a single @@ -59,8 +57,10 @@ type DFACache struct { // startTable caches start states for different look-behind contexts. startTable StartTable - // maxStates is the capacity limit - maxStates uint32 + // capacityBytes is the maximum cache memory in bytes (Rust approach). + // When MemoryUsage() exceeds this, Insert returns ErrCacheFull. + // Default: 2MB (matches Rust regex's hybrid_cache_capacity). + capacityBytes int // nextID is the next available state ID. 
nextID StateID @@ -93,8 +93,8 @@ func (c *DFACache) Insert(key StateKey, state *State) (StateID, error) { return existing.ID(), nil } - // Check capacity - if conv.IntToUint32(len(c.states)) >= c.maxStates { + // Check capacity (byte-based, like Rust's cache_capacity) + if c.MemoryUsage() >= c.capacityBytes { c.misses++ return InvalidState, ErrCacheFull } @@ -206,9 +206,39 @@ func (c *DFACache) Size() int { return len(c.states) } -// IsFull returns true if the cache has reached its maximum capacity. +// MemoryUsage returns the estimated heap memory used by this cache in bytes. +// Mirrors Rust's Cache::memory_usage() (hybrid/dfa.rs:2021). +// +// Components: +// - flatTrans: len * 4 bytes (StateID = uint32) +// - stateList: len * 8 bytes (pointer) +// - matchFlags: len * 1 byte +// - states map: ~len * 48 bytes (key + pointer + map overhead) +// - State heap: nfaStates slices + accelBytes +func (c *DFACache) MemoryUsage() int { + const stateIDSize = 4 // uint32 + const ptrSize = 8 // pointer on 64-bit + const mapEntrySize = 48 // approximate: key(8) + value(8) + map overhead(32) + + usage := len(c.flatTrans) * stateIDSize + usage += len(c.stateList) * ptrSize + usage += len(c.matchFlags) + usage += len(c.states) * mapEntrySize + + // State struct heap: nfaStates slice per state + for _, s := range c.stateList { + if s != nil { + usage += len(s.NFAStates()) * 4 // nfa.StateID = uint32 + usage += len(s.AccelExitBytes()) + } + } + + return usage +} + +// IsFull returns true if the cache has reached its capacity. func (c *DFACache) IsFull() bool { - return conv.IntToUint32(len(c.states)) >= c.maxStates + return c.MemoryUsage() >= c.capacityBytes } // Stats returns cache hit/miss statistics. @@ -237,7 +267,7 @@ func (c *DFACache) ResetStats() { // This also resets the clear counter. Primarily for testing. 
func (c *DFACache) Clear() { // Clear map (GC will reclaim memory) - c.states = make(map[StateKey]*State, c.maxStates) + c.states = make(map[StateKey]*State) c.stateList = c.stateList[:0] c.startTable = newStartTableFromByteMap(&c.startTable.byteMap) c.nextID = StartState + 1 diff --git a/dfa/lazy/cache_test.go b/dfa/lazy/cache_test.go index 8138676..4cecb8b 100644 --- a/dfa/lazy/cache_test.go +++ b/dfa/lazy/cache_test.go @@ -8,15 +8,23 @@ import ( ) // newTestCache creates a DFACache for testing without needing a DFA. +// maxStates is converted to a byte limit (~52 bytes per state for stride=0 test caches). func newTestCache(maxStates uint32) *DFACache { var byteMap [256]StartKind initByteMap(&byteMap) + // Each state in a stride=0 test cache uses ~60 bytes: + // map entry(48) + stateList ptr(8) + nfaStates heap(~4) = ~60 + // Use 52 to be slightly conservative (ensure IsFull after N inserts) + capacityBytes := int(maxStates) * 52 + if capacityBytes == 0 { + capacityBytes = DefaultCacheCapacity + } return &DFACache{ - states: make(map[StateKey]*State, maxStates), - stateList: make([]*State, 0, maxStates), - startTable: newStartTableFromByteMap(&byteMap), - maxStates: maxStates, - nextID: StartState + 1, + states: make(map[StateKey]*State, maxStates), + stateList: make([]*State, 0, maxStates), + startTable: newStartTableFromByteMap(&byteMap), + capacityBytes: capacityBytes, + nextID: StartState + 1, } } @@ -116,9 +124,9 @@ func TestCacheInsertDuplicate(t *testing.T) { } func TestCacheIsFull(t *testing.T) { - c := newTestCache(3) + c := newTestCache(100) // Start with large capacity - // Insert up to capacity + // Insert 3 states for i := nfa.StateID(0); i < 3; i++ { nfaStates := []nfa.StateID{i} key := ComputeStateKey(nfaStates) @@ -129,9 +137,10 @@ func TestCacheIsFull(t *testing.T) { } } - // Cache should be full + // Set capacity to current usage — should be full + c.capacityBytes = c.MemoryUsage() if !c.IsFull() { - t.Error("Cache should be full after inserting 
maxStates items") + t.Error("Cache should be full when capacity == usage") } // Next insert should fail with ErrCacheFull @@ -178,9 +187,9 @@ func TestCacheGetOrInsert(t *testing.T) { } func TestCacheGetOrInsertFull(t *testing.T) { - c := newTestCache(1) + c := newTestCache(100) // Start with large capacity - // Fill the cache + // Insert one state nfaStates := []nfa.StateID{1} key := ComputeStateKey(nfaStates) state := NewState(InvalidState, nfaStates, false) @@ -189,6 +198,9 @@ func TestCacheGetOrInsertFull(t *testing.T) { t.Fatalf("GetOrInsert failed: %v", err) } + // Set capacity to current usage — full + c.capacityBytes = c.MemoryUsage() + // Next GetOrInsert with new key should fail nfaStates2 := []nfa.StateID{2} key2 := ComputeStateKey(nfaStates2) @@ -354,8 +366,8 @@ func TestCacheResetClearCount(t *testing.T) { } func TestCacheCapacityBoundary(t *testing.T) { - // Test with capacity of 1 - c := newTestCache(1) + // Create cache, insert one state, measure usage, then set capacity to that + c := newTestCache(100) nfaStates := []nfa.StateID{1} key := ComputeStateKey(nfaStates) @@ -364,11 +376,13 @@ func TestCacheCapacityBoundary(t *testing.T) { // First insert should succeed _, err := c.Insert(key, state) if err != nil { - t.Fatalf("Insert on capacity-1 cache failed: %v", err) + t.Fatalf("Insert failed: %v", err) } + // Now set capacity to exactly the current usage — should be full + c.capacityBytes = c.MemoryUsage() if !c.IsFull() { - t.Error("Capacity-1 cache should be full after 1 insert") + t.Error("Cache should be full when capacity == usage") } // Second insert with different key should fail @@ -377,7 +391,7 @@ func TestCacheCapacityBoundary(t *testing.T) { state2 := NewState(InvalidState, nfaStates2, false) _, err = c.Insert(key2, state2) if !errors.Is(err, ErrCacheFull) { - t.Errorf("Second insert on capacity-1 cache: got %v, want ErrCacheFull", err) + t.Errorf("Insert on full cache: got %v, want ErrCacheFull", err) } } diff --git a/dfa/lazy/config.go 
b/dfa/lazy/config.go index 0df0c57..31901d8 100644 --- a/dfa/lazy/config.go +++ b/dfa/lazy/config.go @@ -5,18 +5,28 @@ package lazy // The configuration allows tuning the trade-off between memory usage and // performance. Larger caches provide better hit rates but consume more memory. type Config struct { - // MaxStates is the maximum number of DFA states to cache. - // When this limit is reached, the DFA clears the cache and continues - // DFA search (up to MaxCacheClears times per search), then falls back - // to NFA execution if the clear limit is exceeded. + // CacheCapacityBytes is the maximum memory (in bytes) that the DFA cache + // may use for transition tables, state storage, and metadata. // - // Default: 10,000 states (~1MB with 256-byte transition tables) - // Memory usage: ~100-200 bytes per state (depending on transitions) + // When MemoryUsage() exceeds this limit, the cache is considered full. + // The DFA will then clear the cache (up to MaxCacheClears times) and + // rebuild states on demand, or fall back to NFA. + // + // Default: 2MB (2 * 1024 * 1024), matching Rust regex's hybrid_cache_capacity. // // Tuning guidelines: - // - Simple patterns: 100-1,000 states sufficient - // - Complex patterns: 10,000-100,000 states - // - Memory-constrained: 1,000 states (~100KB) + // - Simple patterns: 256KB-1MB sufficient + // - Complex patterns or large alphabets: 2MB-10MB + // - Memory-constrained environments: 256KB + // - Performance-critical with complex patterns: 10MB-100MB + CacheCapacityBytes int + + // MaxStates is a legacy limit on the number of DFA states. + // When CacheCapacityBytes > 0, MaxStates is ignored. + // When CacheCapacityBytes == 0 and MaxStates > 0, + // an approximate byte limit is computed from MaxStates. + // + // Deprecated: Use CacheCapacityBytes instead. 
MaxStates uint32 // MaxCacheClears is the maximum number of times the DFA cache can be @@ -71,20 +81,24 @@ type Config struct { DeterminizationLimit int } +// DefaultCacheCapacity is the default DFA cache capacity in bytes. +// Matches Rust regex's hybrid_cache_capacity: 2 * (1 << 20) = 2MB. +const DefaultCacheCapacity = 2 * 1024 * 1024 + // DefaultConfig returns a configuration with sensible defaults. // // These defaults are tuned for general-purpose regex matching: -// - Balance memory usage (~1MB) with performance +// - Cache capacity: 2MB (matches Rust regex default) // - Enable prefilter for maximum speedup // - Prevent exponential state explosion // // For specific use cases, tune the parameters: -// - Memory-constrained: reduce MaxStates to 1,000 -// - Performance-critical: increase MaxStates to 100,000 +// - Memory-constrained: reduce CacheCapacityBytes to 256KB +// - Performance-critical: increase CacheCapacityBytes to 10MB // - Complex patterns: increase DeterminizationLimit func DefaultConfig() Config { return Config{ - MaxStates: 10_000, + CacheCapacityBytes: DefaultCacheCapacity, MaxCacheClears: 5, // Allow 5 cache clears before NFA fallback CacheHitThreshold: 0.0, // Disabled by default UsePrefilter: true, @@ -93,13 +107,27 @@ func DefaultConfig() Config { } } +// effectiveCapacityBytes returns the cache capacity in bytes. +// Uses CacheCapacityBytes if set, otherwise derives from legacy MaxStates. +func (c *Config) effectiveCapacityBytes() int { + if c.CacheCapacityBytes > 0 { + return c.CacheCapacityBytes + } + if c.MaxStates > 0 { + // Legacy: approximate bytes from state count. + // Each state uses ~100 bytes (flatTrans row + map entry + State struct). + return int(c.MaxStates) * 100 + } + return DefaultCacheCapacity +} + // Validate checks if the configuration is valid. // Returns an error if any parameter is out of acceptable range. 
func (c *Config) Validate() error { - if c.MaxStates == 0 { + if c.CacheCapacityBytes == 0 && c.MaxStates == 0 { return &DFAError{ Kind: InvalidConfig, - Message: "MaxStates must be > 0", + Message: "CacheCapacityBytes or MaxStates must be > 0", } } @@ -134,9 +162,20 @@ func (c *Config) Validate() error { return nil } -// WithMaxStates returns a new config with the specified max states +// WithCacheCapacity returns a new config with the specified cache capacity in bytes. +// Default is 2MB (matching Rust regex). Set to 0 to use MaxStates instead. +func (c Config) WithCacheCapacity(bytes int) Config { + c.CacheCapacityBytes = bytes + return c +} + +// WithMaxStates returns a new config with the specified max states. +// +// Deprecated: Use WithCacheCapacity instead. func (c Config) WithMaxStates(maxStates uint32) Config { c.MaxStates = maxStates + // Clear byte limit so legacy MaxStates takes effect + c.CacheCapacityBytes = 0 return c } diff --git a/dfa/lazy/coverage_final_test.go b/dfa/lazy/coverage_final_test.go index a35be87..7251f9f 100644 --- a/dfa/lazy/coverage_final_test.go +++ b/dfa/lazy/coverage_final_test.go @@ -373,12 +373,13 @@ func TestDetectAccelerationNilBuilder(t *testing.T) { // TestConfigValidateEdgeCases tests config validation edge cases. func TestConfigValidateEdgeCases(t *testing.T) { - // Invalid: zero max states + // Invalid: zero capacity and zero max states cfg := DefaultConfig() + cfg.CacheCapacityBytes = 0 cfg.MaxStates = 0 err := cfg.Validate() if err == nil { - t.Error("expected error for zero MaxStates") + t.Error("expected error for zero CacheCapacityBytes and MaxStates") } // Invalid: zero determinization limit diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go index 46bc320..5dfc23f 100644 --- a/dfa/lazy/lazy.go +++ b/dfa/lazy/lazy.go @@ -99,7 +99,7 @@ type DFA struct { // across searches via Reset(), or pooled via sync.Pool in the meta layer. 
// // The cache is initialized with: -// - A state map sized to config.MaxStates +// - A state map (grows on demand up to CacheCapacityBytes) // - A stateList for O(1) state-by-ID lookup // - A StartTable with the DFA's immutable byteMap func (d *DFA) NewCache() *DFACache { @@ -108,14 +108,14 @@ func (d *DFA) NewCache() *DFACache { const initCap = 64 stride := d.AlphabetLen() return &DFACache{ - states: make(map[StateKey]*State, initCap), - stateList: make([]*State, 0, initCap), - flatTrans: make([]StateID, 0, initCap*stride), - matchFlags: make([]bool, 0, initCap), - stride: stride, - startTable: newStartTableFromByteMap(&d.startByteMap), - maxStates: d.config.MaxStates, - nextID: StartState + 1, + states: make(map[StateKey]*State, initCap), + stateList: make([]*State, 0, initCap), + flatTrans: make([]StateID, 0, initCap*stride), + matchFlags: make([]bool, 0, initCap), + stride: stride, + startTable: newStartTableFromByteMap(&d.startByteMap), + capacityBytes: d.config.effectiveCapacityBytes(), + nextID: StartState + 1, } } @@ -1861,9 +1861,10 @@ func (d *DFA) accelerate(haystack []byte, pos int, exitBytes []byte) int { // Useful for performance tuning and diagnostics. // // Returns (size, capacity, hits, misses, hitRate). +// capacity is the cache limit in bytes. func (d *DFA) CacheStats(cache *DFACache) (size int, capacity uint32, hits, misses uint64, hitRate float64) { size = cache.Size() - capacity = d.config.MaxStates + capacity = uint32(cache.capacityBytes) hits, misses, hitRate = cache.Stats() return } From d68d59f5eb7e1048d56b787380d4a40c1ef01ecf Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:02:02 +0300 Subject: [PATCH 4/7] =?UTF-8?q?wip:=20SlotTable-based=20capture=20search?= =?UTF-8?q?=20=E2=80=94=20greedy=20loop=20capture=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SearchWithSlotTableCapturesAt now uses SlotTable instead of legacy COW. 
Works for simple patterns like (foo)(bar), but greedy repetitions (a+)(b+) lose group start positions during loop iterations. Root cause: addSearchThread CopySlots overwrites capture slots on each loop iteration. Need stack-based epsilon closure with RestoreCapture frames (Rust approach) to preserve capture context through loops. TODO: Convert recursive addSearchThread to stack-based with save/restore Status: 2 NFA unit test failures, all meta tests pass (meta still on COW) --- nfa/pikevm.go | 273 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 254 insertions(+), 19 deletions(-) diff --git a/nfa/pikevm.go b/nfa/pikevm.go index 22f6b9e..a3d790e 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -2009,30 +2009,265 @@ func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystac } // SearchWithSlotTableCaptures finds the first match and returns captures. -// -// NOTE: This method currently delegates to the legacy SearchWithCapturesAt -// because per-state SlotTable storage doesn't correctly track per-thread -// capture paths. The SlotTable architecture is designed for Find/IsMatch -// modes where captures are not needed. -// -// Future optimization: Implement a proper thread-indexed slot table similar -// to Rust's pikevm.rs Slots structure. -// -// Returns nil if no match found. +// Uses zero-allocation SlotTable architecture (Rust approach). func (p *PikeVM) SearchWithSlotTableCaptures(haystack []byte) *MatchWithCaptures { return p.SearchWithSlotTableCapturesAt(haystack, 0) } // SearchWithSlotTableCapturesAt finds the first match with captures starting from 'at'. +// Uses SlotTable for zero-allocation capture tracking (Rust approach). // -// NOTE: Currently delegates to legacy SearchWithCapturesAt for correct capture tracking. -// See SearchWithSlotTableCaptures for details. 
+// SlotTable per-state storage works correctly because the Visited sparse set +// guarantees each NFA state is visited at most once per generation — the same +// invariant that makes Rust's SlotTable correct. func (p *PikeVM) SearchWithSlotTableCapturesAt(haystack []byte, at int) *MatchWithCaptures { - // Delegate to the legacy capture implementation which correctly tracks - // per-thread capture positions using COW semantics. - // - // The SlotTable per-state architecture cannot correctly track captures - // because multiple threads can pass through the same state with different - // capture positions. A proper implementation would need thread-indexed slots. - return p.SearchWithCapturesAt(haystack, at) + if at > len(haystack) { + return nil + } + + // Configure SlotTable for full capture mode + totalSlots := p.nfa.CaptureCount() * 2 + p.internalState.SlotTable.SetActiveSlots(totalSlots) + + // Handle edge cases + if at == len(haystack) { + if p.matchesEmptyAt(haystack, at) { + return &MatchWithCaptures{ + Start: at, + End: at, + Captures: [][]int{{at, at}}, + } + } + return nil + } + if len(haystack) == 0 { + if p.matchesEmpty() { + return &MatchWithCaptures{ + Start: 0, + End: 0, + Captures: [][]int{{0, 0}}, + } + } + return nil + } + + if p.nfa.IsAnchored() { + return p.searchWithSlotTableCapturesAnchored(haystack, at) + } + return p.searchWithSlotTableCapturesUnanchored(haystack, at) +} + +// searchWithSlotTableCapturesUnanchored implements unanchored search with captures. +// Captures stored in SlotTable per-state, saved to bestSlots on match. 
+// +//nolint:gocognit +func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt int) *MatchWithCaptures { + st := &p.internalState + st.SearchQueue = st.SearchQueue[:0] + st.SearchNextQueue = st.SearchNextQueue[:0] + st.Visited.Clear() + st.SlotTable.Reset() + + totalSlots := st.SlotTable.ActiveSlots() + bestStart := -1 + bestEnd := -1 + // bestSlots stores capture slots for the best match found so far + var bestSlots []int + + for pos := startAt; pos <= len(haystack); pos++ { + if bestStart == -1 { + // Skip-ahead + if len(st.SearchQueue) == 0 && p.skipAhead != nil && pos > startAt { + candidate := p.skipAhead.Find(haystack, pos) + if candidate == -1 { + break + } + pos = candidate + } + st.Visited.Clear() + // Initialize slots for start state to all -1 (AllAbsent scratch) + absentSlots := st.SlotTable.AllAbsent() + for i := range absentSlots { + absentSlots[i] = -1 + } + startSid := p.nfa.StartAnchored() + // Copy absent slots to start state + startSlots := st.SlotTable.ForState(startSid) + if startSlots != nil { + copy(startSlots, absentSlots) + } + p.addSearchThread(searchThread{state: startSid, startPos: pos}, haystack, pos) + } + + if pos < len(haystack) { + b := haystack[pos] + st.Visited.Clear() + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if p.isBetterMatch(bestStart, bestEnd, t.startPos, pos) { + bestStart = t.startPos + bestEnd = pos + // Save capture slots for this match + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + if !st.Longest { + break + } + continue + } + p.stepSearchThread(t, b, haystack, pos+1) + } + } else { + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if p.isBetterMatch(bestStart, bestEnd, t.startPos, pos) { + bestStart = t.startPos + bestEnd = pos + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && 
totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + break + } + } + } + + if pos >= len(haystack) { + break + } + + if bestStart != -1 { + hasLeftmostCandidate := false + for _, t := range st.SearchNextQueue { + if t.startPos <= bestStart { + hasLeftmostCandidate = true + break + } + } + if !hasLeftmostCandidate { + break + } + } + + st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + } + + if bestStart == -1 { + return nil + } + return p.buildCapturesFromSlots(bestSlots, bestStart, bestEnd) +} + +// searchWithSlotTableCapturesAnchored implements anchored search with captures. +func (p *PikeVM) searchWithSlotTableCapturesAnchored(haystack []byte, startPos int) *MatchWithCaptures { + st := &p.internalState + st.SearchQueue = st.SearchQueue[:0] + st.SearchNextQueue = st.SearchNextQueue[:0] + st.Visited.Clear() + st.SlotTable.Reset() + + totalSlots := st.SlotTable.ActiveSlots() + + // Initialize start state slots + startSid := p.nfa.StartAnchored() + startSlots := st.SlotTable.ForState(startSid) + if startSlots != nil { + for i := range startSlots { + startSlots[i] = -1 + } + } + + p.addSearchThread(searchThread{state: startSid, startPos: startPos}, haystack, startPos) + + lastMatchPos := -1 + var bestSlots []int + + for pos := startPos; pos <= len(haystack); pos++ { + if pos < len(haystack) { + b := haystack[pos] + st.Visited.Clear() + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if pos > lastMatchPos || lastMatchPos == -1 { + lastMatchPos = pos + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + if !st.Longest { + break + } + continue + } + p.stepSearchThread(t, b, haystack, pos+1) + } + } else { + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if pos > lastMatchPos || lastMatchPos == -1 
{ + lastMatchPos = pos + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + break + } + } + } + + if len(st.SearchNextQueue) == 0 && (pos >= len(haystack) || lastMatchPos != -1) { + break + } + if pos >= len(haystack) { + break + } + + st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + } + + if lastMatchPos == -1 { + return nil + } + return p.buildCapturesFromSlots(bestSlots, startPos, lastMatchPos) +} + +// buildCapturesFromSlots converts flat slot data to MatchWithCaptures result. +func (p *PikeVM) buildCapturesFromSlots(slots []int, matchStart, matchEnd int) *MatchWithCaptures { + numGroups := p.nfa.CaptureCount() + captures := make([][]int, numGroups) + captures[0] = []int{matchStart, matchEnd} + + if slots != nil { + for i := 1; i < numGroups && i*2+1 < len(slots); i++ { + start := slots[i*2] + end := slots[i*2+1] + if start >= 0 && end >= 0 { + captures[i] = []int{start, end} + } + } + } + + return &MatchWithCaptures{ + Start: matchStart, + End: matchEnd, + Captures: captures, + } } From d667d34c0b2db64662da670e55657c59048be41e Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:08:39 +0300 Subject: [PATCH 5/7] wip: stack-based epsilon closure with RestoreCapture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Converted addSearchThread and addSearchThreadToNext from recursive to stack-based with captureFrame (Explore + RestoreCapture frames). Mirrors Rust pikevm.rs FollowEpsilon::RestoreCapture pattern. Still failing: greedy loop captures (a+)(b+) — per-state SlotTable overwrites group start on each loop iteration (State visited again in next generation). Per-thread COW preserves all variants. Root issue: per-state storage loses capture history across byte transitions in greedy loops. 
Need either per-thread indexing or generation-aware slot preservation. Status: 2 NFA unit tests fail, all meta tests pass --- nfa/pikevm.go | 330 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 216 insertions(+), 114 deletions(-) diff --git a/nfa/pikevm.go b/nfa/pikevm.go index a3d790e..8ad032f 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -57,6 +57,17 @@ type searchThread struct { startPos int // Position where this thread's match attempt started } +// captureFrame is a stack frame for capture-aware epsilon closure. +// Two kinds: Explore (process a state) and RestoreCapture (undo a capture write). +// Mirrors Rust's FollowEpsilon enum (pikevm.rs:1611). +type captureFrame struct { + state StateID // state to explore (InvalidState = restore frame) + startPos int // thread start position + // For restore frames: + slot int // slot index to restore + value int // old value to restore +} + // PikeVM implements the Pike VM algorithm for NFA execution. // It simulates the NFA by maintaining a set of active states and // exploring all possible paths through the automaton. @@ -113,6 +124,15 @@ type PikeVMState struct { // Reference: rust-regex/regex-automata/src/nfa/thompson/pikevm.rs:2044-2160 SlotTable *SlotTable + // captureStack is used for stack-based epsilon closure with capture save/restore. + // Mirrors Rust's FollowEpsilon::RestoreCapture pattern (pikevm.rs:1611-1637). + captureStack []captureFrame + + // currSlots is the working capture buffer during epsilon closure. + // Modified in-place, saved/restored via captureStack for StateCapture. + // Copied to SlotTable row when terminal state reached. + currSlots []int + // Longest enables leftmost-longest (POSIX) matching semantics. // By default (false), uses leftmost-first (Perl) semantics where // the first alternative wins. When true, the longest match wins. 
@@ -273,6 +293,12 @@ func (p *PikeVM) initState(state *PikeVMState) { // Each capture group has 2 slots (start and end position) slotsPerState := p.nfa.CaptureCount() * 2 state.SlotTable = NewSlotTable(p.nfa.States(), slotsPerState) + + // Capture-aware epsilon closure stack and working buffer + state.captureStack = make([]captureFrame, 0, capacity) + if slotsPerState > 0 { + state.currSlots = make([]int, slotsPerState) + } } // SetSkipAhead sets the prefilter for skip-ahead optimization. @@ -1812,72 +1838,115 @@ func (p *PikeVM) searchWithSlotTableAnchored(haystack []byte, startPos int) (int // addSearchThread adds a lightweight thread to the current queue, following epsilon transitions. // Captures are stored in SlotTable, not in the thread. func (p *PikeVM) addSearchThread(t searchThread, haystack []byte, pos int) { - // Check if already visited this state - if !p.internalState.Visited.Insert(uint32(t.state)) { - return - } + st := &p.internalState + activeSlots := st.SlotTable.ActiveSlots() - state := p.nfa.State(t.state) - if state == nil { - return - } + // Stack-based epsilon closure with capture save/restore (Rust approach). + // Uses currSlots as working buffer, modified in-place during closure. + // When terminal state reached: copy currSlots to SlotTable row. + // When StateCapture encountered: save old value, set new, push RestoreCapture. 
+ // Reference: rust-regex pikevm.rs:1611-1749 (FollowEpsilon::RestoreCapture) - switch state.Kind() { - case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: - p.internalState.SearchQueue = append(p.internalState.SearchQueue, t) + st.captureStack = st.captureStack[:0] + st.captureStack = append(st.captureStack, captureFrame{ + state: t.state, startPos: t.startPos, + }) - case StateEpsilon: - next := state.Epsilon() - if next != InvalidState { - p.addSearchThread(searchThread{state: next, startPos: t.startPos}, haystack, pos) - } - - case StateSplit: - left, right := state.Split() + for len(st.captureStack) > 0 { + n := len(st.captureStack) + frame := st.captureStack[n-1] + st.captureStack = st.captureStack[:n-1] - if left != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(left, t.state) + // RestoreCapture frame: undo a capture slot modification + if frame.state == InvalidState { + if activeSlots > 2 && frame.slot < len(st.currSlots) { + st.currSlots[frame.slot] = frame.value } - p.addSearchThread(searchThread{state: left, startPos: t.startPos}, haystack, pos) + continue } - if right != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(right, t.state) - } - p.addSearchThread(searchThread{state: right, startPos: t.startPos}, haystack, pos) + + sid := frame.state + if !st.Visited.Insert(uint32(sid)) { + continue } - case StateCapture: - groupIndex, isStart, next := state.Capture() - if next != InvalidState { - // For Find mode (activeSlots=2), group 0 is tracked via thread.startPos/pos - if p.internalState.SlotTable.ActiveSlots() > 2 { - slotIndex := int(groupIndex) * 2 - if !isStart { - slotIndex++ + state := p.nfa.State(sid) + if state == nil { + continue + } + + switch state.Kind() { + case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: + // Terminal state: copy currSlots to this state's SlotTable row + 
if activeSlots > 2 { + stateSlots := st.SlotTable.ForState(sid) + if stateSlots != nil { + copy(stateSlots, st.currSlots) } - if p.internalState.SlotTable.ActiveSlots() > slotIndex { - // Copy parent slots to next state first - p.internalState.SlotTable.CopySlots(next, t.state) - // Then update the capture slot - p.internalState.SlotTable.SetSlot(next, slotIndex, pos) + } + st.SearchQueue = append(st.SearchQueue, searchThread{ + state: sid, startPos: frame.startPos, + }) + + case StateEpsilon: + next := state.Epsilon() + if next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) + } + + case StateSplit: + left, right := state.Split() + // Push right first (processed last = DFS left-first ordering) + if right != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: right, startPos: frame.startPos, + }) + } + if left != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: left, startPos: frame.startPos, + }) + } + + case StateCapture: + groupIndex, isStart, next := state.Capture() + if next != InvalidState { + if activeSlots > 2 { + slotIndex := int(groupIndex) * 2 + if !isStart { + slotIndex++ + } + if slotIndex < len(st.currSlots) { + // Save old value for restore + oldValue := st.currSlots[slotIndex] + // Push RestoreCapture BEFORE explore (will execute after) + st.captureStack = append(st.captureStack, captureFrame{ + state: InvalidState, // marker: restore frame + slot: slotIndex, + value: oldValue, + }) + // Set new capture value + st.currSlots[slotIndex] = pos + } } + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThread(searchThread{state: next, startPos: t.startPos}, haystack, pos) - } - case StateLook: - look, next := state.Look() - if checkLookAssertion(look, haystack, pos) && next != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - 
p.internalState.SlotTable.CopySlots(next, t.state) + case StateLook: + look, next := state.Look() + if checkLookAssertion(look, haystack, pos) && next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThread(searchThread{state: next, startPos: t.startPos}, haystack, pos) - } - case StateFail: - // Dead state + case StateFail: + // Dead state + } } } @@ -1936,76 +2005,115 @@ func (p *PikeVM) stepSearchThread(t searchThread, b byte, haystack []byte, nextP // addSearchThreadToNext adds a lightweight thread to the next queue. // srcState is the state we came from (for slot copying). +// Uses stack-based epsilon closure with capture save/restore. func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystack []byte, pos int) { - if !p.internalState.Visited.Insert(uint32(t.state)) { - return - } + st := &p.internalState + activeSlots := st.SlotTable.ActiveSlots() - state := p.nfa.State(t.state) - if state == nil { - return + // Load currSlots from source state's SlotTable row + if activeSlots > 2 { + srcSlots := st.SlotTable.ForState(srcState) + if srcSlots != nil && len(st.currSlots) > 0 { + copy(st.currSlots, srcSlots) + } } - // Copy slots from source to new state (only for Captures mode) - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(t.state, srcState) - } + // Stack-based epsilon closure (same as addSearchThread but targets NextQueue) + st.captureStack = st.captureStack[:0] + st.captureStack = append(st.captureStack, captureFrame{ + state: t.state, startPos: t.startPos, + }) - switch state.Kind() { - case StateEpsilon: - next := state.Epsilon() - if next != InvalidState { - p.addSearchThreadToNext(searchThread{state: next, startPos: t.startPos}, t.state, haystack, pos) - } - return + for len(st.captureStack) > 0 { + n := len(st.captureStack) + frame := st.captureStack[n-1] + st.captureStack = st.captureStack[:n-1] - case 
StateSplit: - left, right := state.Split() - - if left != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(left, t.state) + if frame.state == InvalidState { + if activeSlots > 2 && frame.slot < len(st.currSlots) { + st.currSlots[frame.slot] = frame.value } - p.addSearchThreadToNext(searchThread{state: left, startPos: t.startPos}, left, haystack, pos) + continue } - if right != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(right, t.state) - } - p.addSearchThreadToNext(searchThread{state: right, startPos: t.startPos}, right, haystack, pos) + + sid := frame.state + if !st.Visited.Insert(uint32(sid)) { + continue } - return - case StateCapture: - groupIndex, isStart, next := state.Capture() - if next != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - slotIndex := int(groupIndex) * 2 - if !isStart { - slotIndex++ + state := p.nfa.State(sid) + if state == nil { + continue + } + + switch state.Kind() { + case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: + if activeSlots > 2 { + stateSlots := st.SlotTable.ForState(sid) + if stateSlots != nil { + copy(stateSlots, st.currSlots) } - if p.internalState.SlotTable.ActiveSlots() > slotIndex { - p.internalState.SlotTable.CopySlots(next, t.state) - p.internalState.SlotTable.SetSlot(next, slotIndex, pos) + } + // Add to NEXT queue (not current) + st.SearchNextQueue = append(st.SearchNextQueue, searchThread{ + state: sid, startPos: frame.startPos, + }) + + case StateEpsilon: + next := state.Epsilon() + if next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) + } + + case StateSplit: + left, right := state.Split() + if right != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: right, startPos: frame.startPos, + }) + } + if left != InvalidState { + st.captureStack = 
append(st.captureStack, captureFrame{ + state: left, startPos: frame.startPos, + }) + } + + case StateCapture: + groupIndex, isStart, next := state.Capture() + if next != InvalidState { + if activeSlots > 2 { + slotIndex := int(groupIndex) * 2 + if !isStart { + slotIndex++ + } + if slotIndex < len(st.currSlots) { + oldValue := st.currSlots[slotIndex] + st.captureStack = append(st.captureStack, captureFrame{ + state: InvalidState, + slot: slotIndex, + value: oldValue, + }) + st.currSlots[slotIndex] = pos + } } + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThreadToNext(searchThread{state: next, startPos: t.startPos}, next, haystack, pos) - } - return - case StateLook: - look, next := state.Look() - if checkLookAssertion(look, haystack, pos) && next != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(next, t.state) + case StateLook: + look, next := state.Look() + if checkLookAssertion(look, haystack, pos) && next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThreadToNext(searchThread{state: next, startPos: t.startPos}, next, haystack, pos) + + case StateFail: } - return } - - // Add to next queue - p.internalState.SearchNextQueue = append(p.internalState.SearchNextQueue, t) } // SearchWithSlotTableCaptures finds the first match and returns captures. 
@@ -2085,16 +2193,10 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt pos = candidate } st.Visited.Clear() - // Initialize slots for start state to all -1 (AllAbsent scratch) - absentSlots := st.SlotTable.AllAbsent() - for i := range absentSlots { - absentSlots[i] = -1 - } startSid := p.nfa.StartAnchored() - // Copy absent slots to start state - startSlots := st.SlotTable.ForState(startSid) - if startSlots != nil { - copy(startSlots, absentSlots) + // Initialize currSlots to -1 (unset) before epsilon closure + for i := range st.currSlots { + st.currSlots[i] = -1 } p.addSearchThread(searchThread{state: startSid, startPos: pos}, haystack, pos) } From f4c3fd0d3a9bde2eff10b77fb1ad326b98281484 Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:17:57 +0300 Subject: [PATCH 6/7] =?UTF-8?q?feat:=20dual=20SlotTable=20capture=20tracki?= =?UTF-8?q?ng=20=E2=80=94=20zero-alloc=20FindSubmatch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement Rust-style dual SlotTable (curr/next) for capture propagation across byte transitions. Stack-based epsilon closure with RestoreCapture frames preserves capture context through greedy loops. 
Key changes: - Add NextSlotTable + captureStack + currSlots to PikeVMState - addSearchThread: stack-based with captureFrame (Explore + RestoreCapture) - addSearchThreadToNext: loads from curr SlotTable, writes to next - Swap SlotTable/NextSlotTable after each byte (Rust mem::swap pattern) - Don't clear Visited before seed — prevents SlotTable row overwrite - Wire meta FindSubmatch to use SlotTable path - Fix empty match capture groups (buildCapturesFromSlots) FindAllSubmatch (5 patterns, 50K matches, 800KB input): - Alloc: 554MB -> 26MB (-95%) - Mallocs: 12.5M -> 440K (-96%) - Time: 1.48s -> 0.45s (3.3x faster) --- meta/findall.go | 4 +-- nfa/pikevm.go | 81 +++++++++++++++++++++++++------------------------ 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/meta/findall.go b/meta/findall.go index ba80453..f4fcb4e 100644 --- a/meta/findall.go +++ b/meta/findall.go @@ -90,7 +90,7 @@ func (e *Engine) findSubmatchAtWithState(haystack []byte, at int, state *SearchS case UseBoundedBacktracker, UseNFA, UseDFA, UseBoth, UseDigitPrefilter: atomic.AddUint64(&e.stats.NFASearches, 1) - nfaMatch := state.pikevm.SearchWithCapturesAt(haystack, at) + nfaMatch := state.pikevm.SearchWithSlotTableCapturesAt(haystack, at) if nfaMatch == nil { return nil } @@ -118,7 +118,7 @@ func (e *Engine) findSubmatchAtWithState(haystack []byte, at int, state *SearchS nfaMatch := state.pikevm.SearchWithCapturesInSpan(haystack, start, end) if nfaMatch == nil { // Defensive fallback: DFA found a match but PikeVM disagrees. 
- nfaMatch = state.pikevm.SearchWithCapturesAt(haystack, at) + nfaMatch = state.pikevm.SearchWithSlotTableCapturesAt(haystack, at) if nfaMatch == nil { return nil } diff --git a/nfa/pikevm.go b/nfa/pikevm.go index 8ad032f..34446d8 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -118,11 +118,12 @@ type PikeVMState struct { // Reference: rust-regex/regex-automata/src/nfa/thompson/pikevm.rs:2198 epsilonStack []StateID - // SlotTable stores capture slot values per NFA state. - // This is a 2D table (flattened to 1D) following the Rust regex architecture. - // Enables O(1) access to capture positions for any state. - // Reference: rust-regex/regex-automata/src/nfa/thompson/pikevm.rs:2044-2160 - SlotTable *SlotTable + // SlotTable / NextSlotTable: two slot tables swapped between generations. + // Mirrors Rust's curr/next ActiveStates pattern (pikevm.rs:1878). + // SlotTable = current generation, NextSlotTable = next generation. + // After each byte transition: swap(SlotTable, NextSlotTable). + SlotTable *SlotTable + NextSlotTable *SlotTable // captureStack is used for stack-based epsilon closure with capture save/restore. // Mirrors Rust's FollowEpsilon::RestoreCapture pattern (pikevm.rs:1611-1637). 
@@ -289,10 +290,11 @@ func (p *PikeVM) initState(state *PikeVMState) { // Pre-allocate epsilon stack for loop-based closure in IsMatch (Rust pattern) state.epsilonStack = make([]StateID, 0, capacity) - // Initialize SlotTable for capture tracking + // Initialize SlotTables for capture tracking (curr/next, swapped per byte) // Each capture group has 2 slots (start and end position) slotsPerState := p.nfa.CaptureCount() * 2 state.SlotTable = NewSlotTable(p.nfa.States(), slotsPerState) + state.NextSlotTable = NewSlotTable(p.nfa.States(), slotsPerState) // Capture-aware epsilon closure stack and working buffer state.captureStack = make([]captureFrame, 0, capacity) @@ -1837,6 +1839,8 @@ func (p *PikeVM) searchWithSlotTableAnchored(haystack []byte, startPos int) (int // addSearchThread adds a lightweight thread to the current queue, following epsilon transitions. // Captures are stored in SlotTable, not in the thread. +// +//nolint:gocognit // Stack-based epsilon closure with 7 state types is inherently complex func (p *PikeVM) addSearchThread(t searchThread, haystack []byte, pos int) { st := &p.internalState activeSlots := st.SlotTable.ActiveSlots() @@ -2006,11 +2010,13 @@ func (p *PikeVM) stepSearchThread(t searchThread, b byte, haystack []byte, nextP // addSearchThreadToNext adds a lightweight thread to the next queue. // srcState is the state we came from (for slot copying). // Uses stack-based epsilon closure with capture save/restore. 
+// +//nolint:gocognit,gocyclo,cyclop // Stack-based epsilon closure with capture save/restore func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystack []byte, pos int) { st := &p.internalState activeSlots := st.SlotTable.ActiveSlots() - // Load currSlots from source state's SlotTable row + // Load currSlots from CURRENT SlotTable (source state's row) if activeSlots > 2 { srcSlots := st.SlotTable.ForState(srcState) if srcSlots != nil && len(st.currSlots) > 0 { @@ -2018,7 +2024,7 @@ func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystac } } - // Stack-based epsilon closure (same as addSearchThread but targets NextQueue) + // Stack-based epsilon closure writing to NEXT SlotTable st.captureStack = st.captureStack[:0] st.captureStack = append(st.captureStack, captureFrame{ state: t.state, startPos: t.startPos, @@ -2048,13 +2054,13 @@ func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystac switch state.Kind() { case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: + // Write to NEXT SlotTable (not current!) if activeSlots > 2 { - stateSlots := st.SlotTable.ForState(sid) + stateSlots := st.NextSlotTable.ForState(sid) if stateSlots != nil { copy(stateSlots, st.currSlots) } } - // Add to NEXT queue (not current) st.SearchNextQueue = append(st.SearchNextQueue, searchThread{ state: sid, startPos: frame.startPos, }) @@ -2123,41 +2129,32 @@ func (p *PikeVM) SearchWithSlotTableCaptures(haystack []byte) *MatchWithCaptures } // SearchWithSlotTableCapturesAt finds the first match with captures starting from 'at'. -// Uses SlotTable for zero-allocation capture tracking (Rust approach). -// -// SlotTable per-state storage works correctly because the Visited sparse set -// guarantees each NFA state is visited at most once per generation — the same -// invariant that makes Rust's SlotTable correct. +// Uses dual SlotTable (curr/next) for zero-allocation capture tracking. 
+// Matches Rust's PikeVM Cache with curr/next ActiveStates (pikevm.rs:1878). func (p *PikeVM) SearchWithSlotTableCapturesAt(haystack []byte, at int) *MatchWithCaptures { if at > len(haystack) { return nil } - // Configure SlotTable for full capture mode totalSlots := p.nfa.CaptureCount() * 2 p.internalState.SlotTable.SetActiveSlots(totalSlots) + p.internalState.NextSlotTable.SetActiveSlots(totalSlots) + + numGroups := p.nfa.CaptureCount() - // Handle edge cases if at == len(haystack) { if p.matchesEmptyAt(haystack, at) { - return &MatchWithCaptures{ - Start: at, - End: at, - Captures: [][]int{{at, at}}, - } + return p.buildCapturesFromSlots(nil, at, at) } return nil } if len(haystack) == 0 { if p.matchesEmpty() { - return &MatchWithCaptures{ - Start: 0, - End: 0, - Captures: [][]int{{0, 0}}, - } + return p.buildCapturesFromSlots(nil, 0, 0) } return nil } + _ = numGroups if p.nfa.IsAnchored() { return p.searchWithSlotTableCapturesAnchored(haystack, at) @@ -2168,23 +2165,23 @@ func (p *PikeVM) SearchWithSlotTableCapturesAt(haystack []byte, at int) *MatchWi // searchWithSlotTableCapturesUnanchored implements unanchored search with captures. // Captures stored in SlotTable per-state, saved to bestSlots on match. 
// -//nolint:gocognit +//nolint:gocognit,gocyclo,cyclop // Merged match-check + step + seed loop func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt int) *MatchWithCaptures { st := &p.internalState st.SearchQueue = st.SearchQueue[:0] st.SearchNextQueue = st.SearchNextQueue[:0] st.Visited.Clear() st.SlotTable.Reset() + st.NextSlotTable.Reset() totalSlots := st.SlotTable.ActiveSlots() + st.NextSlotTable.SetActiveSlots(totalSlots) bestStart := -1 bestEnd := -1 - // bestSlots stores capture slots for the best match found so far var bestSlots []int for pos := startAt; pos <= len(haystack); pos++ { if bestStart == -1 { - // Skip-ahead if len(st.SearchQueue) == 0 && p.skipAhead != nil && pos > startAt { candidate := p.skipAhead.Find(haystack, pos) if candidate == -1 { @@ -2192,9 +2189,10 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } pos = candidate } - st.Visited.Clear() + // DON'T clear Visited here — states already in SearchQueue must + // not be overwritten by new seed. Visited prevents re-entry. + // Visited is cleared before step loop (below), not before seed. 
startSid := p.nfa.StartAnchored() - // Initialize currSlots to -1 (unset) before epsilon closure for i := range st.currSlots { st.currSlots[i] = -1 } @@ -2209,7 +2207,7 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt if p.isBetterMatch(bestStart, bestEnd, t.startPos, pos) { bestStart = t.startPos bestEnd = pos - // Save capture slots for this match + // Read from CURRENT SlotTable matchSlots := st.SlotTable.ForState(t.state) if matchSlots != nil && totalSlots > 0 { if bestSlots == nil { @@ -2223,6 +2221,7 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } continue } + // stepSearchThread → addSearchThreadToNext reads curr, writes next p.stepSearchThread(t, b, haystack, pos+1) } } else { @@ -2261,7 +2260,9 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } } + // Swap queues AND SlotTables (Rust: core::mem::swap(curr, next)) st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + st.SlotTable, st.NextSlotTable = st.NextSlotTable, st.SlotTable } if bestStart == -1 { @@ -2271,24 +2272,23 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } // searchWithSlotTableCapturesAnchored implements anchored search with captures. 
+// +//nolint:gocognit // Merged match-check + step loop (Rust's nexts pattern) func (p *PikeVM) searchWithSlotTableCapturesAnchored(haystack []byte, startPos int) *MatchWithCaptures { st := &p.internalState st.SearchQueue = st.SearchQueue[:0] st.SearchNextQueue = st.SearchNextQueue[:0] st.Visited.Clear() st.SlotTable.Reset() + st.NextSlotTable.Reset() totalSlots := st.SlotTable.ActiveSlots() + st.NextSlotTable.SetActiveSlots(totalSlots) - // Initialize start state slots startSid := p.nfa.StartAnchored() - startSlots := st.SlotTable.ForState(startSid) - if startSlots != nil { - for i := range startSlots { - startSlots[i] = -1 - } + for i := range st.currSlots { + st.currSlots[i] = -1 } - p.addSearchThread(searchThread{state: startSid, startPos: startPos}, haystack, startPos) lastMatchPos := -1 @@ -2343,6 +2343,7 @@ func (p *PikeVM) searchWithSlotTableCapturesAnchored(haystack []byte, startPos i } st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + st.SlotTable, st.NextSlotTable = st.NextSlotTable, st.SlotTable } if lastMatchPos == -1 { From e71029c2d43dfc31e415fee4ddb7aa5a94e583fe Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:36:20 +0300 Subject: [PATCH 7/7] docs: update CHANGELOG, OPTIMIZATIONS, add ARCHITECTURE.md for v0.12.19 - CHANGELOG: add SlotTable capture tracking entry - OPTIMIZATIONS: add #10 Dual SlotTable (95% less memory), update version - ARCHITECTURE.md: new file documenting engine architecture, memory model, thread safety, and Rust alignment --- CHANGELOG.md | 8 +++ README.md | 25 +++++----- docs/ARCHITECTURE.md | 110 ++++++++++++++++++++++++++++++++++++++++++ docs/OPTIMIZATIONS.md | 61 +++++++++++++++++++++-- 4 files changed, 189 insertions(+), 15 deletions(-) create mode 100644 docs/ARCHITECTURE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 9866ec2..433028f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[0.12.19] - 2026-03-24 ### Performance +- **Zero-alloc FindSubmatch via dual SlotTable** (Rust approach) — replaced per-thread + COW capture allocation with Rust-style flat SlotTable. Two SlotTables (curr/next) + swap between byte generations. Stack-based epsilon closure with RestoreCapture + frames preserves capture context through greedy loops. FindAllSubmatch (5 patterns, + 50K matches, 800KB input): alloc **554MB → 26MB** (-95%), mallocs **12.5M → 440K** + (-96%), time **1.48s → 0.45s** (3.3x faster). Reference: Rust `pikevm.rs` + `ActiveStates` + `SlotTable` + `FollowEpsilon::RestoreCapture`. + - **Rust-aligned BoundedBacktracker visited limit for UseNFA** — reduced visited table capacity from 32M entries (64MB) to 128K entries (256KB) for UseNFA paths, matching Rust regex's `visited_capacity` default. On Kostya's LangArena LogParser diff --git a/README.md b/README.md index 5d3fd70..3c4201c 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ High-performance regex engine for Go. Drop-in replacement for `regexp` with **3- Go's stdlib `regexp` is intentionally simple — single NFA engine, no optimizations. This guarantees O(n) time but leaves performance on the table. 
coregex brings Rust regex-crate architecture to Go: -- **Multi-engine**: Lazy DFA, PikeVM, OnePass, BoundedBacktracker +- **Multi-engine**: 17 strategies — Lazy DFA, PikeVM, OnePass, BoundedBacktracker, and more - **SIMD prefilters**: AVX2/SSSE3 for fast candidate rejection - **Reverse search**: Suffix/inner literal patterns run 1000x+ faster - **O(n) guarantee**: No backtracking, no ReDoS vulnerabilities @@ -187,20 +187,23 @@ Uses Go's `regexp/syntax` parser: ``` Pattern → Parse → NFA → Literal Extract → Strategy Select ↓ - ┌─────────────────────────────────┐ - │ Engines (17 strategies): │ - │ LazyDFA, PikeVM, OnePass, │ - │ BoundedBacktracker, │ - │ ReverseInner, ReverseSuffix, │ - │ ReverseSuffixSet, AnchoredLiteral, │ - │ CharClassSearcher, Teddy, │ - │ DigitPrefilter, AhoCorasick, │ - │ CompositeSearcher, BranchDispatch │ - └─────────────────────────────────┘ + ┌────────────────────────────────────────────┐ + │ Engines (17 strategies): │ + │ LazyDFA, PikeVM, OnePass, │ + │ BoundedBacktracker, ReverseAnchored, │ + │ ReverseInner, ReverseSuffix, │ + │ ReverseSuffixSet, MultilineReverseSuffix, │ + │ AnchoredLiteral, CharClassSearcher, │ + │ Teddy, DigitPrefilter, AhoCorasick, │ + │ CompositeSearcher, BranchDispatch, Both │ + └────────────────────────────────────────────┘ ↓ Input → Prefilter (SIMD) → Engine → Match Result ``` +> For detailed architecture documentation, see [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md). +> For optimization details, see [docs/OPTIMIZATIONS.md](docs/OPTIMIZATIONS.md). + **SIMD Primitives** (AMD64): - `memchr` — single byte search (AVX2) - `memmem` — substring search (SSSE3) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..4d5b05d --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,110 @@ +# coregex Architecture + +Production-grade regex engine for Go achieving 3-3000x speedup over stdlib +through multi-engine architecture and SIMD optimizations. 
+
+## Execution Pipeline
+
+```
+Pattern → Parse → NFA Compile → Literal Extract → Strategy Select
+                                   ↓
+        ┌────────────────────────────────────────────────────────┐
+        │ Strategy (one of 17):                                  │
+        │  UseNFA, UseDFA, UseBoth, UseOnePass,                  │
+        │  UseReverseAnchored, UseReverseInner,                  │
+        │  UseReverseSuffix, UseReverseSuffixSet,                │
+        │  UseMultilineReverseSuffix, UseTeddy,                  │
+        │  UseBoundedBacktracker, UseCharClass,                  │
+        │  UseDigitPrefilter, UseAhoCorasick,                    │
+        │  UseComposite, UseAnchoredLiteral, UseBranchDispatch   │
+        └────────────────────────────────────────────────────────┘
+                                   ↓
+Input → Prefilter (memchr/memmem/teddy) → Engine Search → Match Result
+```
+
+## Engine Architecture (Rust-aligned)
+
+### DFA Layer (`dfa/lazy/`)
+
+- **Lazy DFA**: On-demand state construction with byte class compression
+- **Flat transition table**: `flatTrans[sid*stride+class]` — single array lookup, no pointer chase
+- **Byte-based cache limit**: 2MB default (matches Rust `hybrid_cache_capacity`)
+- **Cache clearing**: Up to 5 clears before NFA fallback (Rust approach)
+- **Acceleration**: Detects self-loop states, uses SIMD memchr for skip-ahead
+- **Integrated prefilter**: Skip-ahead at start state in DFA loop (Rust `hybrid/search.rs:232`)
+- **Per-goroutine cache**: Immutable DFA + mutable DFACache (thread-safe)
+
+### PikeVM Layer (`nfa/pikevm.go`)
+
+- **Dual SlotTable**: Flat per-state capture storage (curr/next, swapped per byte)
+  - Zero-allocation capture tracking (Rust `ActiveStates` pattern)
+  - Stack-based epsilon closure with `RestoreCapture` frames
+  - `searchThread` (12 bytes) vs legacy `thread` (40+ bytes with COW)
+- **Integrated prefilter**: Skip-ahead when no active threads (Rust `pikevm.rs:1293`)
+- **SearchMode**: Dynamic slot sizing (0=IsMatch, 2=Find, full=Captures)
+
+### BoundedBacktracker (`nfa/backtrack.go`)
+
+- Generation-based visited table (O(1) reset, uint16)
+- Visited limit: 256KB for UseNFA (Rust default), 64MB for UseBoundedBacktracker (POSIX)
+- Fallback to PikeVM when input exceeds capacity
+
+### Prefilter Layer (`prefilter/`)
+
+- **AVX2 
memchr**: SIMD byte search (12x faster than `bytes.IndexByte`) +- **Memmem**: SIMD substring search with Rabin-Karp fingerprinting +- **Teddy**: SIMD multi-pattern matching (1-8 patterns, AVX2/SSSE3) +- **Aho-Corasick**: DFA-based multi-pattern for >8 patterns +- **DigitPrefilter**: SIMD digit detection for `\d+` patterns + +## Memory Architecture + +### Per-Pattern (compile-time, shared immutable) +- NFA graph (states, transitions) +- DFA configuration (byte classes, start map) +- Prefilter (literal tables, SIMD masks) +- Strategy-specific searchers (reverse DFA, composite, etc.) + +### Per-Goroutine (search-time, pooled via sync.Pool) +- `SearchState` holds all mutable search state +- `DFACache`: flat transition table + state map (2MB default) +- `PikeVMState`: dual SlotTable + thread queues + visited set +- `BacktrackerState`: visited array + generation counter + +### Memory Budget (Kostya LangArena, 13 patterns, 7MB log) + +| Component | v0.12.18 | v0.12.19 | +|-----------|---------|---------| +| Total alloc (FindAll) | 89 MB | **25 MB** | +| RSS | 353 MB | **41 MB** | +| FindAllSubmatch (5 pat, 50K matches) | 554 MB | **26 MB** | + +## Thread Safety + +``` + ┌──────────────┐ + │ Engine │ ← Immutable after compile + │ (shared) │ + └──────┬───────┘ + │ + ┌────────────┼────────────┐ + ↓ ↓ ↓ + ┌────────────┐ ┌────────────┐ ┌────────────┐ + │ SearchState│ │ SearchState│ │ SearchState│ ← Per-goroutine + │(goroutine1)│ │(goroutine2)│ │(goroutine3)│ (sync.Pool) + └────────────┘ └────────────┘ └────────────┘ +``` + +## Key Design Decisions + +1. **Multi-engine**: Strategy selection at compile time, not runtime +2. **Rust reference**: Architecture mirrors Rust regex crate (lazy DFA, PikeVM, prefilters) +3. **Go stdlib compat**: POSIX leftmost-longest semantics (differs from Rust leftmost-first) +4. **Zero-alloc hot paths**: `IsMatch()`, `FindIndices()`, `Count()` — no heap allocation +5. 
**SIMD first**: AVX2/SSSE3 prefilters for x86_64, pure Go fallback for other archs + +## References + +- [Rust regex crate](https://github.com/rust-lang/regex) — primary architecture reference +- [RE2](https://github.com/google/re2) — O(n) performance guarantees +- [Hyperscan](https://github.com/intel/hyperscan) — SIMD multi-pattern (Teddy algorithm) diff --git a/docs/OPTIMIZATIONS.md b/docs/OPTIMIZATIONS.md index c0e9f17..dccef6c 100644 --- a/docs/OPTIMIZATIONS.md +++ b/docs/OPTIMIZATIONS.md @@ -1,6 +1,6 @@ # coregex Optimizations that Beat Rust regex -This document describes the 9 key optimizations in coregex that outperform the Rust regex crate. +This document describes the 10 key optimizations in coregex that outperform the Rust regex crate. These algorithms are critical to coregex's competitive advantage and **MUST NOT REGRESS**. ## Summary @@ -8,6 +8,7 @@ These algorithms are critical to coregex's competitive advantage and **MUST NOT | Optimization | File | Pattern Type | vs stdlib | Benchmark | |--------------|------|--------------|-----------|-----------| | **AnchoredLiteral** | `meta/anchored_literal.go` | `^prefix.*suffix$` | **32-133x faster** | anchored_literal | +| **Flat SlotTable** | `nfa/pikevm.go`, `nfa/slot_table.go` | FindSubmatch | **-95% memory** | submatch | | CharClassSearcher | `nfa/charclass_searcher.go` | `[\w]+`, `[a-z]+` | **23x faster** | char_class | | CompositeSearcher | `nfa/composite.go` | `[a-zA-Z]+[0-9]+` | **5x faster** | composite | | BranchDispatch | `nfa/branch_dispatch.go` | `^(\d+\|UUID\|hex32)` | **5-20x faster** | anchored_alt | @@ -593,6 +594,58 @@ bash scripts/bench.sh --compare baseline current --- +## 10. 
Dual SlotTable Capture Tracking (95% less memory) - NEW in v0.12.19 + +**Files**: `nfa/pikevm.go`, `nfa/slot_table.go` + +**Pattern types**: All FindSubmatch/FindAllSubmatch patterns with capture groups + +### Architecture + +Replaces per-thread COW (copy-on-write) capture allocation with Rust-style flat +SlotTable indexed by NFA state ID. Two SlotTables (curr/next) swap between byte +generations — matching Rust's `ActiveStates` pattern. + +``` +SlotTable layout: table[stateID * slotsPerState + slotIndex] +Each NFA state owns a row of slots: [g0_start, g0_end, g1_start, g1_end, ...] + +Epsilon closure: stack-based with RestoreCapture frames + Explore(sid) → process state, push children + RestoreCapture(slot) → undo capture write after subtree processed +``` + +Key invariant: Visited sparse set guarantees each NFA state is visited at most +once per generation → one thread per state → per-state storage is correct. + +### Why faster than COW approach + +| | COW (old) | SlotTable (new) | +|---|---|---| +| Thread fork | `make([]int, numSlots)` — heap alloc | `copy(row, currSlots)` — no alloc | +| Capture update | COW copy if shared — heap alloc | `currSlots[i] = pos` — in-place | +| Match save | `copyData()` — heap alloc | `copy(bestSlots, row)` — one copy | +| Memory per search | O(threads × slots) | O(states × slots) — fixed | + +### Benchmark data + +``` +FindAllSubmatch: 5 patterns, 50K matches, 800KB input + +Metric COW (old) SlotTable (new) Improvement +Alloc 554 MB 26 MB -95% +Mallocs 12,500,000 440,000 -96% +Time 1.48s 0.45s 3.3x faster +``` + +### Reference + +- Rust: `regex-automata/src/nfa/thompson/pikevm.rs:2065` (SlotTable struct) +- Rust: `regex-automata/src/nfa/thompson/pikevm.rs:1611` (FollowEpsilon::RestoreCapture) +- Rust: `regex-automata/src/nfa/thompson/pikevm.rs:1878` (Cache with curr/next ActiveStates) + +--- + ## References - **Rust regex crate**: Architecture inspiration for multi-engine design @@ -603,6 +656,6 @@ bash scripts/bench.sh --compare 
baseline current --- -*Document version: 1.2.0* -*Last updated: 2026-01-15* -*Benchmark data: regex-bench v0.11.0* +*Document version: 1.3.0* +*Last updated: 2026-03-24* +*Benchmark data: regex-bench v0.12.19*