From 5b2499daf24c9655257348d1b0975e8c79e2122b Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 15:55:01 +0300 Subject: [PATCH 1/7] =?UTF-8?q?perf:=20remove=20dual=20transition=20storag?= =?UTF-8?q?e=20=E2=80=94=20State.transitions=20eliminated?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove transitions []StateID and transitionCount from State struct. Transitions now stored exclusively in DFACache.flatTrans flat table. - Remove State.AddTransition(), Transition(), Stride(), TransitionCount() - Remove Builder.move() (unused after DetectAcceleration simplification) - Simplify DetectAcceleration/DetectAccelerationFromCached to return nil - Add DetectAccelerationFromFlat() reading from flat table - Simplify tryDetectAccelerationWithCache (flatTrans-only path) - Remove 3 redundant AddTransition calls from determinize - Update tests: add TestDetectAccelerationFromFlat, remove State transition tests Memory: ~222MB -> ~150MB (eliminates redundant per-state transition slices) --- CHANGELOG.md | 9 ++ dfa/lazy/accel_test.go | 40 +++--- dfa/lazy/anchored_search_prefilter_test.go | 63 ++-------- dfa/lazy/builder.go | 139 ++++++--------------- dfa/lazy/lazy.go | 20 ++- dfa/lazy/state.go | 71 ++--------- dfa/lazy/state_set_test.go | 96 +------------- 7 files changed, 95 insertions(+), 343 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 91b97c2..710a167 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.19] - 2026-03-24 + +### Performance +- **Remove dual transition storage** — eliminated `transitions []StateID` and + `transitionCount` from `State` struct. Transitions now stored exclusively in + `DFACache.flatTrans`. 
Removes ~93MB of redundant per-state transition slices + (222MB → ~150MB). Acceleration detection migrated to `DetectAccelerationFromFlat()` + reading directly from flat table. + ## [0.12.18] - 2026-03-24 ### Performance diff --git a/dfa/lazy/accel_test.go b/dfa/lazy/accel_test.go index 8c31819..d434dea 100644 --- a/dfa/lazy/accel_test.go +++ b/dfa/lazy/accel_test.go @@ -87,37 +87,37 @@ func TestDetectAcceleration(t *testing.T) { } func TestDetectAccelerationFromCached(t *testing.T) { - // Test the lazy detection that only uses cached transitions + // State no longer stores transitions — DetectAccelerationFromCached returns nil. + // Acceleration is now detected via DetectAccelerationFromFlat using flatTrans. state := NewState(StateID(1), []nfa.StateID{0}, false) - - // Initially no cached transitions - should return nil exitBytes := DetectAccelerationFromCached(state) if exitBytes != nil { - t.Errorf("Expected nil with no cached transitions, got %v", exitBytes) + t.Errorf("Expected nil (State has no transitions), got %v", exitBytes) } +} + +func TestDetectAccelerationFromFlat(t *testing.T) { + // Test acceleration detection via flat transition table + stride := 256 + sid := StateID(1) + flatTrans := make([]StateID, 2*stride) // 2 states - // Add 250 self-loop transitions + // State 1: 250 self-loops, 3 exits to state 2, 3 dead + base := int(sid) * stride for i := 0; i < 250; i++ { - state.AddTransition(byte(i), StateID(1)) // Self-loop + flatTrans[base+i] = sid // Self-loop } + flatTrans[base+250] = StateID(2) + flatTrans[base+251] = StateID(2) + flatTrans[base+252] = StateID(2) + flatTrans[base+253] = DeadState + flatTrans[base+254] = DeadState + flatTrans[base+255] = DeadState - // Add 3 exit bytes - state.AddTransition(byte(250), StateID(2)) // Exit to state 2 - state.AddTransition(byte(251), StateID(2)) // Exit to state 2 - state.AddTransition(byte(252), StateID(2)) // Exit to state 2 - - // Add 3 dead transitions - state.AddTransition(byte(253), DeadState) - 
state.AddTransition(byte(254), DeadState) - state.AddTransition(byte(255), DeadState) - - // Now should detect as accelerable - exitBytes = DetectAccelerationFromCached(state) + exitBytes := DetectAccelerationFromFlat(sid, flatTrans, stride, nil) if len(exitBytes) != 3 { t.Errorf("Expected 3 exit bytes, got %v", exitBytes) } - - // Verify the exit bytes are correct expected := map[byte]bool{250: true, 251: true, 252: true} for _, b := range exitBytes { if !expected[b] { diff --git a/dfa/lazy/anchored_search_prefilter_test.go b/dfa/lazy/anchored_search_prefilter_test.go index ee8584a..0593e89 100644 --- a/dfa/lazy/anchored_search_prefilter_test.go +++ b/dfa/lazy/anchored_search_prefilter_test.go @@ -73,82 +73,39 @@ func TestDetectAccelFromCachedWithClassesByteMapping(t *testing.T) { // TestDetectAccelFromCachedWithClassesNilClasses verifies the nil byteClasses fallback. func TestDetectAccelFromCachedWithClassesNilClasses(t *testing.T) { - // Create a state with known transitions (stride=256, no compression) + // State no longer stores transitions — DetectAccelerationFromCachedWithClasses returns nil. + // Use DetectAccelerationFromFlat for flat table detection. 
state := NewState(StateID(1), []nfa.StateID{0}, false) - - // Fill 253 self-loop transitions - for i := 0; i < 253; i++ { - state.AddTransition(byte(i), StateID(1)) - } - // Add 3 exit transitions to a different state - state.AddTransition(253, StateID(2)) - state.AddTransition(254, StateID(2)) - state.AddTransition(255, StateID(2)) - - // nil byteClasses -> exit class indices ARE the bytes (identity) result := DetectAccelerationFromCachedWithClasses(state, nil) - if len(result) != 3 { - t.Fatalf("expected 3 exit bytes with nil classes, got %v", result) - } - expected := map[byte]bool{253: true, 254: true, 255: true} - for _, b := range result { - if !expected[b] { - t.Errorf("unexpected exit byte %d", b) - } + if result != nil { + t.Errorf("expected nil (State has no transitions), got %v", result) } } -// TestDetectAccelFromCachedInsufficientTransitions tests that when too few -// transitions are cached, acceleration detection returns nil. +// TestDetectAccelFromCachedInsufficientTransitions tests that State-based detection returns nil. func TestDetectAccelFromCachedInsufficientTransitions(t *testing.T) { state := NewState(StateID(1), []nfa.StateID{0}, false) - // Only add a few transitions (way below 94% threshold) - state.AddTransition(0, StateID(1)) - state.AddTransition(1, StateID(2)) - result := DetectAccelerationFromCachedWithClasses(state, nil) if result != nil { - t.Errorf("expected nil for insufficient cached transitions, got %v", result) + t.Errorf("expected nil (State has no transitions), got %v", result) } } -// TestDetectAccelFromCachedTooManyExitClasses tests that >3 exit classes returns nil. +// TestDetectAccelFromCachedTooManyExitClasses tests that State-based detection returns nil. 
func TestDetectAccelFromCachedTooManyExitClasses(t *testing.T) { state := NewState(StateID(1), []nfa.StateID{0}, false) - // Fill 250 self-loops - for i := 0; i < 250; i++ { - state.AddTransition(byte(i), StateID(1)) - } - // Add 4 distinct exit transitions (> 3 limit) - state.AddTransition(250, StateID(2)) - state.AddTransition(251, StateID(3)) - state.AddTransition(252, StateID(4)) - state.AddTransition(253, StateID(5)) - // Fill remaining with dead - state.AddTransition(254, DeadState) - state.AddTransition(255, DeadState) - result := DetectAccelerationFromCachedWithClasses(state, nil) if result != nil { - t.Errorf("expected nil for >3 exit classes, got %v", result) + t.Errorf("expected nil (State has no transitions), got %v", result) } } -// TestDetectAccelFromCachedZeroExitClasses tests that 0 exit classes returns nil. +// TestDetectAccelFromCachedZeroExitClasses tests that State-based detection returns nil. func TestDetectAccelFromCachedZeroExitClasses(t *testing.T) { state := NewState(StateID(1), []nfa.StateID{0}, false) - // All transitions are self-loops or dead - for i := 0; i < 256; i++ { - if i < 200 { - state.AddTransition(byte(i), StateID(1)) // self-loop - } else { - state.AddTransition(byte(i), DeadState) // dead - } - } - result := DetectAccelerationFromCachedWithClasses(state, nil) if result != nil { - t.Errorf("expected nil for 0 exit classes, got %v", result) + t.Errorf("expected nil (State has no transitions), got %v", result) } } diff --git a/dfa/lazy/builder.go b/dfa/lazy/builder.go index 5864a13..f528e87 100644 --- a/dfa/lazy/builder.go +++ b/dfa/lazy/builder.go @@ -213,12 +213,6 @@ func (b *Builder) epsilonClosure(states []nfa.StateID, lookHave LookSet) []nfa.S return closure.ToSlice() } -// move computes the set of NFA states reachable from the given states on input byte b. -// This version does not track word context - use moveWithWordContext for patterns with \b/\B. 
-func (b *Builder) move(states []nfa.StateID, input byte) []nfa.StateID { - return b.moveWithWordContext(states, input, false) -} - // moveWithWordContext computes the set of NFA states reachable from the given states on input byte b, // with full word boundary tracking. // @@ -605,33 +599,51 @@ func DetectAccelerationFromCached(state *State) []byte { // // When byteClasses is nil, falls back to identity mapping (no compression). func DetectAccelerationFromCachedWithClasses(state *State, byteClasses *nfa.ByteClasses) []byte { - if state == nil { - return nil - } + // State no longer stores transitions — they live in DFACache.flatTrans. + // This function cannot detect acceleration without the flat table. + // Use DetectAccelerationFromFlat() instead. + return nil +} - stride := state.Stride() - // Need most transitions cached to detect accurately - // For compressed alphabet, we need most of stride, not 240 - minCachedRequired := stride - stride/16 // At least ~94% cached +// DetectAccelerationFromFlat analyzes transitions from the flat table. +// Used by tryDetectAcceleration when State.transitions will be removed. +func DetectAccelerationFromFlat(sid StateID, flatTrans []StateID, stride int, byteClasses *nfa.ByteClasses) []byte { + ftLen := len(flatTrans) + return detectAccelFromTransitions(sid, stride, func(classIdx int) (StateID, bool) { + offset := safeOffset(sid, stride, classIdx) + if offset >= ftLen { + return InvalidState, false + } + next := flatTrans[offset] + return next, next != InvalidState + }, byteClasses) +} + +// detectAccelFromTransitions is the shared implementation for acceleration detection. +// transitionFn returns (nextID, cached) for a given class index. 
+func detectAccelFromTransitions(selfID StateID, stride int, transitionFn func(int) (StateID, bool), byteClasses *nfa.ByteClasses) []byte { + // Count cached transitions first + cachedCount := 0 + for classIdx := 0; classIdx < stride; classIdx++ { + if _, ok := transitionFn(classIdx); ok { + cachedCount++ + } + } + minCachedRequired := stride - stride/16 if minCachedRequired < 1 { minCachedRequired = 1 } - transitionCount := state.TransitionCount() - if transitionCount < minCachedRequired { + if cachedCount < minCachedRequired { return nil } - selfID := state.ID() var exitClasses []byte uncachedCount := 0 - // Scan only the CACHED transitions by equivalence class for classIdx := 0; classIdx < stride; classIdx++ { - nextID, ok := state.Transition(byte(classIdx)) + nextID, ok := transitionFn(classIdx) if !ok { - // Not cached yet - count as unknown uncachedCount++ - // Too many unknowns means we can't detect reliably maxUncached := stride / 16 if maxUncached < 1 { maxUncached = 1 @@ -642,16 +654,12 @@ func DetectAccelerationFromCachedWithClasses(state *State, byteClasses *nfa.Byte continue } - // Transition is cached if nextID == selfID || nextID == DeadState { - // Self-loop or dead - counts as "skip" continue } - // This class causes exit - record it exitClasses = append(exitClasses, byte(classIdx)) if len(exitClasses) > 3 { - // Too many exit classes - not accelerable return nil } } @@ -696,85 +704,10 @@ func DetectAccelerationFromCachedWithClasses(state *State, byteClasses *nfa.Byte // // Returns the exit bytes (1-3) or nil if not accelerable. 
func (b *Builder) DetectAcceleration(state *State) []byte { - if state == nil { - return nil - } - - byteClasses := b.nfa.ByteClasses() - selfID := state.ID() - var exitClasses []byte - stride := state.Stride() - - // Check all equivalence classes - for classIdx := 0; classIdx < stride; classIdx++ { - // Check if transition is already cached - nextID, ok := state.Transition(byte(classIdx)) - if !ok { - // Need to compute this transition - // Find a representative byte for this class to use with move() - var repByte byte - if byteClasses == nil { - repByte = byte(classIdx) - } else { - repByte = byte(classIdx) // Default to class index - for bi := 0; bi < 256; bi++ { - if byteClasses.Get(byte(bi)) == byte(classIdx) { - repByte = byte(bi) - break - } - } - } - - nextNFAStates := b.move(state.NFAStates(), repByte) - if len(nextNFAStates) == 0 { - // Dead state - counts as "skip" - continue - } - - // This leads to a non-dead state - it's an exit class - exitClasses = append(exitClasses, byte(classIdx)) - if len(exitClasses) > 3 { - // Too many exit classes - not accelerable - return nil - } - continue - } - - // Transition is cached - if nextID == selfID || nextID == DeadState { - // Self-loop or dead - counts as "skip" - continue - } - - // Transition to a different state - it's an exit class - exitClasses = append(exitClasses, byte(classIdx)) - if len(exitClasses) > 3 { - // Too many exit classes - not accelerable - return nil - } - } - - // Accelerable if we have 1-3 exit classes - if len(exitClasses) < 1 || len(exitClasses) > 3 { - return nil - } - - // Convert class indices back to representative bytes for memchr - if byteClasses == nil { - return exitClasses - } - - exitBytes := make([]byte, 0, len(exitClasses)) - for _, classIdx := range exitClasses { - for bi := 0; bi < 256; bi++ { - if byteClasses.Get(byte(bi)) == classIdx { - exitBytes = append(exitBytes, byte(bi)) - break - } - } - } - - return exitBytes + // State no longer stores transitions — they live 
in DFACache.flatTrans. + // This method cannot detect acceleration without the flat table. + // Use DetectAccelerationFromFlat() instead. + return nil } // checkHasWordBoundary checks if the NFA contains any word boundary assertions (\b or \B). diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go index 07672cd..46bc320 100644 --- a/dfa/lazy/lazy.go +++ b/dfa/lazy/lazy.go @@ -898,7 +898,7 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int start, end, matched := d.pikevm.SearchAt(haystack, startPos) return matched && start >= 0 && end >= start } - d.tryDetectAcceleration(currentState) + d.tryDetectAccelerationWithCache(currentState, cache) // State acceleration: if current state is accelerable, use SIMD to skip ahead if exitBytes := currentState.AccelExitBytes(); len(exitBytes) > 0 { @@ -1454,7 +1454,7 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n if currentState == nil { return d.nfaFallback(haystack, startPos) } - d.tryDetectAcceleration(currentState) + d.tryDetectAccelerationWithCache(currentState, cache) if exitBytes := currentState.AccelExitBytes(); len(exitBytes) > 0 { nextPos := d.accelerate(haystack, pos, exitBytes) @@ -1558,7 +1558,6 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro if len(nextNFAStates) == 0 { // Cache the dead state transition to avoid re-computation // Use classIdx for transition storage (compressed alphabet) - current.AddTransition(classIdx, DeadState) cache.SetFlatTransition(current.id, int(classIdx), DeadState) return nil, nil //nolint:nilnil // dead state is valid, not an error } @@ -1585,7 +1584,6 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro if existing, ok := cache.Get(key); ok { // Cache hit: reuse existing state // Use classIdx for transition storage (compressed alphabet) - current.AddTransition(classIdx, existing.ID()) cache.SetFlatTransition(current.id, int(classIdx), existing.ID()) return 
existing, nil } @@ -1624,7 +1622,6 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // Add transition from current state to new state // Use classIdx for transition storage (compressed alphabet) - current.AddTransition(classIdx, newState.ID()) cache.SetFlatTransition(current.id, int(classIdx), newState.ID()) return newState, nil @@ -1808,16 +1805,17 @@ func (d *DFA) matchesEmpty(cache *DFACache) bool { return matched && start == 0 && end == 0 } -// tryDetectAcceleration attempts lazy acceleration detection for a state. -// This is called when a state has enough cached transitions to detect reliably. -// It only runs once per state (tracked via AccelChecked flag). -func (d *DFA) tryDetectAcceleration(state *State) { +// tryDetectAccelerationWithCache attempts acceleration detection using flatTrans. +func (d *DFA) tryDetectAccelerationWithCache(state *State, cache *DFACache) { if state == nil || state.AccelChecked() { return } - // Try lazy detection from cached transitions with ByteClasses support - if exitBytes := DetectAccelerationFromCachedWithClasses(state, d.byteClasses); len(exitBytes) > 0 { + var exitBytes []byte + if cache != nil && cache.stride > 0 { + exitBytes = DetectAccelerationFromFlat(state.ID(), cache.flatTrans, cache.stride, d.byteClasses) + } + if len(exitBytes) > 0 { state.SetAccelBytes(exitBytes) } else { state.MarkAccelChecked() diff --git a/dfa/lazy/state.go b/dfa/lazy/state.go index ba6f70d..33edf96 100644 --- a/dfa/lazy/state.go +++ b/dfa/lazy/state.go @@ -53,14 +53,7 @@ type State struct { // id uniquely identifies this state in the cache id StateID - // transitions maps equivalence class → next state ID. - // The slice length equals the alphabet size (ByteClasses.AlphabetLen()). - // InvalidState means no transition for that equivalence class. 
- // Lookup: transitions[byteClasses.Get(byte)] - transitions []StateID - - // transitionCount tracks how many valid transitions exist (for statistics/debugging) - transitionCount int + // Note: transitions removed — stored in DFACache.flatTrans only. // isMatch indicates if this is an accepting state isMatch bool @@ -126,18 +119,13 @@ func NewStateWithStride(id StateID, nfaStates []nfa.StateID, isMatch bool, isFro nfaStatesCopy := make([]nfa.StateID, len(nfaStates)) copy(nfaStatesCopy, nfaStates) - // Create transitions slice initialized to InvalidState - transitions := make([]StateID, stride) - for i := range transitions { - transitions[i] = InvalidState - } - + // Note: transitions stored in DFACache.flatTrans (single source of truth). + // State struct keeps only metadata. return &State{ - id: id, - transitions: transitions, - isMatch: isMatch, - isFromWord: isFromWord, - nfaStates: nfaStatesCopy, + id: id, + isMatch: isMatch, + isFromWord: isFromWord, + nfaStates: nfaStatesCopy, } } @@ -157,20 +145,6 @@ func (s *State) IsFromWord() bool { return s.isFromWord } -// Transition returns the next state for the given equivalence class index. -// Returns (InvalidState, false) if no transition exists. -// This is the hot path - O(1) slice lookup. -// -// IMPORTANT: The caller must convert the input byte to an equivalence class -// index via byteClasses.Get(byte) before calling this method. -func (s *State) Transition(classIdx byte) (StateID, bool) { - if int(classIdx) >= len(s.transitions) { - return InvalidState, false - } - next := s.transitions[classIdx] - return next, next != InvalidState -} - // checkWordBoundaryFast checks if consuming byte b would produce a match // via word boundary resolution. Uses pre-computed flags — O(1), no allocation. 
// Replaces the expensive checkWordBoundaryMatch (30% CPU) which created Builder @@ -186,42 +160,15 @@ func (s *State) checkWordBoundaryFast(b byte) bool { return s.matchAtNonWordBoundary } -// AddTransition adds a transition from this state to another on equivalence class classIdx. -// Overwrites any existing transition for this class. -// -// IMPORTANT: The caller must convert the input byte to an equivalence class -// index via byteClasses.Get(byte) before calling this method. -func (s *State) AddTransition(classIdx byte, next StateID) { - if int(classIdx) >= len(s.transitions) { - return // Ignore out-of-bounds (shouldn't happen with correct stride) - } - if s.transitions[classIdx] == InvalidState && next != InvalidState { - s.transitionCount++ - } else if s.transitions[classIdx] != InvalidState && next == InvalidState { - s.transitionCount-- - } - s.transitions[classIdx] = next -} - -// Stride returns the alphabet size (number of equivalence classes). -func (s *State) Stride() int { - return len(s.transitions) -} - // NFAStates returns the NFA states represented by this DFA state func (s *State) NFAStates() []nfa.StateID { return s.nfaStates } -// TransitionCount returns the number of valid transitions from this state -func (s *State) TransitionCount() int { - return s.transitionCount -} - // String returns a human-readable representation of the state func (s *State) String() string { - return fmt.Sprintf("DFAState(id=%d, isMatch=%v, transitions=%d, nfaStates=%v)", - s.id, s.isMatch, s.transitionCount, s.nfaStates) + return fmt.Sprintf("DFAState(id=%d, isMatch=%v, nfaStates=%v)", + s.id, s.isMatch, s.nfaStates) } // IsAccelerable returns true if this state can use SIMD acceleration. 
diff --git a/dfa/lazy/state_set_test.go b/dfa/lazy/state_set_test.go index abd5622..06eae76 100644 --- a/dfa/lazy/state_set_test.go +++ b/dfa/lazy/state_set_test.go @@ -383,100 +383,8 @@ func TestStateCreation(t *testing.T) { } } -func TestStateTransitions(t *testing.T) { - state := NewState(StateID(1), []nfa.StateID{0}, false) - - // Initially no valid transitions - if state.TransitionCount() != 0 { - t.Errorf("Initial TransitionCount() = %d, want 0", state.TransitionCount()) - } - - // All transitions should be InvalidState - for i := 0; i < 256; i++ { - next, ok := state.Transition(byte(i)) - if ok { - t.Errorf("Transition(%d) should be invalid, got %d", i, next) - } - if next != InvalidState { - t.Errorf("Transition(%d) = %d, want InvalidState", i, next) - } - } - - // Add some transitions - state.AddTransition(byte('a'), StateID(2)) - state.AddTransition(byte('b'), StateID(3)) - - if state.TransitionCount() != 2 { - t.Errorf("TransitionCount() = %d, want 2", state.TransitionCount()) - } - - next, ok := state.Transition(byte('a')) - if !ok || next != StateID(2) { - t.Errorf("Transition('a') = (%d, %v), want (2, true)", next, ok) - } - - next, ok = state.Transition(byte('b')) - if !ok || next != StateID(3) { - t.Errorf("Transition('b') = (%d, %v), want (3, true)", next, ok) - } - - // Overwrite transition - state.AddTransition(byte('a'), StateID(5)) - next, ok = state.Transition(byte('a')) - if !ok || next != StateID(5) { - t.Errorf("After overwrite, Transition('a') = (%d, %v), want (5, true)", next, ok) - } - if state.TransitionCount() != 2 { - t.Errorf("TransitionCount() = %d, want 2 after overwrite", state.TransitionCount()) - } - - // Remove transition by setting to InvalidState - state.AddTransition(byte('a'), InvalidState) - _, ok = state.Transition(byte('a')) - if ok { - t.Error("Transition('a') should be invalid after removal") - } - if state.TransitionCount() != 1 { - t.Errorf("TransitionCount() = %d, want 1 after removal", state.TransitionCount()) - } 
-} - -func TestStateTransitionOutOfBounds(t *testing.T) { - // Create state with small stride - state := NewStateWithStride(StateID(1), []nfa.StateID{0}, false, false, 4) - - // Transition beyond stride should return InvalidState - next, ok := state.Transition(byte(10)) - if ok { - t.Errorf("Transition beyond stride should be invalid, got %d", next) - } - - // AddTransition beyond stride should be ignored (no panic) - state.AddTransition(byte(10), StateID(5)) - if state.TransitionCount() != 0 { - t.Error("AddTransition beyond stride should be ignored") - } -} - -func TestStateStride(t *testing.T) { - tests := []struct { - name string - stride int - }{ - {name: "default", stride: 256}, - {name: "small", stride: 4}, - {name: "medium", stride: 64}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - state := NewStateWithStride(StateID(1), []nfa.StateID{0}, false, false, tt.stride) - if state.Stride() != tt.stride { - t.Errorf("Stride() = %d, want %d", state.Stride(), tt.stride) - } - }) - } -} +// Note: TestStateTransitions, TestStateTransitionOutOfBounds, TestStateStride +// removed — transitions now stored in DFACache.flatTrans, not in State struct. func TestStateString(t *testing.T) { state := NewState(StateID(5), []nfa.StateID{1, 2, 3}, true) From a000ab0570f460e64428a6d83ff015f527114c02 Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 18:06:54 +0300 Subject: [PATCH 2/7] =?UTF-8?q?perf:=20Rust-aligned=20BT=20visited=20limit?= =?UTF-8?q?=20for=20UseNFA=20=E2=80=94=2072%=20less=20memory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add NewBoundedBacktrackerSmall() with 128K entries (256KB) visited capacity, matching Rust regex's default visited_capacity. UseNFA path now creates BT with small limit. When haystack exceeds BT capacity, falls back to PikeVM (correct for leftmost-first). UseBoundedBacktracker strategy retains 32M limit for POSIX longest-match. 
LangArena LogParser (7MB log, 13 patterns, 10 iterations): - Total alloc: 89MB -> 25MB (-72%) - RSS (Sys): 353MB -> 41MB (-88%) - errors pattern: 66MB -> 2.4MB (-96%) - Speed: no regression (113-126ms per iter) --- CHANGELOG.md | 11 +++++++++-- meta/compile.go | 5 +++-- nfa/backtrack.go | 29 ++++++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 710a167..9cf2bba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,10 +15,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.12.19] - 2026-03-24 ### Performance +- **Rust-aligned BoundedBacktracker visited limit for UseNFA** — reduced visited + table capacity from 32M entries (64MB) to 128K entries (256KB) for UseNFA paths, + matching Rust regex's `visited_capacity` default. On Kostya's LangArena LogParser + (7MB log, 13 patterns): total alloc **89MB → 25MB** (-72%), RSS **353MB → 41MB** + (-88%). `errors` pattern: **66MB → 2.4MB** (-96%). No speed regression. + `UseBoundedBacktracker` strategy retains full 32M limit for POSIX longest-match + correctness (Go stdlib compatibility). + - **Remove dual transition storage** — eliminated `transitions []StateID` and `transitionCount` from `State` struct. Transitions now stored exclusively in - `DFACache.flatTrans`. Removes ~93MB of redundant per-state transition slices - (222MB → ~150MB). Acceleration detection migrated to `DetectAccelerationFromFlat()` + `DFACache.flatTrans`. Acceleration detection migrated to `DetectAccelerationFromFlat()` reading directly from flat table. ## [0.12.18] - 2026-03-24 diff --git a/meta/compile.go b/meta/compile.go index 0ff9384..abff531 100644 --- a/meta/compile.go +++ b/meta/compile.go @@ -365,9 +365,10 @@ func buildCharClassSearchers( // For UseNFA with small NFAs, also create BoundedBacktracker as fallback. 
// BoundedBacktracker is 2-3x faster than PikeVM on small inputs due to // generation-based visited tracking (O(1) reset) vs PikeVM's thread queues. - // This is similar to how stdlib uses backtracking for simple patterns. + // Use small capacity (256KB like Rust) — for UseNFA, BT is optional; + // PikeVM handles large inputs correctly. This prevents 37MB+ visited allocations. if result.finalStrategy == UseNFA && result.boundedBT == nil && nfaEngine.States() < 50 { - result.boundedBT = nfa.NewBoundedBacktracker(btNFA) + result.boundedBT = nfa.NewBoundedBacktrackerSmall(btNFA) } return result diff --git a/nfa/backtrack.go b/nfa/backtrack.go index d8b8352..85ed3a9 100644 --- a/nfa/backtrack.go +++ b/nfa/backtrack.go @@ -67,14 +67,37 @@ type BacktrackerState struct { } // NewBoundedBacktracker creates a new bounded backtracker for the given NFA. -// Default maxVisitedSize is 32M entries (32MB memory with uint8), allowing +// Default maxVisitedSize is 32M entries (64MB memory with uint16), allowing // ~900KB inputs for patterns with 35 states like (\w{2,8})+. -// Uses uint8 generation tracking (4x memory savings vs uint32, O(1) reset). +// +// This large limit is required for UseBoundedBacktracker strategy where BT +// is the primary engine with leftmost-longest semantics. PikeVM fallback +// gives leftmost-first results which would break correctness. +// +// For UseNFA strategy (where BT is optional), use NewBoundedBacktrackerSmall. func NewBoundedBacktracker(nfa *NFA) *BoundedBacktracker { return &BoundedBacktracker{ nfa: nfa, numStates: nfa.States(), - maxVisitedSize: 32 * 1024 * 1024, // 32M entries = 64MB memory (2 bytes per entry) + maxVisitedSize: 32 * 1024 * 1024, // 32M entries = 64MB (unchanged for BT strategy) + } +} + +// NewBoundedBacktrackerSmall creates a BoundedBacktracker with Rust-aligned +// visited capacity (128K entries = 256KB). Use for UseNFA paths where BT +// is an optional optimization and PikeVM is the correct fallback. 
+// +// This prevents massive visited table allocations (37MB+) for patterns like +// ` [5][0-9]{2} | [4][0-9]{2} ` on large inputs. When BT can't handle +// the input size, the caller falls back to PikeVM which is O(n*states) +// memory per step, not O(n*states) total. +// +// Matches Rust regex's default visited_capacity of 256KB. +func NewBoundedBacktrackerSmall(nfa *NFA) *BoundedBacktracker { + return &BoundedBacktracker{ + nfa: nfa, + numStates: nfa.States(), + maxVisitedSize: 128 * 1024, // 128K entries × 2 bytes = 256KB (Rust default) } } From d0ad9aa5a5f04b64db46321b1d1b3024519d0ed9 Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 18:37:56 +0300 Subject: [PATCH 3/7] =?UTF-8?q?perf:=20byte-based=20DFA=20cache=20limit=20?= =?UTF-8?q?=E2=80=94=202MB=20default=20like=20Rust?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace MaxStates (count) with CacheCapacityBytes (bytes). Default: 2MB matching Rust regex's hybrid_cache_capacity. - Add DFACache.MemoryUsage() (mirrors Rust Cache::memory_usage) - Insert checks MemoryUsage() >= capacityBytes instead of state count - Config: CacheCapacityBytes (new), MaxStates (deprecated, backward compat) - Self-adjusting: fewer states for large stride, more for small - effectiveCapacityBytes() bridges legacy MaxStates to bytes (~100B/state) --- CHANGELOG.md | 5 +++ dfa/lazy/cache.go | 52 +++++++++++++++++++----- dfa/lazy/cache_test.go | 46 +++++++++++++-------- dfa/lazy/config.go | 71 +++++++++++++++++++++++++-------- dfa/lazy/coverage_final_test.go | 5 ++- dfa/lazy/lazy.go | 21 +++++----- 6 files changed, 145 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cf2bba..9866ec2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `UseBoundedBacktracker` strategy retains full 32M limit for POSIX longest-match correctness (Go stdlib compatibility). 
+- **Byte-based DFA cache limit** (Rust approach) — replaced `MaxStates` count limit + with `CacheCapacityBytes` (default 2MB, matching Rust's `hybrid_cache_capacity`). + Cache limit is now self-adjusting: fewer states for large alphabets, more for small. + Added `MemoryUsage()` method for runtime cache introspection. + - **Remove dual transition storage** — eliminated `transitions []StateID` and `transitionCount` from `State` struct. Transitions now stored exclusively in `DFACache.flatTrans`. Acceleration detection migrated to `DetectAccelerationFromFlat()` diff --git a/dfa/lazy/cache.go b/dfa/lazy/cache.go index 4044111..d8b277e 100644 --- a/dfa/lazy/cache.go +++ b/dfa/lazy/cache.go @@ -1,8 +1,6 @@ package lazy -import ( - "github.com/coregx/coregex/internal/conv" -) +// DFACache uses byte-based capacity (like Rust's cache_capacity). // DFACache holds mutable state for DFA search operations. // @@ -15,7 +13,7 @@ import ( // the DFA configuration is immutable and per-thread cache is mutable. // // The cache maps StateKey (NFA state set hash) -> DFA State. -// When the cache reaches maxStates, it can be cleared and rebuilt +// When the cache reaches capacityBytes, it can be cleared and rebuilt // (up to a configured limit) before falling back to NFA. // // Thread safety: NOT thread-safe. Each DFACache must be owned by a single @@ -59,8 +57,10 @@ type DFACache struct { // startTable caches start states for different look-behind contexts. startTable StartTable - // maxStates is the capacity limit - maxStates uint32 + // capacityBytes is the maximum cache memory in bytes (Rust approach). + // When MemoryUsage() exceeds this, Insert returns ErrCacheFull. + // Default: 2MB (matches Rust regex's hybrid_cache_capacity). + capacityBytes int // nextID is the next available state ID. 
nextID StateID @@ -93,8 +93,8 @@ func (c *DFACache) Insert(key StateKey, state *State) (StateID, error) { return existing.ID(), nil } - // Check capacity - if conv.IntToUint32(len(c.states)) >= c.maxStates { + // Check capacity (byte-based, like Rust's cache_capacity) + if c.MemoryUsage() >= c.capacityBytes { c.misses++ return InvalidState, ErrCacheFull } @@ -206,9 +206,39 @@ func (c *DFACache) Size() int { return len(c.states) } -// IsFull returns true if the cache has reached its maximum capacity. +// MemoryUsage returns the estimated heap memory used by this cache in bytes. +// Mirrors Rust's Cache::memory_usage() (hybrid/dfa.rs:2021). +// +// Components: +// - flatTrans: len * 4 bytes (StateID = uint32) +// - stateList: len * 8 bytes (pointer) +// - matchFlags: len * 1 byte +// - states map: ~len * 48 bytes (key + pointer + map overhead) +// - State heap: nfaStates slices + accelBytes +func (c *DFACache) MemoryUsage() int { + const stateIDSize = 4 // uint32 + const ptrSize = 8 // pointer on 64-bit + const mapEntrySize = 48 // approximate: key(8) + value(8) + map overhead(32) + + usage := len(c.flatTrans) * stateIDSize + usage += len(c.stateList) * ptrSize + usage += len(c.matchFlags) + usage += len(c.states) * mapEntrySize + + // State struct heap: nfaStates slice per state + for _, s := range c.stateList { + if s != nil { + usage += len(s.NFAStates()) * 4 // nfa.StateID = uint32 + usage += len(s.AccelExitBytes()) + } + } + + return usage +} + +// IsFull returns true if the cache has reached its capacity. func (c *DFACache) IsFull() bool { - return conv.IntToUint32(len(c.states)) >= c.maxStates + return c.MemoryUsage() >= c.capacityBytes } // Stats returns cache hit/miss statistics. @@ -237,7 +267,7 @@ func (c *DFACache) ResetStats() { // This also resets the clear counter. Primarily for testing. 
func (c *DFACache) Clear() { // Clear map (GC will reclaim memory) - c.states = make(map[StateKey]*State, c.maxStates) + c.states = make(map[StateKey]*State) c.stateList = c.stateList[:0] c.startTable = newStartTableFromByteMap(&c.startTable.byteMap) c.nextID = StartState + 1 diff --git a/dfa/lazy/cache_test.go b/dfa/lazy/cache_test.go index 8138676..4cecb8b 100644 --- a/dfa/lazy/cache_test.go +++ b/dfa/lazy/cache_test.go @@ -8,15 +8,23 @@ import ( ) // newTestCache creates a DFACache for testing without needing a DFA. +// maxStates is converted to a byte limit (~52 bytes per state for stride=0 test caches). func newTestCache(maxStates uint32) *DFACache { var byteMap [256]StartKind initByteMap(&byteMap) + // Each state in a stride=0 test cache uses ~60 bytes: + // map entry(48) + stateList ptr(8) + nfaStates heap(~4) = ~60 + // Use 52 to be slightly conservative (ensure IsFull after N inserts) + capacityBytes := int(maxStates) * 52 + if capacityBytes == 0 { + capacityBytes = DefaultCacheCapacity + } return &DFACache{ - states: make(map[StateKey]*State, maxStates), - stateList: make([]*State, 0, maxStates), - startTable: newStartTableFromByteMap(&byteMap), - maxStates: maxStates, - nextID: StartState + 1, + states: make(map[StateKey]*State, maxStates), + stateList: make([]*State, 0, maxStates), + startTable: newStartTableFromByteMap(&byteMap), + capacityBytes: capacityBytes, + nextID: StartState + 1, } } @@ -116,9 +124,9 @@ func TestCacheInsertDuplicate(t *testing.T) { } func TestCacheIsFull(t *testing.T) { - c := newTestCache(3) + c := newTestCache(100) // Start with large capacity - // Insert up to capacity + // Insert 3 states for i := nfa.StateID(0); i < 3; i++ { nfaStates := []nfa.StateID{i} key := ComputeStateKey(nfaStates) @@ -129,9 +137,10 @@ func TestCacheIsFull(t *testing.T) { } } - // Cache should be full + // Set capacity to current usage — should be full + c.capacityBytes = c.MemoryUsage() if !c.IsFull() { - t.Error("Cache should be full after inserting 
maxStates items") + t.Error("Cache should be full when capacity == usage") } // Next insert should fail with ErrCacheFull @@ -178,9 +187,9 @@ func TestCacheGetOrInsert(t *testing.T) { } func TestCacheGetOrInsertFull(t *testing.T) { - c := newTestCache(1) + c := newTestCache(100) // Start with large capacity - // Fill the cache + // Insert one state nfaStates := []nfa.StateID{1} key := ComputeStateKey(nfaStates) state := NewState(InvalidState, nfaStates, false) @@ -189,6 +198,9 @@ func TestCacheGetOrInsertFull(t *testing.T) { t.Fatalf("GetOrInsert failed: %v", err) } + // Set capacity to current usage — full + c.capacityBytes = c.MemoryUsage() + // Next GetOrInsert with new key should fail nfaStates2 := []nfa.StateID{2} key2 := ComputeStateKey(nfaStates2) @@ -354,8 +366,8 @@ func TestCacheResetClearCount(t *testing.T) { } func TestCacheCapacityBoundary(t *testing.T) { - // Test with capacity of 1 - c := newTestCache(1) + // Create cache, insert one state, measure usage, then set capacity to that + c := newTestCache(100) nfaStates := []nfa.StateID{1} key := ComputeStateKey(nfaStates) @@ -364,11 +376,13 @@ func TestCacheCapacityBoundary(t *testing.T) { // First insert should succeed _, err := c.Insert(key, state) if err != nil { - t.Fatalf("Insert on capacity-1 cache failed: %v", err) + t.Fatalf("Insert failed: %v", err) } + // Now set capacity to exactly the current usage — should be full + c.capacityBytes = c.MemoryUsage() if !c.IsFull() { - t.Error("Capacity-1 cache should be full after 1 insert") + t.Error("Cache should be full when capacity == usage") } // Second insert with different key should fail @@ -377,7 +391,7 @@ func TestCacheCapacityBoundary(t *testing.T) { state2 := NewState(InvalidState, nfaStates2, false) _, err = c.Insert(key2, state2) if !errors.Is(err, ErrCacheFull) { - t.Errorf("Second insert on capacity-1 cache: got %v, want ErrCacheFull", err) + t.Errorf("Insert on full cache: got %v, want ErrCacheFull", err) } } diff --git a/dfa/lazy/config.go 
b/dfa/lazy/config.go index 0df0c57..31901d8 100644 --- a/dfa/lazy/config.go +++ b/dfa/lazy/config.go @@ -5,18 +5,28 @@ package lazy // The configuration allows tuning the trade-off between memory usage and // performance. Larger caches provide better hit rates but consume more memory. type Config struct { - // MaxStates is the maximum number of DFA states to cache. - // When this limit is reached, the DFA clears the cache and continues - // DFA search (up to MaxCacheClears times per search), then falls back - // to NFA execution if the clear limit is exceeded. + // CacheCapacityBytes is the maximum memory (in bytes) that the DFA cache + // may use for transition tables, state storage, and metadata. // - // Default: 10,000 states (~1MB with 256-byte transition tables) - // Memory usage: ~100-200 bytes per state (depending on transitions) + // When MemoryUsage() exceeds this limit, the cache is considered full. + // The DFA will then clear the cache (up to MaxCacheClears times) and + // rebuild states on demand, or fall back to NFA. + // + // Default: 2MB (2 * 1024 * 1024), matching Rust regex's hybrid_cache_capacity. // // Tuning guidelines: - // - Simple patterns: 100-1,000 states sufficient - // - Complex patterns: 10,000-100,000 states - // - Memory-constrained: 1,000 states (~100KB) + // - Simple patterns: 256KB-1MB sufficient + // - Complex patterns or large alphabets: 2MB-10MB + // - Memory-constrained environments: 256KB + // - Performance-critical with complex patterns: 10MB-100MB + CacheCapacityBytes int + + // MaxStates is a legacy limit on the number of DFA states. + // When CacheCapacityBytes > 0, MaxStates is ignored. + // When CacheCapacityBytes == 0 and MaxStates > 0, + // an approximate byte limit is computed from MaxStates. + // + // Deprecated: Use CacheCapacityBytes instead. 
MaxStates uint32 // MaxCacheClears is the maximum number of times the DFA cache can be @@ -71,20 +81,24 @@ type Config struct { DeterminizationLimit int } +// DefaultCacheCapacity is the default DFA cache capacity in bytes. +// Matches Rust regex's hybrid_cache_capacity: 2 * (1 << 20) = 2MB. +const DefaultCacheCapacity = 2 * 1024 * 1024 + // DefaultConfig returns a configuration with sensible defaults. // // These defaults are tuned for general-purpose regex matching: -// - Balance memory usage (~1MB) with performance +// - Cache capacity: 2MB (matches Rust regex default) // - Enable prefilter for maximum speedup // - Prevent exponential state explosion // // For specific use cases, tune the parameters: -// - Memory-constrained: reduce MaxStates to 1,000 -// - Performance-critical: increase MaxStates to 100,000 +// - Memory-constrained: reduce CacheCapacityBytes to 256KB +// - Performance-critical: increase CacheCapacityBytes to 10MB // - Complex patterns: increase DeterminizationLimit func DefaultConfig() Config { return Config{ - MaxStates: 10_000, + CacheCapacityBytes: DefaultCacheCapacity, MaxCacheClears: 5, // Allow 5 cache clears before NFA fallback CacheHitThreshold: 0.0, // Disabled by default UsePrefilter: true, @@ -93,13 +107,27 @@ func DefaultConfig() Config { } } +// effectiveCapacityBytes returns the cache capacity in bytes. +// Uses CacheCapacityBytes if set, otherwise derives from legacy MaxStates. +func (c *Config) effectiveCapacityBytes() int { + if c.CacheCapacityBytes > 0 { + return c.CacheCapacityBytes + } + if c.MaxStates > 0 { + // Legacy: approximate bytes from state count. + // Each state uses ~100 bytes (flatTrans row + map entry + State struct). + return int(c.MaxStates) * 100 + } + return DefaultCacheCapacity +} + // Validate checks if the configuration is valid. // Returns an error if any parameter is out of acceptable range. 
func (c *Config) Validate() error { - if c.MaxStates == 0 { + if c.CacheCapacityBytes == 0 && c.MaxStates == 0 { return &DFAError{ Kind: InvalidConfig, - Message: "MaxStates must be > 0", + Message: "CacheCapacityBytes or MaxStates must be > 0", } } @@ -134,9 +162,20 @@ func (c *Config) Validate() error { return nil } -// WithMaxStates returns a new config with the specified max states +// WithCacheCapacity returns a new config with the specified cache capacity in bytes. +// Default is 2MB (matching Rust regex). Set to 0 to use MaxStates instead. +func (c Config) WithCacheCapacity(bytes int) Config { + c.CacheCapacityBytes = bytes + return c +} + +// WithMaxStates returns a new config with the specified max states. +// +// Deprecated: Use WithCacheCapacity instead. func (c Config) WithMaxStates(maxStates uint32) Config { c.MaxStates = maxStates + // Clear byte limit so legacy MaxStates takes effect + c.CacheCapacityBytes = 0 return c } diff --git a/dfa/lazy/coverage_final_test.go b/dfa/lazy/coverage_final_test.go index a35be87..7251f9f 100644 --- a/dfa/lazy/coverage_final_test.go +++ b/dfa/lazy/coverage_final_test.go @@ -373,12 +373,13 @@ func TestDetectAccelerationNilBuilder(t *testing.T) { // TestConfigValidateEdgeCases tests config validation edge cases. func TestConfigValidateEdgeCases(t *testing.T) { - // Invalid: zero max states + // Invalid: zero capacity and zero max states cfg := DefaultConfig() + cfg.CacheCapacityBytes = 0 cfg.MaxStates = 0 err := cfg.Validate() if err == nil { - t.Error("expected error for zero MaxStates") + t.Error("expected error for zero CacheCapacityBytes and MaxStates") } // Invalid: zero determinization limit diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go index 46bc320..5dfc23f 100644 --- a/dfa/lazy/lazy.go +++ b/dfa/lazy/lazy.go @@ -99,7 +99,7 @@ type DFA struct { // across searches via Reset(), or pooled via sync.Pool in the meta layer. 
// // The cache is initialized with: -// - A state map sized to config.MaxStates +// - A state map (grows on demand up to CacheCapacityBytes) // - A stateList for O(1) state-by-ID lookup // - A StartTable with the DFA's immutable byteMap func (d *DFA) NewCache() *DFACache { @@ -108,14 +108,14 @@ func (d *DFA) NewCache() *DFACache { const initCap = 64 stride := d.AlphabetLen() return &DFACache{ - states: make(map[StateKey]*State, initCap), - stateList: make([]*State, 0, initCap), - flatTrans: make([]StateID, 0, initCap*stride), - matchFlags: make([]bool, 0, initCap), - stride: stride, - startTable: newStartTableFromByteMap(&d.startByteMap), - maxStates: d.config.MaxStates, - nextID: StartState + 1, + states: make(map[StateKey]*State, initCap), + stateList: make([]*State, 0, initCap), + flatTrans: make([]StateID, 0, initCap*stride), + matchFlags: make([]bool, 0, initCap), + stride: stride, + startTable: newStartTableFromByteMap(&d.startByteMap), + capacityBytes: d.config.effectiveCapacityBytes(), + nextID: StartState + 1, } } @@ -1861,9 +1861,10 @@ func (d *DFA) accelerate(haystack []byte, pos int, exitBytes []byte) int { // Useful for performance tuning and diagnostics. // // Returns (size, capacity, hits, misses, hitRate). +// capacity is the cache limit in bytes. func (d *DFA) CacheStats(cache *DFACache) (size int, capacity uint32, hits, misses uint64, hitRate float64) { size = cache.Size() - capacity = d.config.MaxStates + capacity = uint32(cache.capacityBytes) hits, misses, hitRate = cache.Stats() return } From d68d59f5eb7e1048d56b787380d4a40c1ef01ecf Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:02:02 +0300 Subject: [PATCH 4/7] =?UTF-8?q?wip:=20SlotTable-based=20capture=20search?= =?UTF-8?q?=20=E2=80=94=20greedy=20loop=20capture=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SearchWithSlotTableCapturesAt now uses SlotTable instead of legacy COW. 
Works for simple patterns like (foo)(bar), but greedy repetitions (a+)(b+) lose group start positions during loop iterations. Root cause: addSearchThread CopySlots overwrites capture slots on each loop iteration. Need stack-based epsilon closure with RestoreCapture frames (Rust approach) to preserve capture context through loops. TODO: Convert recursive addSearchThread to stack-based with save/restore Status: 2 NFA unit test failures, all meta tests pass (meta still on COW) --- nfa/pikevm.go | 273 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 254 insertions(+), 19 deletions(-) diff --git a/nfa/pikevm.go b/nfa/pikevm.go index 22f6b9e..a3d790e 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -2009,30 +2009,265 @@ func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystac } // SearchWithSlotTableCaptures finds the first match and returns captures. -// -// NOTE: This method currently delegates to the legacy SearchWithCapturesAt -// because per-state SlotTable storage doesn't correctly track per-thread -// capture paths. The SlotTable architecture is designed for Find/IsMatch -// modes where captures are not needed. -// -// Future optimization: Implement a proper thread-indexed slot table similar -// to Rust's pikevm.rs Slots structure. -// -// Returns nil if no match found. +// Uses zero-allocation SlotTable architecture (Rust approach). func (p *PikeVM) SearchWithSlotTableCaptures(haystack []byte) *MatchWithCaptures { return p.SearchWithSlotTableCapturesAt(haystack, 0) } // SearchWithSlotTableCapturesAt finds the first match with captures starting from 'at'. +// Uses SlotTable for zero-allocation capture tracking (Rust approach). // -// NOTE: Currently delegates to legacy SearchWithCapturesAt for correct capture tracking. -// See SearchWithSlotTableCaptures for details. 
+// SlotTable per-state storage works correctly because the Visited sparse set +// guarantees each NFA state is visited at most once per generation — the same +// invariant that makes Rust's SlotTable correct. func (p *PikeVM) SearchWithSlotTableCapturesAt(haystack []byte, at int) *MatchWithCaptures { - // Delegate to the legacy capture implementation which correctly tracks - // per-thread capture positions using COW semantics. - // - // The SlotTable per-state architecture cannot correctly track captures - // because multiple threads can pass through the same state with different - // capture positions. A proper implementation would need thread-indexed slots. - return p.SearchWithCapturesAt(haystack, at) + if at > len(haystack) { + return nil + } + + // Configure SlotTable for full capture mode + totalSlots := p.nfa.CaptureCount() * 2 + p.internalState.SlotTable.SetActiveSlots(totalSlots) + + // Handle edge cases + if at == len(haystack) { + if p.matchesEmptyAt(haystack, at) { + return &MatchWithCaptures{ + Start: at, + End: at, + Captures: [][]int{{at, at}}, + } + } + return nil + } + if len(haystack) == 0 { + if p.matchesEmpty() { + return &MatchWithCaptures{ + Start: 0, + End: 0, + Captures: [][]int{{0, 0}}, + } + } + return nil + } + + if p.nfa.IsAnchored() { + return p.searchWithSlotTableCapturesAnchored(haystack, at) + } + return p.searchWithSlotTableCapturesUnanchored(haystack, at) +} + +// searchWithSlotTableCapturesUnanchored implements unanchored search with captures. +// Captures stored in SlotTable per-state, saved to bestSlots on match. 
+// +//nolint:gocognit +func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt int) *MatchWithCaptures { + st := &p.internalState + st.SearchQueue = st.SearchQueue[:0] + st.SearchNextQueue = st.SearchNextQueue[:0] + st.Visited.Clear() + st.SlotTable.Reset() + + totalSlots := st.SlotTable.ActiveSlots() + bestStart := -1 + bestEnd := -1 + // bestSlots stores capture slots for the best match found so far + var bestSlots []int + + for pos := startAt; pos <= len(haystack); pos++ { + if bestStart == -1 { + // Skip-ahead + if len(st.SearchQueue) == 0 && p.skipAhead != nil && pos > startAt { + candidate := p.skipAhead.Find(haystack, pos) + if candidate == -1 { + break + } + pos = candidate + } + st.Visited.Clear() + // Initialize slots for start state to all -1 (AllAbsent scratch) + absentSlots := st.SlotTable.AllAbsent() + for i := range absentSlots { + absentSlots[i] = -1 + } + startSid := p.nfa.StartAnchored() + // Copy absent slots to start state + startSlots := st.SlotTable.ForState(startSid) + if startSlots != nil { + copy(startSlots, absentSlots) + } + p.addSearchThread(searchThread{state: startSid, startPos: pos}, haystack, pos) + } + + if pos < len(haystack) { + b := haystack[pos] + st.Visited.Clear() + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if p.isBetterMatch(bestStart, bestEnd, t.startPos, pos) { + bestStart = t.startPos + bestEnd = pos + // Save capture slots for this match + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + if !st.Longest { + break + } + continue + } + p.stepSearchThread(t, b, haystack, pos+1) + } + } else { + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if p.isBetterMatch(bestStart, bestEnd, t.startPos, pos) { + bestStart = t.startPos + bestEnd = pos + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && 
totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + break + } + } + } + + if pos >= len(haystack) { + break + } + + if bestStart != -1 { + hasLeftmostCandidate := false + for _, t := range st.SearchNextQueue { + if t.startPos <= bestStart { + hasLeftmostCandidate = true + break + } + } + if !hasLeftmostCandidate { + break + } + } + + st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + } + + if bestStart == -1 { + return nil + } + return p.buildCapturesFromSlots(bestSlots, bestStart, bestEnd) +} + +// searchWithSlotTableCapturesAnchored implements anchored search with captures. +func (p *PikeVM) searchWithSlotTableCapturesAnchored(haystack []byte, startPos int) *MatchWithCaptures { + st := &p.internalState + st.SearchQueue = st.SearchQueue[:0] + st.SearchNextQueue = st.SearchNextQueue[:0] + st.Visited.Clear() + st.SlotTable.Reset() + + totalSlots := st.SlotTable.ActiveSlots() + + // Initialize start state slots + startSid := p.nfa.StartAnchored() + startSlots := st.SlotTable.ForState(startSid) + if startSlots != nil { + for i := range startSlots { + startSlots[i] = -1 + } + } + + p.addSearchThread(searchThread{state: startSid, startPos: startPos}, haystack, startPos) + + lastMatchPos := -1 + var bestSlots []int + + for pos := startPos; pos <= len(haystack); pos++ { + if pos < len(haystack) { + b := haystack[pos] + st.Visited.Clear() + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if pos > lastMatchPos || lastMatchPos == -1 { + lastMatchPos = pos + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + if !st.Longest { + break + } + continue + } + p.stepSearchThread(t, b, haystack, pos+1) + } + } else { + for _, t := range st.SearchQueue { + if p.nfa.IsMatch(t.state) { + if pos > lastMatchPos || lastMatchPos == -1 
{ + lastMatchPos = pos + matchSlots := st.SlotTable.ForState(t.state) + if matchSlots != nil && totalSlots > 0 { + if bestSlots == nil { + bestSlots = make([]int, totalSlots) + } + copy(bestSlots, matchSlots) + } + } + break + } + } + } + + if len(st.SearchNextQueue) == 0 && (pos >= len(haystack) || lastMatchPos != -1) { + break + } + if pos >= len(haystack) { + break + } + + st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + } + + if lastMatchPos == -1 { + return nil + } + return p.buildCapturesFromSlots(bestSlots, startPos, lastMatchPos) +} + +// buildCapturesFromSlots converts flat slot data to MatchWithCaptures result. +func (p *PikeVM) buildCapturesFromSlots(slots []int, matchStart, matchEnd int) *MatchWithCaptures { + numGroups := p.nfa.CaptureCount() + captures := make([][]int, numGroups) + captures[0] = []int{matchStart, matchEnd} + + if slots != nil { + for i := 1; i < numGroups && i*2+1 < len(slots); i++ { + start := slots[i*2] + end := slots[i*2+1] + if start >= 0 && end >= 0 { + captures[i] = []int{start, end} + } + } + } + + return &MatchWithCaptures{ + Start: matchStart, + End: matchEnd, + Captures: captures, + } } From d667d34c0b2db64662da670e55657c59048be41e Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:08:39 +0300 Subject: [PATCH 5/7] wip: stack-based epsilon closure with RestoreCapture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Converted addSearchThread and addSearchThreadToNext from recursive to stack-based with captureFrame (Explore + RestoreCapture frames). Mirrors Rust pikevm.rs FollowEpsilon::RestoreCapture pattern. Still failing: greedy loop captures (a+)(b+) — per-state SlotTable overwrites group start on each loop iteration (State visited again in next generation). Per-thread COW preserves all variants. Root issue: per-state storage loses capture history across byte transitions in greedy loops. 
Need either per-thread indexing or generation-aware slot preservation. Status: 2 NFA unit tests fail, all meta tests pass --- nfa/pikevm.go | 330 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 216 insertions(+), 114 deletions(-) diff --git a/nfa/pikevm.go b/nfa/pikevm.go index a3d790e..8ad032f 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -57,6 +57,17 @@ type searchThread struct { startPos int // Position where this thread's match attempt started } +// captureFrame is a stack frame for capture-aware epsilon closure. +// Two kinds: Explore (process a state) and RestoreCapture (undo a capture write). +// Mirrors Rust's FollowEpsilon enum (pikevm.rs:1611). +type captureFrame struct { + state StateID // state to explore (InvalidState = restore frame) + startPos int // thread start position + // For restore frames: + slot int // slot index to restore + value int // old value to restore +} + // PikeVM implements the Pike VM algorithm for NFA execution. // It simulates the NFA by maintaining a set of active states and // exploring all possible paths through the automaton. @@ -113,6 +124,15 @@ type PikeVMState struct { // Reference: rust-regex/regex-automata/src/nfa/thompson/pikevm.rs:2044-2160 SlotTable *SlotTable + // captureStack is used for stack-based epsilon closure with capture save/restore. + // Mirrors Rust's FollowEpsilon::RestoreCapture pattern (pikevm.rs:1611-1637). + captureStack []captureFrame + + // currSlots is the working capture buffer during epsilon closure. + // Modified in-place, saved/restored via captureStack for StateCapture. + // Copied to SlotTable row when terminal state reached. + currSlots []int + // Longest enables leftmost-longest (POSIX) matching semantics. // By default (false), uses leftmost-first (Perl) semantics where // the first alternative wins. When true, the longest match wins. 
@@ -273,6 +293,12 @@ func (p *PikeVM) initState(state *PikeVMState) { // Each capture group has 2 slots (start and end position) slotsPerState := p.nfa.CaptureCount() * 2 state.SlotTable = NewSlotTable(p.nfa.States(), slotsPerState) + + // Capture-aware epsilon closure stack and working buffer + state.captureStack = make([]captureFrame, 0, capacity) + if slotsPerState > 0 { + state.currSlots = make([]int, slotsPerState) + } } // SetSkipAhead sets the prefilter for skip-ahead optimization. @@ -1812,72 +1838,115 @@ func (p *PikeVM) searchWithSlotTableAnchored(haystack []byte, startPos int) (int // addSearchThread adds a lightweight thread to the current queue, following epsilon transitions. // Captures are stored in SlotTable, not in the thread. func (p *PikeVM) addSearchThread(t searchThread, haystack []byte, pos int) { - // Check if already visited this state - if !p.internalState.Visited.Insert(uint32(t.state)) { - return - } + st := &p.internalState + activeSlots := st.SlotTable.ActiveSlots() - state := p.nfa.State(t.state) - if state == nil { - return - } + // Stack-based epsilon closure with capture save/restore (Rust approach). + // Uses currSlots as working buffer, modified in-place during closure. + // When terminal state reached: copy currSlots to SlotTable row. + // When StateCapture encountered: save old value, set new, push RestoreCapture. 
+ // Reference: rust-regex pikevm.rs:1611-1749 (FollowEpsilon::RestoreCapture) - switch state.Kind() { - case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: - p.internalState.SearchQueue = append(p.internalState.SearchQueue, t) + st.captureStack = st.captureStack[:0] + st.captureStack = append(st.captureStack, captureFrame{ + state: t.state, startPos: t.startPos, + }) - case StateEpsilon: - next := state.Epsilon() - if next != InvalidState { - p.addSearchThread(searchThread{state: next, startPos: t.startPos}, haystack, pos) - } - - case StateSplit: - left, right := state.Split() + for len(st.captureStack) > 0 { + n := len(st.captureStack) + frame := st.captureStack[n-1] + st.captureStack = st.captureStack[:n-1] - if left != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(left, t.state) + // RestoreCapture frame: undo a capture slot modification + if frame.state == InvalidState { + if activeSlots > 2 && frame.slot < len(st.currSlots) { + st.currSlots[frame.slot] = frame.value } - p.addSearchThread(searchThread{state: left, startPos: t.startPos}, haystack, pos) + continue } - if right != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(right, t.state) - } - p.addSearchThread(searchThread{state: right, startPos: t.startPos}, haystack, pos) + + sid := frame.state + if !st.Visited.Insert(uint32(sid)) { + continue } - case StateCapture: - groupIndex, isStart, next := state.Capture() - if next != InvalidState { - // For Find mode (activeSlots=2), group 0 is tracked via thread.startPos/pos - if p.internalState.SlotTable.ActiveSlots() > 2 { - slotIndex := int(groupIndex) * 2 - if !isStart { - slotIndex++ + state := p.nfa.State(sid) + if state == nil { + continue + } + + switch state.Kind() { + case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: + // Terminal state: copy currSlots to this state's SlotTable row + 
if activeSlots > 2 { + stateSlots := st.SlotTable.ForState(sid) + if stateSlots != nil { + copy(stateSlots, st.currSlots) } - if p.internalState.SlotTable.ActiveSlots() > slotIndex { - // Copy parent slots to next state first - p.internalState.SlotTable.CopySlots(next, t.state) - // Then update the capture slot - p.internalState.SlotTable.SetSlot(next, slotIndex, pos) + } + st.SearchQueue = append(st.SearchQueue, searchThread{ + state: sid, startPos: frame.startPos, + }) + + case StateEpsilon: + next := state.Epsilon() + if next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) + } + + case StateSplit: + left, right := state.Split() + // Push right first (processed last = DFS left-first ordering) + if right != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: right, startPos: frame.startPos, + }) + } + if left != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: left, startPos: frame.startPos, + }) + } + + case StateCapture: + groupIndex, isStart, next := state.Capture() + if next != InvalidState { + if activeSlots > 2 { + slotIndex := int(groupIndex) * 2 + if !isStart { + slotIndex++ + } + if slotIndex < len(st.currSlots) { + // Save old value for restore + oldValue := st.currSlots[slotIndex] + // Push RestoreCapture BEFORE explore (will execute after) + st.captureStack = append(st.captureStack, captureFrame{ + state: InvalidState, // marker: restore frame + slot: slotIndex, + value: oldValue, + }) + // Set new capture value + st.currSlots[slotIndex] = pos + } } + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThread(searchThread{state: next, startPos: t.startPos}, haystack, pos) - } - case StateLook: - look, next := state.Look() - if checkLookAssertion(look, haystack, pos) && next != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - 
p.internalState.SlotTable.CopySlots(next, t.state) + case StateLook: + look, next := state.Look() + if checkLookAssertion(look, haystack, pos) && next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThread(searchThread{state: next, startPos: t.startPos}, haystack, pos) - } - case StateFail: - // Dead state + case StateFail: + // Dead state + } } } @@ -1936,76 +2005,115 @@ func (p *PikeVM) stepSearchThread(t searchThread, b byte, haystack []byte, nextP // addSearchThreadToNext adds a lightweight thread to the next queue. // srcState is the state we came from (for slot copying). +// Uses stack-based epsilon closure with capture save/restore. func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystack []byte, pos int) { - if !p.internalState.Visited.Insert(uint32(t.state)) { - return - } + st := &p.internalState + activeSlots := st.SlotTable.ActiveSlots() - state := p.nfa.State(t.state) - if state == nil { - return + // Load currSlots from source state's SlotTable row + if activeSlots > 2 { + srcSlots := st.SlotTable.ForState(srcState) + if srcSlots != nil && len(st.currSlots) > 0 { + copy(st.currSlots, srcSlots) + } } - // Copy slots from source to new state (only for Captures mode) - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(t.state, srcState) - } + // Stack-based epsilon closure (same as addSearchThread but targets NextQueue) + st.captureStack = st.captureStack[:0] + st.captureStack = append(st.captureStack, captureFrame{ + state: t.state, startPos: t.startPos, + }) - switch state.Kind() { - case StateEpsilon: - next := state.Epsilon() - if next != InvalidState { - p.addSearchThreadToNext(searchThread{state: next, startPos: t.startPos}, t.state, haystack, pos) - } - return + for len(st.captureStack) > 0 { + n := len(st.captureStack) + frame := st.captureStack[n-1] + st.captureStack = st.captureStack[:n-1] - case 
StateSplit: - left, right := state.Split() - - if left != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(left, t.state) + if frame.state == InvalidState { + if activeSlots > 2 && frame.slot < len(st.currSlots) { + st.currSlots[frame.slot] = frame.value } - p.addSearchThreadToNext(searchThread{state: left, startPos: t.startPos}, left, haystack, pos) + continue } - if right != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(right, t.state) - } - p.addSearchThreadToNext(searchThread{state: right, startPos: t.startPos}, right, haystack, pos) + + sid := frame.state + if !st.Visited.Insert(uint32(sid)) { + continue } - return - case StateCapture: - groupIndex, isStart, next := state.Capture() - if next != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - slotIndex := int(groupIndex) * 2 - if !isStart { - slotIndex++ + state := p.nfa.State(sid) + if state == nil { + continue + } + + switch state.Kind() { + case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: + if activeSlots > 2 { + stateSlots := st.SlotTable.ForState(sid) + if stateSlots != nil { + copy(stateSlots, st.currSlots) } - if p.internalState.SlotTable.ActiveSlots() > slotIndex { - p.internalState.SlotTable.CopySlots(next, t.state) - p.internalState.SlotTable.SetSlot(next, slotIndex, pos) + } + // Add to NEXT queue (not current) + st.SearchNextQueue = append(st.SearchNextQueue, searchThread{ + state: sid, startPos: frame.startPos, + }) + + case StateEpsilon: + next := state.Epsilon() + if next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) + } + + case StateSplit: + left, right := state.Split() + if right != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: right, startPos: frame.startPos, + }) + } + if left != InvalidState { + st.captureStack = 
append(st.captureStack, captureFrame{ + state: left, startPos: frame.startPos, + }) + } + + case StateCapture: + groupIndex, isStart, next := state.Capture() + if next != InvalidState { + if activeSlots > 2 { + slotIndex := int(groupIndex) * 2 + if !isStart { + slotIndex++ + } + if slotIndex < len(st.currSlots) { + oldValue := st.currSlots[slotIndex] + st.captureStack = append(st.captureStack, captureFrame{ + state: InvalidState, + slot: slotIndex, + value: oldValue, + }) + st.currSlots[slotIndex] = pos + } } + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThreadToNext(searchThread{state: next, startPos: t.startPos}, next, haystack, pos) - } - return - case StateLook: - look, next := state.Look() - if checkLookAssertion(look, haystack, pos) && next != InvalidState { - if p.internalState.SlotTable.ActiveSlots() > 2 { - p.internalState.SlotTable.CopySlots(next, t.state) + case StateLook: + look, next := state.Look() + if checkLookAssertion(look, haystack, pos) && next != InvalidState { + st.captureStack = append(st.captureStack, captureFrame{ + state: next, startPos: frame.startPos, + }) } - p.addSearchThreadToNext(searchThread{state: next, startPos: t.startPos}, next, haystack, pos) + + case StateFail: } - return } - - // Add to next queue - p.internalState.SearchNextQueue = append(p.internalState.SearchNextQueue, t) } // SearchWithSlotTableCaptures finds the first match and returns captures. 
@@ -2085,16 +2193,10 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt pos = candidate } st.Visited.Clear() - // Initialize slots for start state to all -1 (AllAbsent scratch) - absentSlots := st.SlotTable.AllAbsent() - for i := range absentSlots { - absentSlots[i] = -1 - } startSid := p.nfa.StartAnchored() - // Copy absent slots to start state - startSlots := st.SlotTable.ForState(startSid) - if startSlots != nil { - copy(startSlots, absentSlots) + // Initialize currSlots to -1 (unset) before epsilon closure + for i := range st.currSlots { + st.currSlots[i] = -1 } p.addSearchThread(searchThread{state: startSid, startPos: pos}, haystack, pos) } From f4c3fd0d3a9bde2eff10b77fb1ad326b98281484 Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:17:57 +0300 Subject: [PATCH 6/7] =?UTF-8?q?feat:=20dual=20SlotTable=20capture=20tracki?= =?UTF-8?q?ng=20=E2=80=94=20zero-alloc=20FindSubmatch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement Rust-style dual SlotTable (curr/next) for capture propagation across byte transitions. Stack-based epsilon closure with RestoreCapture frames preserves capture context through greedy loops. 
Key changes: - Add NextSlotTable + captureStack + currSlots to PikeVMState - addSearchThread: stack-based with captureFrame (Explore + RestoreCapture) - addSearchThreadToNext: loads from curr SlotTable, writes to next - Swap SlotTable/NextSlotTable after each byte (Rust mem::swap pattern) - Don't clear Visited before seed — prevents SlotTable row overwrite - Wire meta FindSubmatch to use SlotTable path - Fix empty match capture groups (buildCapturesFromSlots) FindAllSubmatch (5 patterns, 50K matches, 800KB input): - Alloc: 554MB -> 26MB (-95%) - Mallocs: 12.5M -> 440K (-96%) - Time: 1.48s -> 0.45s (3.3x faster) --- meta/findall.go | 4 +-- nfa/pikevm.go | 81 +++++++++++++++++++++++++------------------------ 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/meta/findall.go b/meta/findall.go index ba80453..f4fcb4e 100644 --- a/meta/findall.go +++ b/meta/findall.go @@ -90,7 +90,7 @@ func (e *Engine) findSubmatchAtWithState(haystack []byte, at int, state *SearchS case UseBoundedBacktracker, UseNFA, UseDFA, UseBoth, UseDigitPrefilter: atomic.AddUint64(&e.stats.NFASearches, 1) - nfaMatch := state.pikevm.SearchWithCapturesAt(haystack, at) + nfaMatch := state.pikevm.SearchWithSlotTableCapturesAt(haystack, at) if nfaMatch == nil { return nil } @@ -118,7 +118,7 @@ func (e *Engine) findSubmatchAtWithState(haystack []byte, at int, state *SearchS nfaMatch := state.pikevm.SearchWithCapturesInSpan(haystack, start, end) if nfaMatch == nil { // Defensive fallback: DFA found a match but PikeVM disagrees. 
- nfaMatch = state.pikevm.SearchWithCapturesAt(haystack, at) + nfaMatch = state.pikevm.SearchWithSlotTableCapturesAt(haystack, at) if nfaMatch == nil { return nil } diff --git a/nfa/pikevm.go b/nfa/pikevm.go index 8ad032f..34446d8 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -118,11 +118,12 @@ type PikeVMState struct { // Reference: rust-regex/regex-automata/src/nfa/thompson/pikevm.rs:2198 epsilonStack []StateID - // SlotTable stores capture slot values per NFA state. - // This is a 2D table (flattened to 1D) following the Rust regex architecture. - // Enables O(1) access to capture positions for any state. - // Reference: rust-regex/regex-automata/src/nfa/thompson/pikevm.rs:2044-2160 - SlotTable *SlotTable + // SlotTable / NextSlotTable: two slot tables swapped between generations. + // Mirrors Rust's curr/next ActiveStates pattern (pikevm.rs:1878). + // SlotTable = current generation, NextSlotTable = next generation. + // After each byte transition: swap(SlotTable, NextSlotTable). + SlotTable *SlotTable + NextSlotTable *SlotTable // captureStack is used for stack-based epsilon closure with capture save/restore. // Mirrors Rust's FollowEpsilon::RestoreCapture pattern (pikevm.rs:1611-1637). 
@@ -289,10 +290,11 @@ func (p *PikeVM) initState(state *PikeVMState) { // Pre-allocate epsilon stack for loop-based closure in IsMatch (Rust pattern) state.epsilonStack = make([]StateID, 0, capacity) - // Initialize SlotTable for capture tracking + // Initialize SlotTables for capture tracking (curr/next, swapped per byte) // Each capture group has 2 slots (start and end position) slotsPerState := p.nfa.CaptureCount() * 2 state.SlotTable = NewSlotTable(p.nfa.States(), slotsPerState) + state.NextSlotTable = NewSlotTable(p.nfa.States(), slotsPerState) // Capture-aware epsilon closure stack and working buffer state.captureStack = make([]captureFrame, 0, capacity) @@ -1837,6 +1839,8 @@ func (p *PikeVM) searchWithSlotTableAnchored(haystack []byte, startPos int) (int // addSearchThread adds a lightweight thread to the current queue, following epsilon transitions. // Captures are stored in SlotTable, not in the thread. +// +//nolint:gocognit // Stack-based epsilon closure with 7 state types is inherently complex func (p *PikeVM) addSearchThread(t searchThread, haystack []byte, pos int) { st := &p.internalState activeSlots := st.SlotTable.ActiveSlots() @@ -2006,11 +2010,13 @@ func (p *PikeVM) stepSearchThread(t searchThread, b byte, haystack []byte, nextP // addSearchThreadToNext adds a lightweight thread to the next queue. // srcState is the state we came from (for slot copying). // Uses stack-based epsilon closure with capture save/restore. 
+// +//nolint:gocognit,gocyclo,cyclop // Stack-based epsilon closure with capture save/restore func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystack []byte, pos int) { st := &p.internalState activeSlots := st.SlotTable.ActiveSlots() - // Load currSlots from source state's SlotTable row + // Load currSlots from CURRENT SlotTable (source state's row) if activeSlots > 2 { srcSlots := st.SlotTable.ForState(srcState) if srcSlots != nil && len(st.currSlots) > 0 { @@ -2018,7 +2024,7 @@ func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystac } } - // Stack-based epsilon closure (same as addSearchThread but targets NextQueue) + // Stack-based epsilon closure writing to NEXT SlotTable st.captureStack = st.captureStack[:0] st.captureStack = append(st.captureStack, captureFrame{ state: t.state, startPos: t.startPos, @@ -2048,13 +2054,13 @@ func (p *PikeVM) addSearchThreadToNext(t searchThread, srcState StateID, haystac switch state.Kind() { case StateMatch, StateByteRange, StateSparse, StateRuneAny, StateRuneAnyNotNL: + // Write to NEXT SlotTable (not current!) if activeSlots > 2 { - stateSlots := st.SlotTable.ForState(sid) + stateSlots := st.NextSlotTable.ForState(sid) if stateSlots != nil { copy(stateSlots, st.currSlots) } } - // Add to NEXT queue (not current) st.SearchNextQueue = append(st.SearchNextQueue, searchThread{ state: sid, startPos: frame.startPos, }) @@ -2123,41 +2129,32 @@ func (p *PikeVM) SearchWithSlotTableCaptures(haystack []byte) *MatchWithCaptures } // SearchWithSlotTableCapturesAt finds the first match with captures starting from 'at'. -// Uses SlotTable for zero-allocation capture tracking (Rust approach). -// -// SlotTable per-state storage works correctly because the Visited sparse set -// guarantees each NFA state is visited at most once per generation — the same -// invariant that makes Rust's SlotTable correct. +// Uses dual SlotTable (curr/next) for zero-allocation capture tracking. 
+// Matches Rust's PikeVM Cache with curr/next ActiveStates (pikevm.rs:1878). func (p *PikeVM) SearchWithSlotTableCapturesAt(haystack []byte, at int) *MatchWithCaptures { if at > len(haystack) { return nil } - // Configure SlotTable for full capture mode totalSlots := p.nfa.CaptureCount() * 2 p.internalState.SlotTable.SetActiveSlots(totalSlots) + p.internalState.NextSlotTable.SetActiveSlots(totalSlots) + + numGroups := p.nfa.CaptureCount() - // Handle edge cases if at == len(haystack) { if p.matchesEmptyAt(haystack, at) { - return &MatchWithCaptures{ - Start: at, - End: at, - Captures: [][]int{{at, at}}, - } + return p.buildCapturesFromSlots(nil, at, at) } return nil } if len(haystack) == 0 { if p.matchesEmpty() { - return &MatchWithCaptures{ - Start: 0, - End: 0, - Captures: [][]int{{0, 0}}, - } + return p.buildCapturesFromSlots(nil, 0, 0) } return nil } + _ = numGroups if p.nfa.IsAnchored() { return p.searchWithSlotTableCapturesAnchored(haystack, at) @@ -2168,23 +2165,23 @@ func (p *PikeVM) SearchWithSlotTableCapturesAt(haystack []byte, at int) *MatchWi // searchWithSlotTableCapturesUnanchored implements unanchored search with captures. // Captures stored in SlotTable per-state, saved to bestSlots on match. 
// -//nolint:gocognit +//nolint:gocognit,gocyclo,cyclop // Merged match-check + step + seed loop func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt int) *MatchWithCaptures { st := &p.internalState st.SearchQueue = st.SearchQueue[:0] st.SearchNextQueue = st.SearchNextQueue[:0] st.Visited.Clear() st.SlotTable.Reset() + st.NextSlotTable.Reset() totalSlots := st.SlotTable.ActiveSlots() + st.NextSlotTable.SetActiveSlots(totalSlots) bestStart := -1 bestEnd := -1 - // bestSlots stores capture slots for the best match found so far var bestSlots []int for pos := startAt; pos <= len(haystack); pos++ { if bestStart == -1 { - // Skip-ahead if len(st.SearchQueue) == 0 && p.skipAhead != nil && pos > startAt { candidate := p.skipAhead.Find(haystack, pos) if candidate == -1 { @@ -2192,9 +2189,10 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } pos = candidate } - st.Visited.Clear() + // DON'T clear Visited here — states already in SearchQueue must + // not be overwritten by new seed. Visited prevents re-entry. + // Visited is cleared before step loop (below), not before seed. 
startSid := p.nfa.StartAnchored() - // Initialize currSlots to -1 (unset) before epsilon closure for i := range st.currSlots { st.currSlots[i] = -1 } @@ -2209,7 +2207,7 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt if p.isBetterMatch(bestStart, bestEnd, t.startPos, pos) { bestStart = t.startPos bestEnd = pos - // Save capture slots for this match + // Read from CURRENT SlotTable matchSlots := st.SlotTable.ForState(t.state) if matchSlots != nil && totalSlots > 0 { if bestSlots == nil { @@ -2223,6 +2221,7 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } continue } + // stepSearchThread → addSearchThreadToNext reads curr, writes next p.stepSearchThread(t, b, haystack, pos+1) } } else { @@ -2261,7 +2260,9 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } } + // Swap queues AND SlotTables (Rust: core::mem::swap(curr, next)) st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + st.SlotTable, st.NextSlotTable = st.NextSlotTable, st.SlotTable } if bestStart == -1 { @@ -2271,24 +2272,23 @@ func (p *PikeVM) searchWithSlotTableCapturesUnanchored(haystack []byte, startAt } // searchWithSlotTableCapturesAnchored implements anchored search with captures. 
+// +//nolint:gocognit // Merged match-check + step loop (Rust's nexts pattern) func (p *PikeVM) searchWithSlotTableCapturesAnchored(haystack []byte, startPos int) *MatchWithCaptures { st := &p.internalState st.SearchQueue = st.SearchQueue[:0] st.SearchNextQueue = st.SearchNextQueue[:0] st.Visited.Clear() st.SlotTable.Reset() + st.NextSlotTable.Reset() totalSlots := st.SlotTable.ActiveSlots() + st.NextSlotTable.SetActiveSlots(totalSlots) - // Initialize start state slots startSid := p.nfa.StartAnchored() - startSlots := st.SlotTable.ForState(startSid) - if startSlots != nil { - for i := range startSlots { - startSlots[i] = -1 - } + for i := range st.currSlots { + st.currSlots[i] = -1 } - p.addSearchThread(searchThread{state: startSid, startPos: startPos}, haystack, startPos) lastMatchPos := -1 @@ -2343,6 +2343,7 @@ func (p *PikeVM) searchWithSlotTableCapturesAnchored(haystack []byte, startPos i } st.SearchQueue, st.SearchNextQueue = st.SearchNextQueue, st.SearchQueue[:0] + st.SlotTable, st.NextSlotTable = st.NextSlotTable, st.SlotTable } if lastMatchPos == -1 { From e71029c2d43dfc31e415fee4ddb7aa5a94e583fe Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 24 Mar 2026 19:36:20 +0300 Subject: [PATCH 7/7] docs: update CHANGELOG, OPTIMIZATIONS, add ARCHITECTURE.md for v0.12.19 - CHANGELOG: add SlotTable capture tracking entry - OPTIMIZATIONS: add #10 Dual SlotTable (95% less memory), update version - ARCHITECTURE.md: new file documenting engine architecture, memory model, thread safety, and Rust alignment --- CHANGELOG.md | 8 +++ README.md | 25 +++++----- docs/ARCHITECTURE.md | 110 ++++++++++++++++++++++++++++++++++++++++++ docs/OPTIMIZATIONS.md | 61 +++++++++++++++++++++-- 4 files changed, 189 insertions(+), 15 deletions(-) create mode 100644 docs/ARCHITECTURE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 9866ec2..433028f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[0.12.19] - 2026-03-24 ### Performance +- **Zero-alloc FindSubmatch via dual SlotTable** (Rust approach) — replaced per-thread + COW capture allocation with Rust-style flat SlotTable. Two SlotTables (curr/next) + swap between byte generations. Stack-based epsilon closure with RestoreCapture + frames preserves capture context through greedy loops. FindAllSubmatch (5 patterns, + 50K matches, 800KB input): alloc **554MB → 26MB** (-95%), mallocs **12.5M → 440K** + (-96%), time **1.48s → 0.45s** (3.3x faster). Reference: Rust `pikevm.rs` + `ActiveStates` + `SlotTable` + `FollowEpsilon::RestoreCapture`. + - **Rust-aligned BoundedBacktracker visited limit for UseNFA** — reduced visited table capacity from 32M entries (64MB) to 128K entries (256KB) for UseNFA paths, matching Rust regex's `visited_capacity` default. On Kostya's LangArena LogParser diff --git a/README.md b/README.md index 5d3fd70..3c4201c 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ High-performance regex engine for Go. Drop-in replacement for `regexp` with **3- Go's stdlib `regexp` is intentionally simple — single NFA engine, no optimizations. This guarantees O(n) time but leaves performance on the table. 
coregex brings Rust regex-crate architecture to Go: -- **Multi-engine**: Lazy DFA, PikeVM, OnePass, BoundedBacktracker +- **Multi-engine**: 17 strategies — Lazy DFA, PikeVM, OnePass, BoundedBacktracker, and more - **SIMD prefilters**: AVX2/SSSE3 for fast candidate rejection - **Reverse search**: Suffix/inner literal patterns run 1000x+ faster - **O(n) guarantee**: No backtracking, no ReDoS vulnerabilities @@ -187,20 +187,23 @@ Uses Go's `regexp/syntax` parser: ``` Pattern → Parse → NFA → Literal Extract → Strategy Select ↓ - ┌─────────────────────────────────┐ - │ Engines (17 strategies): │ - │ LazyDFA, PikeVM, OnePass, │ - │ BoundedBacktracker, │ - │ ReverseInner, ReverseSuffix, │ - │ ReverseSuffixSet, AnchoredLiteral, │ - │ CharClassSearcher, Teddy, │ - │ DigitPrefilter, AhoCorasick, │ - │ CompositeSearcher, BranchDispatch │ - └─────────────────────────────────┘ + ┌────────────────────────────────────────────┐ + │ Engines (17 strategies): │ + │ LazyDFA, PikeVM, OnePass, │ + │ BoundedBacktracker, ReverseAnchored, │ + │ ReverseInner, ReverseSuffix, │ + │ ReverseSuffixSet, MultilineReverseSuffix, │ + │ AnchoredLiteral, CharClassSearcher, │ + │ Teddy, DigitPrefilter, AhoCorasick, │ + │ CompositeSearcher, BranchDispatch, Both │ + └────────────────────────────────────────────┘ ↓ Input → Prefilter (SIMD) → Engine → Match Result ``` +> For detailed architecture documentation, see [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md). +> For optimization details, see [docs/OPTIMIZATIONS.md](docs/OPTIMIZATIONS.md). + **SIMD Primitives** (AMD64): - `memchr` — single byte search (AVX2) - `memmem` — substring search (SSSE3) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..4d5b05d --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,110 @@ +# coregex Architecture + +Production-grade regex engine for Go achieving 3-3000x speedup over stdlib +through multi-engine architecture and SIMD optimizations. 
+
+## Execution Pipeline
+
+```
+Pattern → Parse → NFA Compile → Literal Extract → Strategy Select
+                                   ↓
+        ┌────────────────────────────────────────────────────────┐
+        │ Strategy (one of 17):                                  │
+        │  UseNFA, UseDFA, UseBoth, UseOnePass,                  │
+        │  UseReverseAnchored, UseReverseInner,                  │
+        │  UseReverseSuffix, UseReverseSuffixSet,                │
+        │  UseMultilineReverseSuffix, UseTeddy,                  │
+        │  UseBoundedBacktracker, UseCharClass,                  │
+        │  UseDigitPrefilter, UseAhoCorasick,                    │
+        │  UseComposite, UseAnchoredLiteral, UseBranchDispatch   │
+        └────────────────────────────────────────────────────────┘
+                                   ↓
+Input → Prefilter (memchr/memmem/teddy) → Engine Search → Match Result
+```
+
+## Engine Architecture (Rust-aligned)
+
+### DFA Layer (`dfa/lazy/`)
+
+- **Lazy DFA**: On-demand state construction with byte class compression
+- **Flat transition table**: `flatTrans[sid*stride+class]` — single array lookup, no pointer chase
+- **Byte-based cache limit**: 2MB default (matches Rust `hybrid_cache_capacity`)
+- **Cache clearing**: Up to 5 clears before NFA fallback (Rust approach)
+- **Acceleration**: Detects self-loop states, uses SIMD memchr for skip-ahead
+- **Integrated prefilter**: Skip-ahead at start state in DFA loop (Rust `hybrid/search.rs:232`)
+- **Per-goroutine cache**: Immutable DFA + mutable DFACache (thread-safe)
+
+### PikeVM Layer (`nfa/pikevm.go`)
+
+- **Dual SlotTable**: Flat per-state capture storage (curr/next, swapped per byte)
+  - Zero-allocation capture tracking (Rust `ActiveStates` pattern)
+  - Stack-based epsilon closure with `RestoreCapture` frames
+  - `searchThread` (12 bytes) vs legacy `thread` (40+ bytes with COW)
+- **Integrated prefilter**: Skip-ahead when no active threads (Rust `pikevm.rs:1293`)
+- **SearchMode**: Dynamic slot sizing (0=IsMatch, 2=Find, full=Captures)
+
+### BoundedBacktracker (`nfa/backtrack.go`)
+
+- Generation-based visited table (O(1) reset, uint16)
+- Visited limit: 256KB for UseNFA (Rust default), 64MB for UseBoundedBacktracker (POSIX)
+- Fallback to PikeVM when input exceeds capacity
+
+### Prefilter Layer (`prefilter/`)
+
+- **AVX2 
memchr**: SIMD byte search (12x faster than `bytes.IndexByte`) +- **Memmem**: SIMD substring search with Rabin-Karp fingerprinting +- **Teddy**: SIMD multi-pattern matching (1-8 patterns, AVX2/SSSE3) +- **Aho-Corasick**: DFA-based multi-pattern for >8 patterns +- **DigitPrefilter**: SIMD digit detection for `\d+` patterns + +## Memory Architecture + +### Per-Pattern (compile-time, shared immutable) +- NFA graph (states, transitions) +- DFA configuration (byte classes, start map) +- Prefilter (literal tables, SIMD masks) +- Strategy-specific searchers (reverse DFA, composite, etc.) + +### Per-Goroutine (search-time, pooled via sync.Pool) +- `SearchState` holds all mutable search state +- `DFACache`: flat transition table + state map (2MB default) +- `PikeVMState`: dual SlotTable + thread queues + visited set +- `BacktrackerState`: visited array + generation counter + +### Memory Budget (Kostya LangArena, 13 patterns, 7MB log) + +| Component | v0.12.18 | v0.12.19 | +|-----------|---------|---------| +| Total alloc (FindAll) | 89 MB | **25 MB** | +| RSS | 353 MB | **41 MB** | +| FindAllSubmatch (5 pat, 50K matches) | 554 MB | **26 MB** | + +## Thread Safety + +``` + ┌──────────────┐ + │ Engine │ ← Immutable after compile + │ (shared) │ + └──────┬───────┘ + │ + ┌────────────┼────────────┐ + ↓ ↓ ↓ + ┌────────────┐ ┌────────────┐ ┌────────────┐ + │ SearchState│ │ SearchState│ │ SearchState│ ← Per-goroutine + │(goroutine1)│ │(goroutine2)│ │(goroutine3)│ (sync.Pool) + └────────────┘ └────────────┘ └────────────┘ +``` + +## Key Design Decisions + +1. **Multi-engine**: Strategy selection at compile time, not runtime +2. **Rust reference**: Architecture mirrors Rust regex crate (lazy DFA, PikeVM, prefilters) +3. **Go stdlib compat**: POSIX leftmost-longest semantics (differs from Rust leftmost-first) +4. **Zero-alloc hot paths**: `IsMatch()`, `FindIndices()`, `Count()` — no heap allocation +5. 
**SIMD first**: AVX2/SSSE3 prefilters for x86_64, pure Go fallback for other archs + +## References + +- [Rust regex crate](https://github.com/rust-lang/regex) — primary architecture reference +- [RE2](https://github.com/google/re2) — O(n) performance guarantees +- [Hyperscan](https://github.com/intel/hyperscan) — SIMD multi-pattern (Teddy algorithm) diff --git a/docs/OPTIMIZATIONS.md b/docs/OPTIMIZATIONS.md index c0e9f17..dccef6c 100644 --- a/docs/OPTIMIZATIONS.md +++ b/docs/OPTIMIZATIONS.md @@ -1,6 +1,6 @@ # coregex Optimizations that Beat Rust regex -This document describes the 9 key optimizations in coregex that outperform the Rust regex crate. +This document describes the 10 key optimizations in coregex that outperform the Rust regex crate. These algorithms are critical to coregex's competitive advantage and **MUST NOT REGRESS**. ## Summary @@ -8,6 +8,7 @@ These algorithms are critical to coregex's competitive advantage and **MUST NOT | Optimization | File | Pattern Type | vs stdlib | Benchmark | |--------------|------|--------------|-----------|-----------| | **AnchoredLiteral** | `meta/anchored_literal.go` | `^prefix.*suffix$` | **32-133x faster** | anchored_literal | +| **Flat SlotTable** | `nfa/pikevm.go`, `nfa/slot_table.go` | FindSubmatch | **-95% memory** | submatch | | CharClassSearcher | `nfa/charclass_searcher.go` | `[\w]+`, `[a-z]+` | **23x faster** | char_class | | CompositeSearcher | `nfa/composite.go` | `[a-zA-Z]+[0-9]+` | **5x faster** | composite | | BranchDispatch | `nfa/branch_dispatch.go` | `^(\d+\|UUID\|hex32)` | **5-20x faster** | anchored_alt | @@ -593,6 +594,58 @@ bash scripts/bench.sh --compare baseline current --- +## 10. 
Dual SlotTable Capture Tracking (95% less memory) - NEW in v0.12.19 + +**Files**: `nfa/pikevm.go`, `nfa/slot_table.go` + +**Pattern types**: All FindSubmatch/FindAllSubmatch patterns with capture groups + +### Architecture + +Replaces per-thread COW (copy-on-write) capture allocation with Rust-style flat +SlotTable indexed by NFA state ID. Two SlotTables (curr/next) swap between byte +generations — matching Rust's `ActiveStates` pattern. + +``` +SlotTable layout: table[stateID * slotsPerState + slotIndex] +Each NFA state owns a row of slots: [g0_start, g0_end, g1_start, g1_end, ...] + +Epsilon closure: stack-based with RestoreCapture frames + Explore(sid) → process state, push children + RestoreCapture(slot) → undo capture write after subtree processed +``` + +Key invariant: Visited sparse set guarantees each NFA state is visited at most +once per generation → one thread per state → per-state storage is correct. + +### Why faster than COW approach + +| | COW (old) | SlotTable (new) | +|---|---|---| +| Thread fork | `make([]int, numSlots)` — heap alloc | `copy(row, currSlots)` — no alloc | +| Capture update | COW copy if shared — heap alloc | `currSlots[i] = pos` — in-place | +| Match save | `copyData()` — heap alloc | `copy(bestSlots, row)` — one copy | +| Memory per search | O(threads × slots) | O(states × slots) — fixed | + +### Benchmark data + +``` +FindAllSubmatch: 5 patterns, 50K matches, 800KB input + +Metric COW (old) SlotTable (new) Improvement +Alloc 554 MB 26 MB -95% +Mallocs 12,500,000 440,000 -96% +Time 1.48s 0.45s 3.3x faster +``` + +### Reference + +- Rust: `regex-automata/src/nfa/thompson/pikevm.rs:2065` (SlotTable struct) +- Rust: `regex-automata/src/nfa/thompson/pikevm.rs:1611` (FollowEpsilon::RestoreCapture) +- Rust: `regex-automata/src/nfa/thompson/pikevm.rs:1878` (Cache with curr/next ActiveStates) + +--- + ## References - **Rust regex crate**: Architecture inspiration for multi-engine design @@ -603,6 +656,6 @@ bash scripts/bench.sh --compare 
baseline current --- -*Document version: 1.2.0* -*Last updated: 2026-01-15* -*Benchmark data: regex-bench v0.11.0* +*Document version: 1.3.0* +*Last updated: 2026-03-24* +*Benchmark data: regex-bench v0.12.19*