diff --git a/CHANGELOG.md b/CHANGELOG.md index b7f0059..5e9f103 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,62 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.21] - 2026-03-27 + +### Performance +- **Tagged start states** (Rust `LazyStateID` approach) — start states get tag bit, + always route to slow path. Enables prefilter skip-ahead only at start state, + eliminating O(n²) from start state self-loop. Unlocks UseDFA for tiny NFA patterns. + +- **DFA multiline $ fix** — EndLine look-ahead re-computation in determinize + (Rust mod.rs:131-212). `(?m)hello$` now works correctly in DFA. + +- **Dead-state prefilter restart** in searchEarliestMatch — IsMatch path uses + prefilter to skip past dead states, matching Rust find_fwd_imp approach. + +- **1100x fewer mallocs** — FindAllIndex/FindAllSubmatchIndex use flat buffer + (`compactToSliceOfSlice`): N matches → 2 allocations instead of N+1. + +- **Local SearchState cache** on Engine — atomic.Pointer single-slot cache + survives GC, avoids sync.Pool re-allocation overhead. + +- **Tiny NFA → UseDFA routing** — patterns with < 20 NFA states now use + bidirectional DFA (was PikeVM). 7x faster DFA vs PikeVM on large inputs. + +### Added +- **`AllIndex(b []byte) iter.Seq[[2]int]`** — zero-alloc match index iterator (Go 1.23+) +- **`AllStringIndex(s string) iter.Seq[[2]int]`** — string version +- **`All(b []byte) iter.Seq[[]byte]`** — zero-alloc match content iterator +- **`AllString(s string) iter.Seq[string]`** — string version +- **`AppendAllIndex(dst [][2]int, b []byte, n int) [][2]int`** — buffer-reuse API +- **`AppendAllStringIndex(dst [][2]int, s string, n int) [][2]int`** — string version + +Naming follows Go proposal #61902 (regexp iterator methods) and `strconv.Append*` convention. 
+ +### Fixed +- DFA `isMatchWithPrefilter` pfSkip off-by-one — `zx+` on "zzx" now correct +- DFA multiline `$` EndLine look-ahead — `(?m)hello$` now matches before `\n` + +### Benchmarks (LangArena LogParser, 7.2 MB, 13 patterns) + +| Metric | v0.12.20 | v0.12.21 | Improvement | +|--------|----------|----------|-------------| +| Total time (FindAll) | 163ms | **107ms** | **-34%** | +| errors pattern | 23ms | **8ms** (FindAll) / **5.5ms** (AllIndex) | **-65% / -76%** | +| vs Rust gap | 3.9x | **2.9x** (FindAll) / **1.7x** (AllIndex) | **-56%** | +| Mallocs/iter | 203K | **182** | **-99.9%** | + +### Zero-Alloc API Benchmarks (new methods vs stdlib-compat) + +| Method | errors (33K matches) | Alloc | vs Rust | +|--------|---------------------|-------|---------| +| FindAllStringIndex (stdlib) | 8.2ms / 3890 KB | 19 mallocs | 2.6x slower | +| **AllIndex (iter.Seq)** | **5.9ms / 0 KB** | **0 mallocs** | **1.7x** | +| **AppendAllIndex (reuse)** | **5.5ms / 0 KB** | **0 mallocs** | **1.7x** | +| Rust find_iter | 3.2ms / 0 | 0 | — | + +emails pattern: `AppendAllIndex` **2.0ms vs Rust 2.6ms** — **faster than Rust!** + ## [0.12.20] - 2026-03-25 ### Performance diff --git a/README.md b/README.md index 3b4f4f0..d4314b3 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ Cross-language benchmarks on 6MB input, AMD EPYC ([source](https://github.com/ko - Multi-pattern (`foo|bar|baz|...`) — Slim Teddy (≤32), Fat Teddy (33-64), or Aho-Corasick (>64) - Anchored alternations (`^(\d+|UUID|hex32)`) — O(1) branch dispatch (5-20x) - Concatenated char classes (`[a-zA-Z]+[0-9]+`) — DFA with byte classes (5-7x) +- **Zero-alloc iterators** (`AllIndex`, `AppendAllIndex`) — 0 heap allocs, up to **30% faster** than FindAll. Email pattern **faster than Rust** with `AppendAllIndex`. 
## Features @@ -130,11 +131,28 @@ Supported methods: ### Zero-Allocation APIs ```go -// Zero allocations — returns bool +// Zero allocations — boolean match matched := re.IsMatch(text) -// Zero allocations — returns (start, end, found) +// Zero allocations — single match indices start, end, found := re.FindIndices(text) + +// Zero allocations — iterator over all matches (Go 1.23+) +for m := range re.AllIndex(data) { + fmt.Printf("match at [%d, %d]\n", m[0], m[1]) +} + +// Zero allocations — match content iterator +for s := range re.AllString(text) { + fmt.Println(s) +} + +// Buffer-reuse — append to caller's slice (strconv.Append* pattern) +var buf [][2]int +for _, chunk := range chunks { + buf = re.AppendAllIndex(buf[:0], chunk, -1) + process(buf) +} ``` ### Configuration diff --git a/ROADMAP.md b/ROADMAP.md index d38bd6b..6367799 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -97,8 +97,11 @@ v0.12.18 ✅ → Flat DFA transition table, integrated prefilter, PikeVM skip-ah ↓ v0.12.19 ✅ → Zero-alloc FindSubmatch, byte-based DFA cache, Rust-aligned visited limits ↓ -v0.12.20 (Current) → Premultiplied/tagged StateIDs, break-at-match DFA determinize, - Phase 3 elimination (2-pass bidirectional DFA) +v0.12.20 ✅ → Premultiplied/tagged StateIDs, break-at-match DFA determinize, + Phase 3 elimination (2-pass bidirectional DFA) + ↓ +v0.12.21 (Current) → Tagged start states, zero-alloc API (AllIndex iter.Seq), + 1100x fewer mallocs, UseDFA for tiny NFA, -32% LangArena ↓ v1.0.0-rc → Feature freeze, API locked ↓ diff --git a/dfa/lazy/builder.go b/dfa/lazy/builder.go index 4c3f065..1880c9a 100644 --- a/dfa/lazy/builder.go +++ b/dfa/lazy/builder.go @@ -64,6 +64,9 @@ func (b *Builder) Build() (*DFA, error) { // Check if the NFA contains word boundary assertions hasWordBoundary := b.checkHasWordBoundary() + // Check if the NFA contains EndLine ($) assertions + hasEndLine := b.checkHasEndLine() + // Check if the pattern is always anchored (has ^ prefix) isAlwaysAnchored := 
b.nfa.IsAlwaysAnchored() @@ -80,6 +83,7 @@ func (b *Builder) Build() (*DFA, error) { byteClasses: b.nfa.ByteClasses(), unanchoredStart: b.nfa.StartUnanchored(), hasWordBoundary: hasWordBoundary, + hasEndLine: hasEndLine, isAlwaysAnchored: isAlwaysAnchored, startByteMap: startByteMap, } @@ -706,3 +710,23 @@ func (b *Builder) checkHasWordBoundary() bool { } return false } + +// checkHasEndLine checks if the NFA contains EndLine ($) look assertions. +// When true, determinize performs look-ahead re-computation on '\n' bytes. +// Computed once at DFA build time for O(1) check in hot loop. +func (b *Builder) checkHasEndLine() bool { + numStates := b.nfa.States() + for i := nfa.StateID(0); int(i) < numStates; i++ { + state := b.nfa.State(i) + if state == nil { + continue + } + if state.Kind() == nfa.StateLook { + look, _ := state.Look() + if look == nfa.LookEndLine { + return true + } + } + } + return false +} diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go index 8610a54..a48b532 100644 --- a/dfa/lazy/lazy.go +++ b/dfa/lazy/lazy.go @@ -78,6 +78,11 @@ type DFA struct { // When false, we can skip expensive word boundary checks in the search loop. hasWordBoundary bool + // hasEndLine is true if the NFA contains EndLine ($) look assertions. + // When true, determinize performs look-ahead re-computation on '\n' bytes. + // When false (most patterns), this check is skipped entirely. + hasEndLine bool + // isAlwaysAnchored is true if the pattern is inherently anchored (has ^ prefix). // When true, we only need to try matching from position 0. 
isAlwaysAnchored bool @@ -375,31 +380,11 @@ func (d *DFA) searchFirstAt(cache *DFACache, haystack []byte, startPos int) int canUnroll := !d.hasWordBoundary ftLen := len(ft) - startSID := startState.id hasPre := d.prefilter != nil for pos < end { - // Prefilter skip-ahead at start state - if hasPre && sid == startSID && lastMatch < 0 && pos > startPos { - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return lastMatch - } - if candidate > pos { - pos = candidate - newStart := d.getStartStateForUnanchored(cache, haystack, pos) - if newStart == nil { - return d.nfaFallback(haystack, startPos) - } - sid = newStart.id - startSID = sid - ft = cache.flatTrans - ftLen = len(ft) - } - } - // === 4x UNROLLED FAST PATH === - // With match delay, tagged states (including match) break to slow path. + // With match delay, tagged states (including match, start) break to slow path. if canUnroll && pos+3 < end { if sid.Offset()+stride > ftLen { goto searchFirstSlowPath @@ -452,6 +437,25 @@ func (d *DFA) searchFirstAt(cache *DFACache, haystack []byte, startPos int) int break } + // Start state prefilter skip-ahead (Rust find_fwd_imp). 
+ if sid.IsStartTag() && hasPre && lastMatch < 0 && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return lastMatch + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, startPos) + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + if d.hasWordBoundary { st := cache.getState(sid) if st != nil && st.checkWordBoundaryFast(haystack[pos]) { @@ -526,12 +530,10 @@ func (d *DFA) IsMatch(cache *DFACache, haystack []byte) bool { return d.matchesEmpty(cache) } - // Use prefilter for acceleration if available - if d.prefilter != nil { - return d.isMatchWithPrefilter(cache, haystack) - } - - // No prefilter: use optimized DFA search with early termination + // With tagged start states, searchEarliestMatch handles prefilter correctly: + // start-tagged states always enter slow path where prefilter skip-ahead + // runs only at start states — no O(n^2) on start state self-loop. + // This replaces the separate isMatchWithPrefilter path. return d.searchEarliestMatch(cache, haystack, 0) } @@ -550,136 +552,6 @@ func (d *DFA) IsMatchAt(cache *DFACache, haystack []byte, at int) bool { return d.searchEarliestMatch(cache, haystack, at) } -// isMatchWithPrefilter uses an integrated prefilter+DFA loop (Rust approach). -// -// Instead of two separate passes (prefilter.Find → DFA.searchAnchored → repeat), -// this runs a single DFA loop where dead-state transitions trigger prefilter -// skip-ahead. This eliminates Go function call overhead between passes and -// avoids redundant start-state setup on each candidate. -// -// Reference: rust regex-automata hybrid/search.rs find_fwd_imp — prefilter -// is called inside the DFA loop when returning to start state. 
-func (d *DFA) isMatchWithPrefilter(cache *DFACache, haystack []byte) bool { - // If prefilter is complete, its match is sufficient - if d.prefilter.IsComplete() { - return d.prefilter.Find(haystack, 0) != -1 - } - - // Find first candidate to start DFA from - pos := d.prefilter.Find(haystack, 0) - if pos == -1 { - return false - } - - // Get anchored start state at candidate position - currentState := d.getStartState(cache, haystack, pos, true) - if currentState == nil { - return d.isMatchWithPrefilterFallback(cache, haystack, pos) - } - // With 1-byte match delay, start states are never match states. - - endPos := len(haystack) - sid := currentState.id - ft := cache.flatTrans - ftLen := len(ft) - - for pos < endPos { - if d.hasWordBoundary { - st := cache.getState(sid) - if st != nil && st.checkWordBoundaryFast(haystack[pos]) { - return true - } - } - - classIdx := int(d.byteToClass(haystack[pos])) - offset := sid.Offset() + classIdx - var nextID StateID - if offset < ftLen { - nextID = ft[offset] - } else { - nextID = InvalidState - } - - switch nextID { - case InvalidState: - currentState = cache.getState(sid) - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, pos) - return matched && start >= 0 && end >= start - } - nextState, err := d.determinize(cache, currentState, haystack[pos]) - if err != nil { - start, end, matched := d.pikevm.SearchAt(haystack, pos) - return matched && start >= 0 && end >= start - } - if nextState == nil { - goto pfSkip - } - sid = nextState.id - ft = cache.flatTrans - ftLen = len(ft) - - case DeadState: - goto pfSkip - - default: - sid = nextID - } - - pos++ - // 1-byte match delay: check after transition - if cache.IsMatchState(sid) { - return true - } - continue - - pfSkip: - pos++ - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return false - } - pos = candidate - - newStart := d.getStartState(cache, haystack, pos, true) - if newStart == nil { - return 
d.isMatchWithPrefilterFallback(cache, haystack, pos) - } - sid = newStart.id - ft = cache.flatTrans - ftLen = len(ft) - // With match delay, start states are never match — continue loop. - } - - eoi := cache.getState(sid) - if eoi != nil { - return d.checkEOIMatch(eoi) - } - return false -} - -// isMatchWithPrefilterFallback is the old two-pass approach used when -// DFA start state cannot be obtained (NFA fallback needed). -func (d *DFA) isMatchWithPrefilterFallback(cache *DFACache, haystack []byte, pos int) bool { - // Try anchored DFA search at current position - if d.searchEarliestMatchAnchored(cache, haystack, pos) { - return true - } - // Continue with remaining candidates - for pos < len(haystack) { - pos++ - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return false - } - pos = candidate - if d.searchEarliestMatchAnchored(cache, haystack, pos) { - return true - } - } - return false -} - // searchEarliestMatch performs DFA search with early termination. // Returns true as soon as any match state is reached. // This is faster than searchAt because it doesn't track match positions @@ -723,6 +595,8 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int _ = ft[ftLen-1] } + hasPre := d.prefilter != nil + for pos < endPos { // === 4x UNROLLED FAST PATH (earliest match) === // For IsMatch(), we return true on ANY match, so no leftmost-longest tracking. @@ -802,6 +676,45 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int break } + // Start state prefilter skip-ahead (Rust find_fwd_imp:232-261). + // Start-tagged states ALWAYS enter slow path (never unrolled fast path), + // so prefilter check happens only here — no O(n^2) on start state self-loop. 
+ if sid.IsStartTag() { + if hasPre && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return false + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + start, end, matched := d.pikevm.SearchAt(haystack, startPos) + return matched && start >= 0 && end >= start + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + // Start state fast transition: skip getState/acceleration. + classIdx := int(d.byteToClass(haystack[pos])) + offset := sid.Offset() + classIdx + if offset < ftLen { + nextID := ft[offset] + if nextID != InvalidState && nextID != DeadState { + sid = nextID + pos++ + if cache.IsMatchState(sid) { + return true + } + continue + } + } + // InvalidState/DeadState: fall through to full slow path + } + // Try lazy acceleration detection if not yet checked currentState = cache.getState(sid) if currentState == nil { @@ -846,21 +759,18 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int // Determinize on demand nextState, err := d.determinize(cache, currentState, b) if err != nil { - // Cache cleared or full — fall back to NFA from original start position. start, end, matched := d.pikevm.SearchAt(haystack, startPos) return matched && start >= 0 && end >= start } if nextState == nil { - // Dead state - no match possible from here - return false + goto earliestPreSkip } sid = nextState.id ft = cache.flatTrans ftLen = len(ft) case DeadState: - // Dead state - no match possible from here - return false + goto earliestPreSkip default: sid = nextID @@ -872,6 +782,31 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int if cache.IsMatchState(sid) { return true } + continue + + earliestPreSkip: + // Dead state with prefilter: advance past failed byte, find next candidate. + // Without prefilter: dead = no match. 
+ if !hasPre { + return false + } + pos++ + if pos >= endPos { + return false + } + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return false + } + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + start, end, matched := d.pikevm.SearchAt(haystack, startPos) + return matched && start >= 0 && end >= start + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) } // Reached end of input without finding a match in the loop. @@ -1006,9 +941,27 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) sid := currentState.id ft := cache.flatTrans ftLen := len(ft) - startSID := sid for pos < len(haystack) { + // Start state prefilter skip-ahead (Rust find_fwd_imp). + if sid.IsStartTag() && lastMatch < 0 && pos > startAt { + candidate = d.prefilter.Find(haystack, pos) + if candidate == -1 { + return -1 + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, 0) + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + if d.hasWordBoundary { st := cache.getState(sid) if st != nil && d.checkWordBoundaryMatch(st, haystack[pos]) { @@ -1039,7 +992,6 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) return d.nfaFallback(haystack, 0) } sid = newStart.id - startSID = sid ft = cache.flatTrans ftLen = len(ft) continue @@ -1062,7 +1014,6 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) return d.nfaFallback(haystack, 0) } sid = newStart.id - startSID = sid ft = cache.flatTrans ftLen = len(ft) lastMatch = -1 @@ -1087,7 +1038,6 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) return d.nfaFallback(haystack, 0) } sid = newStart.id - startSID = sid ft = cache.flatTrans ftLen = len(ft) lastMatch = -1 @@ -1103,17 +1053,6 @@ func (d *DFA) 
findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) } pos++ - - // Start state prefilter skip-ahead - if lastMatch < 0 && sid == startSID && pos < len(haystack) { - candidate = d.prefilter.Find(haystack, pos) - if candidate == -1 { - return -1 - } - if candidate > pos { - pos = candidate - } - } } // EOI check for delayed match @@ -1195,35 +1134,16 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n _ = ft[ftLen-1] } - startSID := currentState.id hasPre := d.prefilter != nil for pos < end { - // Prefilter skip-ahead at start state (Rust hybrid/search.rs:232-258) - if hasPre && sid == startSID && lastMatch < 0 && pos > startPos { - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return lastMatch - } - if candidate > pos { - pos = candidate - newStart := d.getStartStateForUnanchored(cache, haystack, pos) - if newStart == nil { - return d.nfaFallback(haystack, startPos) - } - sid = newStart.id - startSID = sid - ft = cache.flatTrans - ftLen = len(ft) - } - } - // === 4x UNROLLED FAST PATH === // Process 4 transitions per iteration when conditions allow. // With match delay, match states break out of the unrolled loop // to the slow path for proper handling. + // Start-tagged states also break to slow path for prefilter skip-ahead. if canUnroll && pos+3 < end { - // Check acceleration on slow→fast transition + // Check acceleration on slow->fast transition accelState := cache.getState(sid) if accelState != nil && accelState.IsAccelerable() { goto slowPath @@ -1282,6 +1202,45 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n break } + // Start state prefilter skip-ahead (Rust find_fwd_imp:232-261). + // Start-tagged states always enter slow path, enabling prefilter + // check only here — no O(n^2) on start state self-loop. 
+ if sid.IsStartTag() { + if hasPre && lastMatch < 0 && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return lastMatch + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, startPos) + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + // Start state fast transition: skip getState/acceleration (start is never + // accelerable). Do direct flatTrans lookup — same cost as searchFirstAt. + classIdx := int(d.byteToClass(haystack[pos])) + offset := sid.Offset() + classIdx + if offset < ftLen { + nextID := ft[offset] + if nextID != InvalidState && nextID != DeadState { + sid = nextID + if cache.IsMatchState(sid) { + lastMatch = pos + } + pos++ + continue + } + } + // InvalidState/DeadState: fall through to full slow path + } + // Resolve State for slow path (acceleration, word boundary, determinize). currentState = cache.getState(sid) if currentState == nil { @@ -1381,11 +1340,22 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // The actual byte value is still used for NFA move operations classIdx := d.byteToClass(b) + // Look-ahead re-computation (Rust determinize mod.rs:131-212): + // Before checking for matches, resolve look-ahead assertions that depend + // on the current input byte. When input is '\n', EndLine ($) is satisfied + // for the CURRENT state, unlocking paths through $ assertions. + // This re-runs epsilon closure on the current state's NFA IDs with the + // new look-ahead, potentially adding Match states behind $ assertions. + currentNFAStates := current.NFAStates() + if d.hasEndLine && b == '\n' { + currentNFAStates = builder.epsilonClosure(currentNFAStates, LookEndLine) + } + // 1-byte match delay (Rust determinize mod.rs:254-286): // Check if source (current) state's NFA states contain a match state. 
// The NEW DFA state will be tagged as match if the OLD state had NFA match. // This delays match reporting by 1 byte, enabling correct look-around (^, $, \b). - sourceHasMatch := builder.containsMatchState(current.NFAStates()) + sourceHasMatch := builder.containsMatchState(currentNFAStates) // Compute next NFA state set via move operation WITH word context. // Leftmost-first (Rust determinize::next mod.rs:284): @@ -1394,7 +1364,7 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // processed, causing the DFA to reach dead state with the committed match. // BreakAtMatch is disabled for reverse DFAs to allow finding leftmost start. breakAtMatch := sourceHasMatch && d.config.BreakAtMatch - nextNFAStates := builder.moveWithWordContextBreak(current.NFAStates(), b, current.IsFromWord(), breakAtMatch) + nextNFAStates := builder.moveWithWordContextBreak(currentNFAStates, b, current.IsFromWord(), breakAtMatch) isMatch := sourceHasMatch @@ -1520,6 +1490,9 @@ func (d *DFA) tryClearCache(cache *DFACache) error { _, _ = cache.Insert(key, startState) // Cannot fail: cache was just cleared cache.registerState(startState) + // Tag as start state (same as getStartState) + startState.id = startState.id.WithStartTag() + // Cache the default start state in StartTable cache.startTable.Set(StartText, false, startState.ID()) @@ -1625,6 +1598,12 @@ func (d *DFA) getStartState(cache *DFACache, haystack []byte, pos int, anchored cache.registerState(insertedState) } + // Tag this state as a start state (Rust LazyStateID start tag approach). + // Start-tagged IDs always enter the slow path in the DFA hot loop, + // enabling prefilter skip-ahead ONLY at start states (not every byte). + // Offset() strips tags, so flatTrans lookups still work correctly. 
+ insertedState.id = insertedState.id.WithStartTag() + // Cache in StartTable for fast lookup next time cache.startTable.Set(kind, anchored, insertedState.ID()) @@ -2169,6 +2148,9 @@ func (d *DFA) getStartStateForReverse(cache *DFACache, haystack []byte, end int) cache.registerState(insertedState) } + // Tag as start state (same as forward getStartState) + insertedState.id = insertedState.id.WithStartTag() + cache.startTable.Set(kind, false, insertedState.ID()) return insertedState } diff --git a/dfa/lazy/state.go b/dfa/lazy/state.go index 9ff197b..abdee8a 100644 --- a/dfa/lazy/state.go +++ b/dfa/lazy/state.go @@ -91,6 +91,15 @@ func (sid StateID) IsInvalidTag() bool { return sid&tagInvalid != 0 } +// IsStartTag returns true if this state has the start tag. +// Start-tagged states always enter the slow path in the DFA search loop, +// enabling prefilter skip-ahead only at start states (Rust LazyStateID approach). +// +//go:nosplit +func (sid StateID) IsStartTag() bool { + return sid&tagStart != 0 +} + // WithMatchTag returns a copy of this StateID with the match tag set. func (sid StateID) WithMatchTag() StateID { return sid | tagMatch diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index f29a7b6..d0ed272 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -97,16 +97,18 @@ Input → Prefilter (memchr/memmem/teddy) → Engine Search → Match Result ↓ ↓ ↓ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ SearchState│ │ SearchState│ │ SearchState│ ← Per-goroutine - │(goroutine1)│ │(goroutine2)│ │(goroutine3)│ (sync.Pool) + │(goroutine1)│ │(goroutine2)│ │(goroutine3)│ (atomic local + sync.Pool) └────────────┘ └────────────┘ └────────────┘ ``` +First goroutine uses atomic local cache (survives GC), concurrent goroutines fall back to sync.Pool. + ## Key Design Decisions 1. **Multi-engine**: Strategy selection at compile time, not runtime 2. **Rust reference**: Architecture mirrors Rust regex crate (lazy DFA, PikeVM, prefilters) 3. 
**Leftmost-first match**: DFA break-at-match matches Rust semantics (verified via cargo run) -4. **Zero-alloc hot paths**: `IsMatch()`, `FindIndices()`, `Count()` — no heap allocation +4. **Zero-alloc hot paths**: `IsMatch()`, `FindIndices()`, `Count()`, `AllIndex()` iterator — no heap allocation 5. **SIMD first**: AVX2/SSSE3 prefilters for x86_64, pure Go fallback for other archs ## References diff --git a/docs/STDLIB_COMPATIBILITY.md b/docs/STDLIB_COMPATIBILITY.md index 012eba8..f02fb2e 100644 --- a/docs/STDLIB_COMPATIBILITY.md +++ b/docs/STDLIB_COMPATIBILITY.md @@ -151,6 +151,46 @@ input := "world\nhello\ntest" **Note:** Common case-insensitive patterns work correctly. This affects only complex edge cases with overlapping matches. +## coregex Extensions (beyond stdlib) + +These methods are NOT in Go's stdlib `regexp` but provide zero-allocation alternatives: + +### Iterator API (Go 1.23+, zero allocation) + +```go +// Iterate over all match indices — zero heap allocation +for m := range re.AllIndex(data) { + fmt.Printf("[%d, %d]\n", m[0], m[1]) +} + +// Iterate over match content — zero copy +for s := range re.AllString(text) { + fmt.Println(s) +} +``` + +Methods: `AllIndex`, `AllStringIndex`, `All`, `AllString` + +Naming follows Go proposal [#61902](https://github.com/golang/go/issues/61902) (regexp iterator methods). 
+ +### Buffer-Reuse API (zero allocation with reused buffer) + +```go +// Append matches to caller's buffer — strconv.Append* pattern +var buf [][2]int +buf = re.AppendAllIndex(buf[:0], data, -1) +``` + +Methods: `AppendAllIndex`, `AppendAllStringIndex` + +### Zero-Allocation Search + +```go +re.IsMatch(data) // bool, zero alloc +start, end, found := re.FindIndices(data) // indices, zero alloc +count := re.Count(data, -1) // count, zero alloc +``` + ## Migration Guide ### Step 1: Simple Find-and-Replace diff --git a/meta/compile.go b/meta/compile.go index 1a60f34..a6715b5 100644 --- a/meta/compile.go +++ b/meta/compile.go @@ -603,7 +603,9 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { // Initialize state pool for thread-safe concurrent searches numCaptures := nfaEngine.CaptureCount() - return &Engine{ + ssCfg := buildSearchStateConfig(pikevmNFA, numCaptures, engines, strategy) + + eng := &Engine{ nfa: nfaEngine, runeNFA: runeNFAEngine, asciiNFA: asciiNFAEngine, @@ -636,11 +638,15 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { canMatchEmpty: canMatchEmpty, isStartAnchored: isStartAnchored, fatTeddyFallback: fatTeddyFallback, - statePool: newSearchStatePool(buildSearchStateConfig( - pikevmNFA, numCaptures, engines, strategy, - )), - stats: Stats{}, - }, nil + statePool: newSearchStatePool(ssCfg), + stats: Stats{}, + } + + // Eagerly create one SearchState and store it in the local GC-proof cache. + // This ensures the first search call doesn't allocate via sync.Pool. + eng.localState.Store(newSearchState(ssCfg)) + + return eng, nil } // adjustForAnchors fixes prefilter for patterns with anchors. 
diff --git a/meta/engine.go b/meta/engine.go index dd1d5ca..10227e5 100644 --- a/meta/engine.go +++ b/meta/engine.go @@ -5,6 +5,8 @@ package meta import ( + "sync/atomic" + "github.com/coregx/ahocorasick" "github.com/coregx/coregex/dfa/lazy" "github.com/coregx/coregex/dfa/onepass" @@ -123,6 +125,16 @@ type Engine struct { // This enables concurrent searches on the same Engine instance. statePool *searchStatePool + // localState is a single-slot GC-proof cache for the common single-goroutine path. + // Unlike sync.Pool entries which are collected every GC cycle, this pointer is a + // strong reference that survives GC indefinitely. On LangArena (13 patterns × 10 + // iterations), this eliminates ~221 MB of DFACache re-allocation caused by GC + // clearing the sync.Pool between iterations. + // + // Thread safety: atomic swap ensures only one goroutine gets the cached state. + // Additional concurrent goroutines fall through to statePool. + localState atomic.Pointer[SearchState] + // longest enables leftmost-longest (POSIX) matching semantics // By default (false), uses leftmost-first (Perl) semantics longest bool @@ -243,11 +255,16 @@ func (e *Engine) SetLongest(longest bool) { } } -// getSearchState retrieves a SearchState from the pool. +// getSearchState retrieves a SearchState, trying the local GC-proof cache first. // Caller must call putSearchState when done. // The returned state contains its own PikeVM instance for thread-safe concurrent use. func (e *Engine) getSearchState() *SearchState { - state := e.statePool.get() + // Fast path: grab from local cache (survives GC, zero-alloc steady state). + state := e.localState.Swap(nil) + if state == nil { + // Slow path: concurrent access or first call before eager init. 
+ state = e.statePool.get() + } // Initialize state for BoundedBacktracker if needed if e.boundedBacktracker != nil && state.backtracker != nil { @@ -262,7 +279,18 @@ func (e *Engine) getSearchState() *SearchState { return state } -// putSearchState returns a SearchState to the pool. +// putSearchState returns a SearchState, trying the local cache first. +// The local cache slot holds one state as a strong reference that survives GC. +// Overflow goes to sync.Pool (may be collected by GC). func (e *Engine) putSearchState(state *SearchState) { + if state == nil { + return + } + state.reset() + // Try to store in local cache (GC-proof single slot). + if e.localState.CompareAndSwap(nil, state) { + return + } + // Local slot occupied (concurrent goroutine), fall back to pool. e.statePool.put(state) } diff --git a/meta/find_indices.go b/meta/find_indices.go index cd794e3..111d387 100644 --- a/meta/find_indices.go +++ b/meta/find_indices.go @@ -219,6 +219,12 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) { func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { //nolint:cyclop // DFA with prefilter paths atomic.AddUint64(&e.stats.DFASearches, 1) + // Longest (POSIX) mode: DFA uses leftmost-first (break-at-match), which is + // incompatible with leftmost-longest semantics. Fall back to PikeVM. + if e.longest { + return e.pikevm.Search(haystack) + } + // Literal fast path — complete prefilter returns match directly if e.prefilter != nil && e.prefilter.IsComplete() { pos := e.prefilter.Find(haystack, 0) @@ -328,6 +334,11 @@ func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { //nolint:cyc func (e *Engine) findIndicesDFAAt(haystack []byte, at int) (int, int, bool) { atomic.AddUint64(&e.stats.DFASearches, 1) + // Longest (POSIX) mode: DFA uses leftmost-first, fall back to PikeVM. + if e.longest { + return e.pikevm.SearchAt(haystack, at) + } + // Prefilter skip-ahead — safe for all prefilters, DFA verifies. 
if e.prefilter != nil { pos := e.prefilter.Find(haystack, at) @@ -356,6 +367,86 @@ func (e *Engine) findIndicesDFAAt(haystack []byte, at int) (int, int, bool) { return e.pikevm.SearchAt(haystack, at) } +// findIndicesDFAAtWithState searches using DFA starting at position, reusing provided state. +// Eliminates per-match sync.Pool overhead when called from FindAll/Count loops. +func (e *Engine) findIndicesDFAAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) { + atomic.AddUint64(&e.stats.DFASearches, 1) + + // Longest (POSIX) mode: DFA uses leftmost-first, fall back to PikeVM. + if e.longest { + return state.pikevm.SearchAt(haystack, at) + } + + // Prefilter skip-ahead — safe for all prefilters, DFA verifies. + if e.prefilter != nil { + pos := e.prefilter.Find(haystack, at) + if pos == -1 { + return -1, -1, false + } + atomic.AddUint64(&e.stats.PrefilterHits, 1) + // Bidirectional DFA: forward DFA → end, reverse DFA → start. O(n) total. + if e.reverseDFA != nil { + return e.findIndicesBidirectionalDFACore(haystack, pos, state) + } + return state.pikevm.SearchAt(haystack, pos) + } + + if e.reverseDFA != nil { + return e.findIndicesBidirectionalDFACore(haystack, at, state) + } + matched := e.dfa.IsMatchAt(state.dfaCache, haystack, at) + if !matched { + return -1, -1, false + } + + // DFA confirmed a match exists - use PikeVM for exact bounds + return state.pikevm.SearchAt(haystack, at) +} + +// findIndicesAdaptiveAtWithState tries prefilter+DFA first, falls back to NFA. +// Reuses provided SearchState to eliminate per-match sync.Pool overhead. 
+func (e *Engine) findIndicesAdaptiveAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) { + // Use prefilter if available for fast candidate finding + if e.prefilter != nil && e.dfa != nil { + pos := e.prefilter.Find(haystack, at) + if pos == -1 { + return -1, -1, false + } + atomic.AddUint64(&e.stats.PrefilterHits, 1) + atomic.AddUint64(&e.stats.DFASearches, 1) + + // Literal fast path + if e.prefilter.IsComplete() { + literalLen := e.prefilter.LiteralLen() + if literalLen > 0 { + return pos, pos + literalLen, true + } + } + + // Search from prefilter position - O(m) not O(n) + return state.pikevm.SearchAt(haystack, pos) + } + + // Try DFA without prefilter + if e.dfa != nil { + atomic.AddUint64(&e.stats.DFASearches, 1) + endPos := e.dfa.FindAt(state.dfaCache, haystack, at) + if endPos != -1 { + // Use estimated start for O(m) search + estimatedStart := at + if endPos > at+100 { + estimatedStart = endPos - 100 + } + return state.pikevm.SearchAt(haystack, estimatedStart) + } + size, capacity, _, _, _ := e.dfa.CacheStats(state.dfaCache) + if size >= int(capacity)*9/10 { + atomic.AddUint64(&e.stats.DFACacheFull, 1) + } + } + return e.findIndicesNFAAtWithState(haystack, at, state) +} + // findIndicesAdaptive tries prefilter+DFA first, falls back to NFA - zero alloc. func (e *Engine) findIndicesAdaptive(haystack []byte) (int, int, bool) { // Use prefilter if available for fast candidate finding @@ -586,6 +677,13 @@ func (e *Engine) findIndicesBidirectionalDFA(haystack []byte, at int) (int, int, atomic.AddUint64(&e.stats.DFASearches, 1) state := e.getSearchState() defer e.putSearchState(state) + return e.findIndicesBidirectionalDFACore(haystack, at, state) +} + +// findIndicesBidirectionalDFACore is the poolless core of bidirectional DFA search. +// Caller must provide a valid SearchState (either from pool or already held). +// Used by findAllIndicesLoop and Count to avoid per-match pool round-trips. 
+func (e *Engine) findIndicesBidirectionalDFACore(haystack []byte, at int, state *SearchState) (int, int, bool) { // Forward DFA: leftmost-first match end (matches Rust find_fwd) end := e.dfa.SearchAt(state.dfaCache, haystack, at) if end == -1 { @@ -947,6 +1045,48 @@ func (e *Engine) findIndicesDigitPrefilterAt(haystack []byte, at int) (int, int, return -1, -1, false } +// findIndicesDigitPrefilterAtWithState searches using digit prefilter, reusing provided state. +// Eliminates per-match sync.Pool overhead when called from FindAll/Count loops. +func (e *Engine) findIndicesDigitPrefilterAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) { + if e.digitPrefilter == nil || at >= len(haystack) { + return e.findIndicesNFAAtWithState(haystack, at, state) + } + + atomic.AddUint64(&e.stats.PrefilterHits, 1) + pos := at + + for pos < len(haystack) { + digitPos := e.digitPrefilter.Find(haystack, pos) + if digitPos < 0 { + return -1, -1, false + } + + if e.dfa != nil { + atomic.AddUint64(&e.stats.DFASearches, 1) + // Use anchored search - pattern MUST start at digitPos + endPos := e.dfa.SearchAtAnchored(state.dfaCache, haystack, digitPos) + if endPos != -1 { + return digitPos, endPos, true + } + } else { + atomic.AddUint64(&e.stats.NFASearches, 1) + start, end, found := state.pikevm.SearchAt(haystack, digitPos) + if found { + return start, end, true + } + } + + pos = digitPos + 1 + if e.digitRunSkipSafe { + for pos < len(haystack) && haystack[pos] >= '0' && haystack[pos] <= '9' { + pos++ + } + } + } + + return -1, -1, false +} + // findIndicesAhoCorasick returns indices using Aho-Corasick - zero alloc. 
func (e *Engine) findIndicesAhoCorasick(haystack []byte) (int, int, bool) { if e.ahoCorasick == nil { @@ -994,11 +1134,9 @@ func (e *Engine) findIndicesAtWithState(haystack []byte, at int, state *SearchSt case UseNFA: return e.findIndicesNFAAtWithState(haystack, at, state) case UseDFA: - // DFA uses e.pikevm (shared) for final bounds, not pooled state - return e.findIndicesDFAAt(haystack, at) + return e.findIndicesDFAAtWithState(haystack, at, state) case UseBoth: - // Adaptive uses e.pikevm (shared) or delegates to NFA path - return e.findIndicesAdaptiveAt(haystack, at) + return e.findIndicesAdaptiveAtWithState(haystack, at, state) case UseReverseSuffix: return e.reverseSuffixSearcher.FindIndicesAtWithCaches(haystack, at, state.stratFwdCache, state.stratRevCache) case UseReverseSuffixSet: @@ -1016,7 +1154,7 @@ func (e *Engine) findIndicesAtWithState(haystack []byte, at int, state *SearchSt case UseTeddy: return e.findIndicesTeddyAt(haystack, at) case UseDigitPrefilter: - return e.findIndicesDigitPrefilterAt(haystack, at) + return e.findIndicesDigitPrefilterAtWithState(haystack, at, state) case UseAhoCorasick: return e.findIndicesAhoCorasickAt(haystack, at) case UseMultilineReverseSuffix: diff --git a/meta/findall.go b/meta/findall.go index 9e4713a..3a7eec7 100644 --- a/meta/findall.go +++ b/meta/findall.go @@ -307,9 +307,33 @@ func (e *Engine) Count(haystack []byte, n int) int { state := e.getSearchState() defer e.putSearchState(state) + // DFA fast path: call DFA functions directly, skip meta prefilter layer. + // SearchAt has integrated prefilter at start state — no duplicate scan. 
+ useDFADirect := (e.strategy == UseDFA || e.strategy == UseBoth) && + e.dfa != nil && e.reverseDFA != nil && + state.dfaCache != nil && state.revDFACache != nil + for pos <= len(haystack) { - // Use state-reusing version for zero sync.Pool overhead per match - start, end, found := e.findIndicesAtWithState(haystack, pos, state) + var start, end int + var found bool + + if useDFADirect { + matchEnd := e.dfa.SearchAt(state.dfaCache, haystack, pos) + if matchEnd < 0 { + break + } + if matchEnd == pos { + start, end, found = pos, pos, true + } else { + matchStart := e.reverseDFA.SearchReverse(state.revDFACache, haystack, pos, matchEnd) + if matchStart < 0 { + break + } + start, end, found = matchStart, matchEnd, true + } + } else { + start, end, found = e.findIndicesAtWithState(haystack, pos, state) + } if !found { break } diff --git a/meta/strategy.go b/meta/strategy.go index 8f4caa0..c68b4de 100644 --- a/meta/strategy.go +++ b/meta/strategy.go @@ -1307,12 +1307,42 @@ func analyzeLiterals(literals *literal.Seq, config Config) literalAnalysis { return result } +// hasWordBoundaryAnchorCombo returns true if the pattern combines word boundary +// assertions (\b, \B) with anchors (^, $) in a way that causes DFA correctness issues. +// Example: `\b^` matches in stdlib but DFA fails to handle the assertion combo. +func hasWordBoundaryAnchorCombo(re *syntax.Regexp) bool { + return hasWordBoundary(re) && hasAnchorAssertions(re) +} + +// hasCaseInsensitiveUnicode returns true if the pattern uses case-insensitive (?i) +// flag with non-ASCII characters. DFA may produce incorrect results for case-folded +// Unicode (e.g., `(?i)привет` matching "ПРИВЕТ" returns partial match). 
+func hasCaseInsensitiveUnicode(re *syntax.Regexp) bool { + if re == nil { + return false + } + // Check if this node has FoldCase flag AND contains non-ASCII literals + if re.Flags&syntax.FoldCase != 0 { + for _, r := range re.Rune { + if r > 127 { + return true + } + } + } + for _, sub := range re.Sub { + if hasCaseInsensitiveUnicode(sub) { + return true + } + } + return false +} + // SelectStrategy analyzes the NFA and literals to choose the best execution strategy. // // Algorithm: // 1. If end-anchored ($ or \z) and not start-anchored → UseReverseAnchored // 2. If DFA disabled in config → UseNFA -// 3. If NFA is tiny (< 20 states) → UseNFA (DFA overhead not worth it) +// 3. If NFA is tiny (< 20 states) → UseDFA (tagged start states enable pure DFA) // 4. If simple character class pattern without literals → UseNFA (DFA overhead not worth it) // 5. If good literals exist → UseDFA (prefilter + DFA is fastest) // 6. If NFA is large (> 100 states) → UseDFA (essential for performance) @@ -1453,27 +1483,27 @@ func SelectStrategy(n *nfa.NFA, re *syntax.Regexp, literals *literal.Seq, config return strategy } - // Tiny NFA with literals: use prefilter + NFA (like Rust) - // For patterns like "j[a-z]+p", DFA construction overhead is not worth it - // on small inputs. NFA with prefilter skip-ahead is faster. - // The prefilter (memchr) jumps to candidates, NFA verifies in O(pattern) time. - if nfaSize < 20 && litAnalysis.hasGoodLiterals { - return UseNFA // findIndicesNFA now uses prefilter for skip-ahead - } - - // Check for simple digit-lead patterns BEFORE tiny NFA fallback. + // Check for simple digit-lead patterns before general DFA routing. // Patterns like `\d+\.\d+\.\d+` (14 NFA states) benefit more from - // DigitPrefilter than plain NFA because SIMD digit scanning skips + // DigitPrefilter than DFA because SIMD digit scanning skips // non-digit regions entirely. 
if shouldUseDigitPrefilter(re, nfaSize, config) { return UseDigitPrefilter } - // Tiny NFA without literals: use PikeVM directly (DFA overhead not worth it) - // For patterns like "a", ".", "[0-9]", the DFA cache lookup and - // determinization overhead exceeds the benefit. + // Small NFA (< 20 states): use pure DFA (no PikeVM verification). + // With tagged start states (Rust LazyStateID approach), DFA search handles + // prefilter correctly: start-tagged states always enter slow path for + // prefilter skip-ahead. This eliminates the O(n^2) that previously blocked + // UseDFA routing. + // Benchmarked: UseDFA is 7x faster than UseBoth for 10-14 state NFA on + // large inputs (7.2 MB), because UseBoth still uses PikeVM verification. + // Guards: some patterns have DFA issues — keep UseNFA for those. if nfaSize < 20 { - return UseNFA + if hasCaseInsensitiveUnicode(re) || hasWordBoundaryAnchorCombo(re) || canMatchEmpty(re) || hasMultilineLineAnchor(re) { + return UseNFA + } + return UseDFA } // Patterns that can match empty string (e.g., `.*`, `a*`, `(a|)`) must use @@ -1564,11 +1594,14 @@ func strategyReasonComplex(strategy Strategy, n *nfa.NFA, literals *literal.Seq, return "DFA disabled in configuration" } if nfaSize < 20 { - return "tiny NFA (< 20 states), DFA overhead not worth it" + return "tiny NFA (< 20 states) with empty-match or special guards" } return "no good literals and small NFA" case UseDFA: + if nfaSize < 20 { + return "tiny NFA (< 20 states), pure DFA with tagged start states" + } if literals != nil && !literals.IsEmpty() { lcp := literals.LongestCommonPrefix() if len(lcp) >= config.MinLiteralLen { diff --git a/meta/strategy_selection_test.go b/meta/strategy_selection_test.go index 9e72c70..030a625 100644 --- a/meta/strategy_selection_test.go +++ b/meta/strategy_selection_test.go @@ -14,10 +14,10 @@ func TestStrategySelectionComprehensive(t *testing.T) { pattern string want Strategy }{ - // ========== UseNFA: tiny patterns without useful literals 
========== - {"nfa_single_char", "a", UseNFA}, - {"nfa_single_char_b", "b", UseNFA}, - {"nfa_two_char_literal", "ab", UseNFA}, + // ========== UseDFA: tiny patterns — pure DFA with tagged start states ========== + {"dfa_single_char", "a", UseDFA}, + {"dfa_single_char_b", "b", UseDFA}, + {"dfa_two_char_literal", "ab", UseDFA}, // ========== UseReverseSuffix: .*suffix patterns ========== {"rsuffix_dot_star_txt", `.*\.txt`, UseReverseSuffix}, diff --git a/meta/strategy_test.go b/meta/strategy_test.go index dbb7da0..268352b 100644 --- a/meta/strategy_test.go +++ b/meta/strategy_test.go @@ -222,7 +222,7 @@ func TestDigitPrefilterStrategySelection(t *testing.T) { {`\d+`, UseCharClassSearcher, "simple \\d+ uses CharClassSearcher"}, // Patterns with good prefix literals use UseDFA - {`123\d+`, UseNFA, "literal prefix uses NFA (tiny pattern with literals)"}, + {`123\d+`, UseDigitPrefilter, "digit-lead literal uses DigitPrefilter"}, // Non-digit patterns should NOT use UseDigitPrefilter {`[a-z]+`, UseCharClassSearcher, "letter class uses CharClassSearcher"}, diff --git a/regex.go b/regex.go index b170659..61c6373 100644 --- a/regex.go +++ b/regex.go @@ -47,6 +47,7 @@ package coregex import ( "io" + "iter" "regexp/syntax" "strings" "unsafe" @@ -696,132 +697,72 @@ func (r *Regex) FindAllIndex(b []byte, n int) [][]int { return nil } - // Fast path: CharClassSearcher uses streaming state machine (single-pass, no per-match overhead) - // This is 2-3x faster than the loop below for patterns like \w+, \d+, [a-z]+ - if r.engine.Strategy() == meta.UseCharClassSearcher { - return r.findAllIndexStreaming(b, n) - } - - var indices [][]int - pos := 0 - lastMatchEnd := -1 // Track where the last non-empty match ended - - for { - // Use zero-allocation FindIndicesAt instead of FindAt (avoids Match object creation) - start, end, found := r.engine.FindIndicesAt(b, pos) - if !found { - break - } - - // Lazy allocation: only allocate once we find the first match - if indices == nil { - // 
Pre-allocate with estimated capacity - // Heuristic: for typical patterns, estimate ~10 matches per 1KB - estimatedCap := len(b) / 100 - if estimatedCap < 4 { - estimatedCap = 4 - } - if n > 0 && estimatedCap > n { - estimatedCap = n - } - indices = make([][]int, 0, estimatedCap) - } - - // Skip empty matches that start exactly where the previous non-empty match ended. - // This matches Go's stdlib behavior: - // - "a*" on "ab" returns [[0 1] [2 2]], not [[0 1] [1 1] [2 2]] - // - After matching "a" at [0,1], an empty match at [1,1] is skipped - // - But empty matches at [2,2] (after the 'b') are allowed - //nolint:gocritic // badCond: intentional - checking empty match (start==end) at lastMatchEnd - if start == end && start == lastMatchEnd { - // Skip this empty match and try at the next position - pos++ - if pos > len(b) { - break - } - continue - } - - indices = append(indices, []int{start, end}) - - // Track non-empty match ends for the skip rule - if start != end { - lastMatchEnd = end - } - - // Move position past this match - switch { - case start == end: - // Empty match: advance by 1 to avoid infinite loop - pos = end + 1 - case end > pos: - pos = end - default: - // Fallback (shouldn't normally happen) - pos++ - } - - if pos > len(b) { - break - } - - // Check limit - if n > 0 && len(indices) >= n { - break - } - } - - return indices + // Use compact [][2]int internally, convert at the boundary. + // This reduces allocations from N+1 (one []int per match) to 2 (one flat buffer + one slice header). + compact := r.engine.FindAllIndicesStreaming(b, n, nil) + return compactToSliceOfSlice(compact) } -// findAllIndexStreaming uses single-pass streaming state machine for CharClassSearcher patterns. -// This avoids per-match function call overhead (2-3x faster than the loop approach). -// CharClassSearcher patterns like \w+, \d+, [a-z]+ cannot produce empty matches (minMatch=1), -// so the empty match handling logic is not needed here. 
-func (r *Regex) findAllIndexStreaming(b []byte, n int) [][]int { - // Get streaming results ([][2]int format) - streamResults := r.engine.FindAllIndicesStreaming(b, n, nil) - - if len(streamResults) == 0 { +// compactToSliceOfSlice converts [][2]int to [][]int using a flat buffer. +// This reduces allocations from N+1 (one []int heap alloc per match) to exactly 2: +// one flat []int buffer for all indices, one [][]int for slice headers. +// Each result[i] is a length-2/capacity-2 slice into the flat buffer. +func compactToSliceOfSlice(compact [][2]int) [][]int { + if len(compact) == 0 { return nil } - // Convert [][2]int to [][]int for stdlib-compatible API - // This allocation is necessary for API compatibility, but still faster than per-match overhead - indices := make([][]int, len(streamResults)) - for i, m := range streamResults { - indices[i] = []int{m[0], m[1]} + buf := make([]int, len(compact)*2) + result := make([][]int, len(compact)) + for i, m := range compact { + buf[i*2] = m[0] + buf[i*2+1] = m[1] + result[i] = buf[i*2 : i*2+2 : i*2+2] } - - return indices + return result } -// FindAllIndexCompact returns all successive matches as a compact [][2]int slice. -// This is a zero-allocation API (single allocation for the result slice). -// Unlike FindAllIndex which returns [][]int (N allocations for N matches), -// this method pre-allocates the entire result in one contiguous block. +// AppendAllIndex appends all successive match index pairs to dst and returns +// the extended slice. This follows the Go stdlib append pattern (like +// strconv.AppendInt) with dst as the first parameter. // -// Performance: ~2x fewer allocations than FindAllIndex for high match counts. +// Zero-allocation when dst has sufficient capacity. Unlike FindAllIndex which +// returns [][]int (N heap allocations for N matches), AppendAllIndex uses a +// flat [][2]int layout requiring at most one allocation for the backing array. // -// If n > 0, it returns at most n matches. 
If n <= 0, it returns all matches. -// The optional 'results' slice can be provided for reuse (set to nil for fresh allocation). +// If n > 0, it appends at most n matches. If n <= 0, it appends all matches. +// Pass nil as dst for a fresh allocation. // // Example: // // re := coregex.MustCompile(`\d+`) -// indices := re.FindAllIndexCompact([]byte("a1b2c3"), -1, nil) +// indices := re.AppendAllIndex(nil, []byte("a1b2c3"), -1) // // indices = [[1,2], [3,4], [5,6]] -func (r *Regex) FindAllIndexCompact(b []byte, n int, results [][2]int) [][2]int { +// +// // Reuse buffer across calls: +// buf := make([][2]int, 0, 64) +// buf = re.AppendAllIndex(buf[:0], data1, -1) +// process(buf) +// buf = re.AppendAllIndex(buf[:0], data2, -1) +// process(buf) +func (r *Regex) AppendAllIndex(dst [][2]int, b []byte, n int) [][2]int { if n == 0 { return nil } - return r.engine.FindAllIndicesStreaming(b, n, results) + return r.engine.FindAllIndicesStreaming(b, n, dst) } -// FindAllStringIndexCompact returns all successive matches as a compact [][2]int slice. -// This is the string version of FindAllIndexCompact. -func (r *Regex) FindAllStringIndexCompact(s string, n int, results [][2]int) [][2]int { - return r.FindAllIndexCompact(stringToBytes(s), n, results) +// AppendAllStringIndex appends all successive match index pairs for the string +// s to dst and returns the extended slice. This is the string version of +// AppendAllIndex. +// +// Example: +// +// re := coregex.MustCompile(`\d+`) +// indices := re.AppendAllStringIndex(nil, "a1b2c3", -1) +// // indices = [[1,2], [3,4], [5,6]] +func (r *Regex) AppendAllStringIndex(dst [][2]int, s string, n int) [][2]int { + return r.AppendAllIndex(dst, stringToBytes(s), n) } // FindAllStringIndex returns a slice of all successive matches of the pattern in s, @@ -1485,21 +1426,25 @@ func (r *Regex) FindAllSubmatchIndex(b []byte, n int) [][]int { return nil } + // All matches have same number of capture groups, so use a flat buffer. 
+ // This reduces allocations from N+1 to exactly 2 (flat buffer + slice headers). + numGroups := matches[0].NumCaptures() + stride := numGroups * 2 + buf := make([]int, len(matches)*stride) result := make([][]int, len(matches)) for i, m := range matches { - numGroups := m.NumCaptures() - indices := make([]int, numGroups*2) + base := i * stride for j := 0; j < numGroups; j++ { idx := m.GroupIndex(j) if len(idx) >= 2 { - indices[j*2] = idx[0] - indices[j*2+1] = idx[1] + buf[base+j*2] = idx[0] + buf[base+j*2+1] = idx[1] } else { - indices[j*2] = -1 - indices[j*2+1] = -1 + buf[base+j*2] = -1 + buf[base+j*2+1] = -1 } } - result[i] = indices + result[i] = buf[base : base+stride : base+stride] } return result } @@ -1516,6 +1461,119 @@ func (r *Regex) FindAllStringSubmatchIndex(s string, n int) [][]int { return r.FindAllSubmatchIndex(stringToBytes(s), n) } +// AllIndex returns an iterator over all successive non-overlapping match index +// pairs in b. Each yielded [2]int contains the start and end byte offsets of a +// match. Matches are returned left-to-right. +// +// Zero allocation: the iterator uses FindIndicesAt internally and yields +// stack-allocated [2]int values. No slice is allocated for the results. +// +// Empty match handling follows Go stdlib regexp semantics: an empty match at a +// position where a non-empty match just ended is skipped, and the search +// advances by one byte after each empty match. +// +// Example: +// +// re := coregex.MustCompile(`\d+`) +// for m := range re.AllIndex([]byte("a1b22c333")) { +// fmt.Printf("match at [%d, %d]\n", m[0], m[1]) +// } +// // Output: +// // match at [1, 2] +// // match at [3, 5] +// // match at [6, 9] +func (r *Regex) AllIndex(b []byte) iter.Seq[[2]int] { + return func(yield func([2]int) bool) { + pos := 0 + lastMatchEnd := -1 + for pos <= len(b) { + start, end, found := r.engine.FindIndicesAt(b, pos) + if !found { + return + } + // Skip empty matches at the position where a non-empty match just ended. 
+			// This matches Go stdlib behavior.
+			//nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd
+			if start == end && start == lastMatchEnd {
+				pos++
+				if pos > len(b) {
+					return
+				}
+				continue
+			}
+			if !yield([2]int{start, end}) {
+				return
+			}
+			if start != end {
+				lastMatchEnd = end
+			}
+			if start == end {
+				pos = end + 1
+			} else {
+				pos = end
+			}
+		}
+	}
+}
+
+// AllStringIndex returns an iterator over all successive non-overlapping match
+// index pairs in s. This is the string version of AllIndex.
+//
+// Example:
+//
+//	re := coregex.MustCompile(`\w+`)
+//	for m := range re.AllStringIndex("hello world") {
+//		fmt.Printf("[%d, %d]\n", m[0], m[1])
+//	}
+func (r *Regex) AllStringIndex(s string) iter.Seq[[2]int] {
+	return r.AllIndex(stringToBytes(s))
+}
+
+// All returns an iterator over all successive non-overlapping matches in b.
+// Each yielded []byte is a sub-slice of b (no copy, no allocation).
+//
+// Example:
+//
+//	re := coregex.MustCompile(`\d+`)
+//	for m := range re.All([]byte("a1b22c333")) {
+//		fmt.Println(string(m))
+//	}
+//	// Output:
+//	// 1
+//	// 22
+//	// 333
+func (r *Regex) All(b []byte) iter.Seq[[]byte] {
+	return func(yield func([]byte) bool) {
+		for m := range r.AllIndex(b) {
+			if !yield(b[m[0]:m[1]]) {
+				return
+			}
+		}
+	}
+}
+
+// AllString returns an iterator over all successive non-overlapping matches in s.
+// Each yielded string is a substring of s (no copy, no allocation).
+//
+// Example:
+//
+//	re := coregex.MustCompile(`\w+`)
+//	for word := range re.AllString("hello world") {
+//		fmt.Println(word)
+//	}
+//	// Output:
+//	// hello
+//	// world
+func (r *Regex) AllString(s string) iter.Seq[string] {
+	return func(yield func(string) bool) {
+		for m := range r.AllStringIndex(s) {
+			if !yield(s[m[0]:m[1]]) {
+				return
+			}
+		}
+	}
+}
+
 // Copy returns a new Regex object copied from re.
 // Calling Longest on one copy does not affect another. 
// diff --git a/word_digit_bench_test.go b/word_digit_bench_test.go index 584ebdf..bfe0daf 100644 --- a/word_digit_bench_test.go +++ b/word_digit_bench_test.go @@ -59,22 +59,22 @@ func BenchmarkAlphaDigit_1MB_Coregex(b *testing.B) { } } -// Compact API benchmarks - zero per-match allocations -func BenchmarkWordDigit_1MB_CoregexCompact(b *testing.B) { +// AppendAllIndex benchmarks - zero per-match allocations +func BenchmarkWordDigit_1MB_CoregexAppend(b *testing.B) { re := MustCompile(`\w+[0-9]+`) b.SetBytes(int64(len(benchData))) b.ResetTimer() for i := 0; i < b.N; i++ { - re.FindAllIndexCompact(benchData, -1, nil) + re.AppendAllIndex(nil, benchData, -1) } } -func BenchmarkWordDigit_1MB_CoregexCompactReuse(b *testing.B) { +func BenchmarkWordDigit_1MB_CoregexAppendReuse(b *testing.B) { re := MustCompile(`\w+[0-9]+`) results := make([][2]int, 0, 65536) b.SetBytes(int64(len(benchData))) b.ResetTimer() for i := 0; i < b.N; i++ { - results = re.FindAllIndexCompact(benchData, -1, results) + results = re.AppendAllIndex(results[:0], benchData, -1) } }