diff --git a/CHANGELOG.md b/CHANGELOG.md index b7f0059..5e9f103 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,62 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.21] - 2026-03-27 + +### Performance +- **Tagged start states** (Rust `LazyStateID` approach) — start states get tag bit, + always route to slow path. Enables prefilter skip-ahead only at start state, + eliminating O(n²) from start state self-loop. Unlocks UseDFA for tiny NFA patterns. + +- **DFA multiline $ fix** — EndLine look-ahead re-computation in determinize + (Rust mod.rs:131-212). `(?m)hello$` now works correctly in DFA. + +- **Dead-state prefilter restart** in searchEarliestMatch — IsMatch path uses + prefilter to skip past dead states, matching Rust find_fwd_imp approach. + +- **1100x fewer mallocs** — FindAllIndex/FindAllSubmatchIndex use flat buffer + (`compactToSliceOfSlice`): N matches → 2 allocations instead of N+1. + +- **Local SearchState cache** on Engine — atomic.Pointer single-slot cache + survives GC, avoids sync.Pool re-allocation overhead. + +- **Tiny NFA → UseDFA routing** — patterns with < 20 NFA states now use + bidirectional DFA (was PikeVM). 7x faster DFA vs PikeVM on large inputs. + +### Added +- **`AllIndex(b []byte) iter.Seq[[2]int]`** — zero-alloc match index iterator (Go 1.23+) +- **`AllStringIndex(s string) iter.Seq[[2]int]`** — string version +- **`All(b []byte) iter.Seq[[]byte]`** — zero-alloc match content iterator +- **`AllString(s string) iter.Seq[string]`** — string version +- **`AppendAllIndex(dst [][2]int, b []byte, n int) [][2]int`** — buffer-reuse API +- **`AppendAllStringIndex(dst [][2]int, s string, n int) [][2]int`** — string version + +Naming follows Go proposal #61902 (regexp iterator methods) and `strconv.Append*` convention. 
+ +### Fixed +- DFA `isMatchWithPrefilter` pfSkip off-by-one — `zx+` on "zzx" now correct +- DFA multiline `$` EndLine look-ahead — `(?m)hello$` now matches before `\n` + +### Benchmarks (LangArena LogParser, 7.2 MB, 13 patterns) + +| Metric | v0.12.20 | v0.12.21 | Improvement | +|--------|----------|----------|-------------| +| Total time (FindAll) | 163ms | **107ms** | **-34%** | +| errors pattern | 23ms | **8ms** (FindAll) / **5.5ms** (AllIndex) | **-65% / -76%** | +| vs Rust gap | 3.9x | **2.9x** (FindAll) / **1.7x** (AllIndex) | **-56%** | +| Mallocs/iter | 203K | **182** | **-99.9%** | + +### Zero-Alloc API Benchmarks (new methods vs stdlib-compat) + +| Method | errors (33K matches) | Alloc | vs Rust | +|--------|---------------------|-------|---------| +| FindAllStringIndex (stdlib) | 8.2ms / 3890 KB | 19 mallocs | 2.6x slower | +| **AllIndex (iter.Seq)** | **5.9ms / 0 KB** | **0 mallocs** | **1.7x** | +| **AppendAllIndex (reuse)** | **5.5ms / 0 KB** | **0 mallocs** | **1.7x** | +| Rust find_iter | 3.2ms / 0 | 0 | — | + +emails pattern: `AppendAllIndex` **2.0ms vs Rust 2.6ms** — **faster than Rust!** + ## [0.12.20] - 2026-03-25 ### Performance diff --git a/README.md b/README.md index 3b4f4f0..d4314b3 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ Cross-language benchmarks on 6MB input, AMD EPYC ([source](https://github.com/ko - Multi-pattern (`foo|bar|baz|...`) — Slim Teddy (≤32), Fat Teddy (33-64), or Aho-Corasick (>64) - Anchored alternations (`^(\d+|UUID|hex32)`) — O(1) branch dispatch (5-20x) - Concatenated char classes (`[a-zA-Z]+[0-9]+`) — DFA with byte classes (5-7x) +- **Zero-alloc iterators** (`AllIndex`, `AppendAllIndex`) — 0 heap allocs, up to **30% faster** than FindAll. Email pattern **faster than Rust** with `AppendAllIndex`. 
## Features @@ -130,11 +131,28 @@ Supported methods: ### Zero-Allocation APIs ```go -// Zero allocations — returns bool +// Zero allocations — boolean match matched := re.IsMatch(text) -// Zero allocations — returns (start, end, found) +// Zero allocations — single match indices start, end, found := re.FindIndices(text) + +// Zero allocations — iterator over all matches (Go 1.23+) +for m := range re.AllIndex(data) { + fmt.Printf("match at [%d, %d]\n", m[0], m[1]) +} + +// Zero allocations — match content iterator +for s := range re.AllString(text) { + fmt.Println(s) +} + +// Buffer-reuse — append to caller's slice (strconv.Append* pattern) +var buf [][2]int +for _, chunk := range chunks { + buf = re.AppendAllIndex(buf[:0], chunk, -1) + process(buf) +} ``` ### Configuration diff --git a/ROADMAP.md b/ROADMAP.md index d38bd6b..6367799 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -97,8 +97,11 @@ v0.12.18 ✅ → Flat DFA transition table, integrated prefilter, PikeVM skip-ah ↓ v0.12.19 ✅ → Zero-alloc FindSubmatch, byte-based DFA cache, Rust-aligned visited limits ↓ -v0.12.20 (Current) → Premultiplied/tagged StateIDs, break-at-match DFA determinize, - Phase 3 elimination (2-pass bidirectional DFA) +v0.12.20 ✅ → Premultiplied/tagged StateIDs, break-at-match DFA determinize, + Phase 3 elimination (2-pass bidirectional DFA) + ↓ +v0.12.21 (Current) → Tagged start states, zero-alloc API (AllIndex iter.Seq), + 1100x fewer mallocs, UseDFA for tiny NFA, -32% LangArena ↓ v1.0.0-rc → Feature freeze, API locked ↓ diff --git a/dfa/lazy/builder.go b/dfa/lazy/builder.go index 4c3f065..1880c9a 100644 --- a/dfa/lazy/builder.go +++ b/dfa/lazy/builder.go @@ -64,6 +64,9 @@ func (b *Builder) Build() (*DFA, error) { // Check if the NFA contains word boundary assertions hasWordBoundary := b.checkHasWordBoundary() + // Check if the NFA contains EndLine ($) assertions + hasEndLine := b.checkHasEndLine() + // Check if the pattern is always anchored (has ^ prefix) isAlwaysAnchored := 
b.nfa.IsAlwaysAnchored() @@ -80,6 +83,7 @@ func (b *Builder) Build() (*DFA, error) { byteClasses: b.nfa.ByteClasses(), unanchoredStart: b.nfa.StartUnanchored(), hasWordBoundary: hasWordBoundary, + hasEndLine: hasEndLine, isAlwaysAnchored: isAlwaysAnchored, startByteMap: startByteMap, } @@ -706,3 +710,23 @@ func (b *Builder) checkHasWordBoundary() bool { } return false } + +// checkHasEndLine checks if the NFA contains EndLine ($) look assertions. +// When true, determinize performs look-ahead re-computation on '\n' bytes. +// Computed once at DFA build time for O(1) check in hot loop. +func (b *Builder) checkHasEndLine() bool { + numStates := b.nfa.States() + for i := nfa.StateID(0); int(i) < numStates; i++ { + state := b.nfa.State(i) + if state == nil { + continue + } + if state.Kind() == nfa.StateLook { + look, _ := state.Look() + if look == nfa.LookEndLine { + return true + } + } + } + return false +} diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go index 8610a54..a48b532 100644 --- a/dfa/lazy/lazy.go +++ b/dfa/lazy/lazy.go @@ -78,6 +78,11 @@ type DFA struct { // When false, we can skip expensive word boundary checks in the search loop. hasWordBoundary bool + // hasEndLine is true if the NFA contains EndLine ($) look assertions. + // When true, determinize performs look-ahead re-computation on '\n' bytes. + // When false (most patterns), this check is skipped entirely. + hasEndLine bool + // isAlwaysAnchored is true if the pattern is inherently anchored (has ^ prefix). // When true, we only need to try matching from position 0. 
isAlwaysAnchored bool @@ -375,31 +380,11 @@ func (d *DFA) searchFirstAt(cache *DFACache, haystack []byte, startPos int) int canUnroll := !d.hasWordBoundary ftLen := len(ft) - startSID := startState.id hasPre := d.prefilter != nil for pos < end { - // Prefilter skip-ahead at start state - if hasPre && sid == startSID && lastMatch < 0 && pos > startPos { - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return lastMatch - } - if candidate > pos { - pos = candidate - newStart := d.getStartStateForUnanchored(cache, haystack, pos) - if newStart == nil { - return d.nfaFallback(haystack, startPos) - } - sid = newStart.id - startSID = sid - ft = cache.flatTrans - ftLen = len(ft) - } - } - // === 4x UNROLLED FAST PATH === - // With match delay, tagged states (including match) break to slow path. + // With match delay, tagged states (including match, start) break to slow path. if canUnroll && pos+3 < end { if sid.Offset()+stride > ftLen { goto searchFirstSlowPath @@ -452,6 +437,25 @@ func (d *DFA) searchFirstAt(cache *DFACache, haystack []byte, startPos int) int break } + // Start state prefilter skip-ahead (Rust find_fwd_imp). 
+ if sid.IsStartTag() && hasPre && lastMatch < 0 && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return lastMatch + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, startPos) + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + if d.hasWordBoundary { st := cache.getState(sid) if st != nil && st.checkWordBoundaryFast(haystack[pos]) { @@ -526,12 +530,10 @@ func (d *DFA) IsMatch(cache *DFACache, haystack []byte) bool { return d.matchesEmpty(cache) } - // Use prefilter for acceleration if available - if d.prefilter != nil { - return d.isMatchWithPrefilter(cache, haystack) - } - - // No prefilter: use optimized DFA search with early termination + // With tagged start states, searchEarliestMatch handles prefilter correctly: + // start-tagged states always enter slow path where prefilter skip-ahead + // runs only at start states — no O(n^2) on start state self-loop. + // This replaces the separate isMatchWithPrefilter path. return d.searchEarliestMatch(cache, haystack, 0) } @@ -550,136 +552,6 @@ func (d *DFA) IsMatchAt(cache *DFACache, haystack []byte, at int) bool { return d.searchEarliestMatch(cache, haystack, at) } -// isMatchWithPrefilter uses an integrated prefilter+DFA loop (Rust approach). -// -// Instead of two separate passes (prefilter.Find → DFA.searchAnchored → repeat), -// this runs a single DFA loop where dead-state transitions trigger prefilter -// skip-ahead. This eliminates Go function call overhead between passes and -// avoids redundant start-state setup on each candidate. -// -// Reference: rust regex-automata hybrid/search.rs find_fwd_imp — prefilter -// is called inside the DFA loop when returning to start state. 
-func (d *DFA) isMatchWithPrefilter(cache *DFACache, haystack []byte) bool { - // If prefilter is complete, its match is sufficient - if d.prefilter.IsComplete() { - return d.prefilter.Find(haystack, 0) != -1 - } - - // Find first candidate to start DFA from - pos := d.prefilter.Find(haystack, 0) - if pos == -1 { - return false - } - - // Get anchored start state at candidate position - currentState := d.getStartState(cache, haystack, pos, true) - if currentState == nil { - return d.isMatchWithPrefilterFallback(cache, haystack, pos) - } - // With 1-byte match delay, start states are never match states. - - endPos := len(haystack) - sid := currentState.id - ft := cache.flatTrans - ftLen := len(ft) - - for pos < endPos { - if d.hasWordBoundary { - st := cache.getState(sid) - if st != nil && st.checkWordBoundaryFast(haystack[pos]) { - return true - } - } - - classIdx := int(d.byteToClass(haystack[pos])) - offset := sid.Offset() + classIdx - var nextID StateID - if offset < ftLen { - nextID = ft[offset] - } else { - nextID = InvalidState - } - - switch nextID { - case InvalidState: - currentState = cache.getState(sid) - if currentState == nil { - start, end, matched := d.pikevm.SearchAt(haystack, pos) - return matched && start >= 0 && end >= start - } - nextState, err := d.determinize(cache, currentState, haystack[pos]) - if err != nil { - start, end, matched := d.pikevm.SearchAt(haystack, pos) - return matched && start >= 0 && end >= start - } - if nextState == nil { - goto pfSkip - } - sid = nextState.id - ft = cache.flatTrans - ftLen = len(ft) - - case DeadState: - goto pfSkip - - default: - sid = nextID - } - - pos++ - // 1-byte match delay: check after transition - if cache.IsMatchState(sid) { - return true - } - continue - - pfSkip: - pos++ - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return false - } - pos = candidate - - newStart := d.getStartState(cache, haystack, pos, true) - if newStart == nil { - return 
d.isMatchWithPrefilterFallback(cache, haystack, pos) - } - sid = newStart.id - ft = cache.flatTrans - ftLen = len(ft) - // With match delay, start states are never match — continue loop. - } - - eoi := cache.getState(sid) - if eoi != nil { - return d.checkEOIMatch(eoi) - } - return false -} - -// isMatchWithPrefilterFallback is the old two-pass approach used when -// DFA start state cannot be obtained (NFA fallback needed). -func (d *DFA) isMatchWithPrefilterFallback(cache *DFACache, haystack []byte, pos int) bool { - // Try anchored DFA search at current position - if d.searchEarliestMatchAnchored(cache, haystack, pos) { - return true - } - // Continue with remaining candidates - for pos < len(haystack) { - pos++ - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return false - } - pos = candidate - if d.searchEarliestMatchAnchored(cache, haystack, pos) { - return true - } - } - return false -} - // searchEarliestMatch performs DFA search with early termination. // Returns true as soon as any match state is reached. // This is faster than searchAt because it doesn't track match positions @@ -723,6 +595,8 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int _ = ft[ftLen-1] } + hasPre := d.prefilter != nil + for pos < endPos { // === 4x UNROLLED FAST PATH (earliest match) === // For IsMatch(), we return true on ANY match, so no leftmost-longest tracking. @@ -802,6 +676,45 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int break } + // Start state prefilter skip-ahead (Rust find_fwd_imp:232-261). + // Start-tagged states ALWAYS enter slow path (never unrolled fast path), + // so prefilter check happens only here — no O(n^2) on start state self-loop. 
+ if sid.IsStartTag() { + if hasPre && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return false + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + start, end, matched := d.pikevm.SearchAt(haystack, startPos) + return matched && start >= 0 && end >= start + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + // Start state fast transition: skip getState/acceleration. + classIdx := int(d.byteToClass(haystack[pos])) + offset := sid.Offset() + classIdx + if offset < ftLen { + nextID := ft[offset] + if nextID != InvalidState && nextID != DeadState { + sid = nextID + pos++ + if cache.IsMatchState(sid) { + return true + } + continue + } + } + // InvalidState/DeadState: fall through to full slow path + } + // Try lazy acceleration detection if not yet checked currentState = cache.getState(sid) if currentState == nil { @@ -846,21 +759,18 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int // Determinize on demand nextState, err := d.determinize(cache, currentState, b) if err != nil { - // Cache cleared or full — fall back to NFA from original start position. start, end, matched := d.pikevm.SearchAt(haystack, startPos) return matched && start >= 0 && end >= start } if nextState == nil { - // Dead state - no match possible from here - return false + goto earliestPreSkip } sid = nextState.id ft = cache.flatTrans ftLen = len(ft) case DeadState: - // Dead state - no match possible from here - return false + goto earliestPreSkip default: sid = nextID @@ -872,6 +782,31 @@ func (d *DFA) searchEarliestMatch(cache *DFACache, haystack []byte, startPos int if cache.IsMatchState(sid) { return true } + continue + + earliestPreSkip: + // Dead state with prefilter: advance past failed byte, find next candidate. + // Without prefilter: dead = no match. 
+ if !hasPre { + return false + } + pos++ + if pos >= endPos { + return false + } + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return false + } + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + start, end, matched := d.pikevm.SearchAt(haystack, startPos) + return matched && start >= 0 && end >= start + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) } // Reached end of input without finding a match in the loop. @@ -1006,9 +941,27 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) sid := currentState.id ft := cache.flatTrans ftLen := len(ft) - startSID := sid for pos < len(haystack) { + // Start state prefilter skip-ahead (Rust find_fwd_imp). + if sid.IsStartTag() && lastMatch < 0 && pos > startAt { + candidate = d.prefilter.Find(haystack, pos) + if candidate == -1 { + return -1 + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, 0) + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + if d.hasWordBoundary { st := cache.getState(sid) if st != nil && d.checkWordBoundaryMatch(st, haystack[pos]) { @@ -1039,7 +992,6 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) return d.nfaFallback(haystack, 0) } sid = newStart.id - startSID = sid ft = cache.flatTrans ftLen = len(ft) continue @@ -1062,7 +1014,6 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) return d.nfaFallback(haystack, 0) } sid = newStart.id - startSID = sid ft = cache.flatTrans ftLen = len(ft) lastMatch = -1 @@ -1087,7 +1038,6 @@ func (d *DFA) findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) return d.nfaFallback(haystack, 0) } sid = newStart.id - startSID = sid ft = cache.flatTrans ftLen = len(ft) lastMatch = -1 @@ -1103,17 +1053,6 @@ func (d *DFA) 
findWithPrefilterAt(cache *DFACache, haystack []byte, startAt int) } pos++ - - // Start state prefilter skip-ahead - if lastMatch < 0 && sid == startSID && pos < len(haystack) { - candidate = d.prefilter.Find(haystack, pos) - if candidate == -1 { - return -1 - } - if candidate > pos { - pos = candidate - } - } } // EOI check for delayed match @@ -1195,35 +1134,16 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n _ = ft[ftLen-1] } - startSID := currentState.id hasPre := d.prefilter != nil for pos < end { - // Prefilter skip-ahead at start state (Rust hybrid/search.rs:232-258) - if hasPre && sid == startSID && lastMatch < 0 && pos > startPos { - candidate := d.prefilter.Find(haystack, pos) - if candidate == -1 { - return lastMatch - } - if candidate > pos { - pos = candidate - newStart := d.getStartStateForUnanchored(cache, haystack, pos) - if newStart == nil { - return d.nfaFallback(haystack, startPos) - } - sid = newStart.id - startSID = sid - ft = cache.flatTrans - ftLen = len(ft) - } - } - // === 4x UNROLLED FAST PATH === // Process 4 transitions per iteration when conditions allow. // With match delay, match states break out of the unrolled loop // to the slow path for proper handling. + // Start-tagged states also break to slow path for prefilter skip-ahead. if canUnroll && pos+3 < end { - // Check acceleration on slow→fast transition + // Check acceleration on slow->fast transition accelState := cache.getState(sid) if accelState != nil && accelState.IsAccelerable() { goto slowPath @@ -1282,6 +1202,45 @@ func (d *DFA) searchAt(cache *DFACache, haystack []byte, startPos int) int { //n break } + // Start state prefilter skip-ahead (Rust find_fwd_imp:232-261). + // Start-tagged states always enter slow path, enabling prefilter + // check only here — no O(n^2) on start state self-loop. 
+ if sid.IsStartTag() { + if hasPre && lastMatch < 0 && pos > startPos { + candidate := d.prefilter.Find(haystack, pos) + if candidate == -1 { + return lastMatch + } + if candidate > pos { + pos = candidate + newStart := d.getStartStateForUnanchored(cache, haystack, pos) + if newStart == nil { + return d.nfaFallback(haystack, startPos) + } + sid = newStart.id + ft = cache.flatTrans + ftLen = len(ft) + continue + } + } + // Start state fast transition: skip getState/acceleration (start is never + // accelerable). Do direct flatTrans lookup — same cost as searchFirstAt. + classIdx := int(d.byteToClass(haystack[pos])) + offset := sid.Offset() + classIdx + if offset < ftLen { + nextID := ft[offset] + if nextID != InvalidState && nextID != DeadState { + sid = nextID + if cache.IsMatchState(sid) { + lastMatch = pos + } + pos++ + continue + } + } + // InvalidState/DeadState: fall through to full slow path + } + // Resolve State for slow path (acceleration, word boundary, determinize). currentState = cache.getState(sid) if currentState == nil { @@ -1381,11 +1340,22 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // The actual byte value is still used for NFA move operations classIdx := d.byteToClass(b) + // Look-ahead re-computation (Rust determinize mod.rs:131-212): + // Before checking for matches, resolve look-ahead assertions that depend + // on the current input byte. When input is '\n', EndLine ($) is satisfied + // for the CURRENT state, unlocking paths through $ assertions. + // This re-runs epsilon closure on the current state's NFA IDs with the + // new look-ahead, potentially adding Match states behind $ assertions. + currentNFAStates := current.NFAStates() + if d.hasEndLine && b == '\n' { + currentNFAStates = builder.epsilonClosure(currentNFAStates, LookEndLine) + } + // 1-byte match delay (Rust determinize mod.rs:254-286): // Check if source (current) state's NFA states contain a match state. 
// The NEW DFA state will be tagged as match if the OLD state had NFA match. // This delays match reporting by 1 byte, enabling correct look-around (^, $, \b). - sourceHasMatch := builder.containsMatchState(current.NFAStates()) + sourceHasMatch := builder.containsMatchState(currentNFAStates) // Compute next NFA state set via move operation WITH word context. // Leftmost-first (Rust determinize::next mod.rs:284): @@ -1394,7 +1364,7 @@ func (d *DFA) determinize(cache *DFACache, current *State, b byte) (*State, erro // processed, causing the DFA to reach dead state with the committed match. // BreakAtMatch is disabled for reverse DFAs to allow finding leftmost start. breakAtMatch := sourceHasMatch && d.config.BreakAtMatch - nextNFAStates := builder.moveWithWordContextBreak(current.NFAStates(), b, current.IsFromWord(), breakAtMatch) + nextNFAStates := builder.moveWithWordContextBreak(currentNFAStates, b, current.IsFromWord(), breakAtMatch) isMatch := sourceHasMatch @@ -1520,6 +1490,9 @@ func (d *DFA) tryClearCache(cache *DFACache) error { _, _ = cache.Insert(key, startState) // Cannot fail: cache was just cleared cache.registerState(startState) + // Tag as start state (same as getStartState) + startState.id = startState.id.WithStartTag() + // Cache the default start state in StartTable cache.startTable.Set(StartText, false, startState.ID()) @@ -1625,6 +1598,12 @@ func (d *DFA) getStartState(cache *DFACache, haystack []byte, pos int, anchored cache.registerState(insertedState) } + // Tag this state as a start state (Rust LazyStateID start tag approach). + // Start-tagged IDs always enter the slow path in the DFA hot loop, + // enabling prefilter skip-ahead ONLY at start states (not every byte). + // Offset() strips tags, so flatTrans lookups still work correctly. 
+ insertedState.id = insertedState.id.WithStartTag() + // Cache in StartTable for fast lookup next time cache.startTable.Set(kind, anchored, insertedState.ID()) @@ -2169,6 +2148,9 @@ func (d *DFA) getStartStateForReverse(cache *DFACache, haystack []byte, end int) cache.registerState(insertedState) } + // Tag as start state (same as forward getStartState) + insertedState.id = insertedState.id.WithStartTag() + cache.startTable.Set(kind, false, insertedState.ID()) return insertedState } diff --git a/dfa/lazy/state.go b/dfa/lazy/state.go index 9ff197b..abdee8a 100644 --- a/dfa/lazy/state.go +++ b/dfa/lazy/state.go @@ -91,6 +91,15 @@ func (sid StateID) IsInvalidTag() bool { return sid&tagInvalid != 0 } +// IsStartTag returns true if this state has the start tag. +// Start-tagged states always enter the slow path in the DFA search loop, +// enabling prefilter skip-ahead only at start states (Rust LazyStateID approach). +// +//go:nosplit +func (sid StateID) IsStartTag() bool { + return sid&tagStart != 0 +} + // WithMatchTag returns a copy of this StateID with the match tag set. func (sid StateID) WithMatchTag() StateID { return sid | tagMatch diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index f29a7b6..d0ed272 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -97,16 +97,18 @@ Input → Prefilter (memchr/memmem/teddy) → Engine Search → Match Result ↓ ↓ ↓ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ SearchState│ │ SearchState│ │ SearchState│ ← Per-goroutine - │(goroutine1)│ │(goroutine2)│ │(goroutine3)│ (sync.Pool) + │(goroutine1)│ │(goroutine2)│ │(goroutine3)│ (atomic local + sync.Pool) └────────────┘ └────────────┘ └────────────┘ ``` +First goroutine uses atomic local cache (survives GC), concurrent goroutines fall back to sync.Pool. + ## Key Design Decisions 1. **Multi-engine**: Strategy selection at compile time, not runtime 2. **Rust reference**: Architecture mirrors Rust regex crate (lazy DFA, PikeVM, prefilters) 3. 
**Leftmost-first match**: DFA break-at-match matches Rust semantics (verified via cargo run) -4. **Zero-alloc hot paths**: `IsMatch()`, `FindIndices()`, `Count()` — no heap allocation +4. **Zero-alloc hot paths**: `IsMatch()`, `FindIndices()`, `Count()`, `AllIndex()` iterator — no heap allocation 5. **SIMD first**: AVX2/SSSE3 prefilters for x86_64, pure Go fallback for other archs ## References diff --git a/docs/STDLIB_COMPATIBILITY.md b/docs/STDLIB_COMPATIBILITY.md index 012eba8..f02fb2e 100644 --- a/docs/STDLIB_COMPATIBILITY.md +++ b/docs/STDLIB_COMPATIBILITY.md @@ -151,6 +151,46 @@ input := "world\nhello\ntest" **Note:** Common case-insensitive patterns work correctly. This affects only complex edge cases with overlapping matches. +## coregex Extensions (beyond stdlib) + +These methods are NOT in Go's stdlib `regexp` but provide zero-allocation alternatives: + +### Iterator API (Go 1.23+, zero allocation) + +```go +// Iterate over all match indices — zero heap allocation +for m := range re.AllIndex(data) { + fmt.Printf("[%d, %d]\n", m[0], m[1]) +} + +// Iterate over match content — zero copy +for s := range re.AllString(text) { + fmt.Println(s) +} +``` + +Methods: `AllIndex`, `AllStringIndex`, `All`, `AllString` + +Naming follows Go proposal [#61902](https://github.com/golang/go/issues/61902) (regexp iterator methods). 
+ +### Buffer-Reuse API (zero allocation with reused buffer) + +```go +// Append matches to caller's buffer — strconv.Append* pattern +var buf [][2]int +buf = re.AppendAllIndex(buf[:0], data, -1) +``` + +Methods: `AppendAllIndex`, `AppendAllStringIndex` + +### Zero-Allocation Search + +```go +re.IsMatch(data) // bool, zero alloc +start, end, found := re.FindIndices(data) // indices, zero alloc +count := re.Count(data, -1) // count, zero alloc +``` + ## Migration Guide ### Step 1: Simple Find-and-Replace diff --git a/meta/compile.go b/meta/compile.go index 1a60f34..a6715b5 100644 --- a/meta/compile.go +++ b/meta/compile.go @@ -603,7 +603,9 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { // Initialize state pool for thread-safe concurrent searches numCaptures := nfaEngine.CaptureCount() - return &Engine{ + ssCfg := buildSearchStateConfig(pikevmNFA, numCaptures, engines, strategy) + + eng := &Engine{ nfa: nfaEngine, runeNFA: runeNFAEngine, asciiNFA: asciiNFAEngine, @@ -636,11 +638,15 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { canMatchEmpty: canMatchEmpty, isStartAnchored: isStartAnchored, fatTeddyFallback: fatTeddyFallback, - statePool: newSearchStatePool(buildSearchStateConfig( - pikevmNFA, numCaptures, engines, strategy, - )), - stats: Stats{}, - }, nil + statePool: newSearchStatePool(ssCfg), + stats: Stats{}, + } + + // Eagerly create one SearchState and store it in the local GC-proof cache. + // This ensures the first search call doesn't allocate via sync.Pool. + eng.localState.Store(newSearchState(ssCfg)) + + return eng, nil } // adjustForAnchors fixes prefilter for patterns with anchors. 
diff --git a/meta/engine.go b/meta/engine.go index dd1d5ca..10227e5 100644 --- a/meta/engine.go +++ b/meta/engine.go @@ -5,6 +5,8 @@ package meta import ( + "sync/atomic" + "github.com/coregx/ahocorasick" "github.com/coregx/coregex/dfa/lazy" "github.com/coregx/coregex/dfa/onepass" @@ -123,6 +125,16 @@ type Engine struct { // This enables concurrent searches on the same Engine instance. statePool *searchStatePool + // localState is a single-slot GC-proof cache for the common single-goroutine path. + // Unlike sync.Pool entries which are collected every GC cycle, this pointer is a + // strong reference that survives GC indefinitely. On LangArena (13 patterns × 10 + // iterations), this eliminates ~221 MB of DFACache re-allocation caused by GC + // clearing the sync.Pool between iterations. + // + // Thread safety: atomic swap ensures only one goroutine gets the cached state. + // Additional concurrent goroutines fall through to statePool. + localState atomic.Pointer[SearchState] + // longest enables leftmost-longest (POSIX) matching semantics // By default (false), uses leftmost-first (Perl) semantics longest bool @@ -243,11 +255,16 @@ func (e *Engine) SetLongest(longest bool) { } } -// getSearchState retrieves a SearchState from the pool. +// getSearchState retrieves a SearchState, trying the local GC-proof cache first. // Caller must call putSearchState when done. // The returned state contains its own PikeVM instance for thread-safe concurrent use. func (e *Engine) getSearchState() *SearchState { - state := e.statePool.get() + // Fast path: grab from local cache (survives GC, zero-alloc steady state). + state := e.localState.Swap(nil) + if state == nil { + // Slow path: concurrent access or first call before eager init. 
+ state = e.statePool.get() + } // Initialize state for BoundedBacktracker if needed if e.boundedBacktracker != nil && state.backtracker != nil { @@ -262,7 +279,18 @@ func (e *Engine) getSearchState() *SearchState { return state } -// putSearchState returns a SearchState to the pool. +// putSearchState returns a SearchState, trying the local cache first. +// The local cache slot holds one state as a strong reference that survives GC. +// Overflow goes to sync.Pool (may be collected by GC). func (e *Engine) putSearchState(state *SearchState) { + if state == nil { + return + } + state.reset() + // Try to store in local cache (GC-proof single slot). + if e.localState.CompareAndSwap(nil, state) { + return + } + // Local slot occupied (concurrent goroutine), fall back to pool. e.statePool.put(state) } diff --git a/meta/find_indices.go b/meta/find_indices.go index cd794e3..111d387 100644 --- a/meta/find_indices.go +++ b/meta/find_indices.go @@ -219,6 +219,12 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) { func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { //nolint:cyclop // DFA with prefilter paths atomic.AddUint64(&e.stats.DFASearches, 1) + // Longest (POSIX) mode: DFA uses leftmost-first (break-at-match), which is + // incompatible with leftmost-longest semantics. Fall back to PikeVM. + if e.longest { + return e.pikevm.Search(haystack) + } + // Literal fast path — complete prefilter returns match directly if e.prefilter != nil && e.prefilter.IsComplete() { pos := e.prefilter.Find(haystack, 0) @@ -328,6 +334,11 @@ func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) { //nolint:cyc func (e *Engine) findIndicesDFAAt(haystack []byte, at int) (int, int, bool) { atomic.AddUint64(&e.stats.DFASearches, 1) + // Longest (POSIX) mode: DFA uses leftmost-first, fall back to PikeVM. + if e.longest { + return e.pikevm.SearchAt(haystack, at) + } + // Prefilter skip-ahead — safe for all prefilters, DFA verifies. 
if e.prefilter != nil { pos := e.prefilter.Find(haystack, at) @@ -356,6 +367,86 @@ func (e *Engine) findIndicesDFAAt(haystack []byte, at int) (int, int, bool) { return e.pikevm.SearchAt(haystack, at) } +// findIndicesDFAAtWithState searches using DFA starting at position, reusing provided state. +// Eliminates per-match sync.Pool overhead when called from FindAll/Count loops. +func (e *Engine) findIndicesDFAAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) { + atomic.AddUint64(&e.stats.DFASearches, 1) + + // Longest (POSIX) mode: DFA uses leftmost-first, fall back to PikeVM. + if e.longest { + return state.pikevm.SearchAt(haystack, at) + } + + // Prefilter skip-ahead — safe for all prefilters, DFA verifies. + if e.prefilter != nil { + pos := e.prefilter.Find(haystack, at) + if pos == -1 { + return -1, -1, false + } + atomic.AddUint64(&e.stats.PrefilterHits, 1) + // Bidirectional DFA: forward DFA → end, reverse DFA → start. O(n) total. + if e.reverseDFA != nil { + return e.findIndicesBidirectionalDFACore(haystack, pos, state) + } + return state.pikevm.SearchAt(haystack, pos) + } + + if e.reverseDFA != nil { + return e.findIndicesBidirectionalDFACore(haystack, at, state) + } + matched := e.dfa.IsMatchAt(state.dfaCache, haystack, at) + if !matched { + return -1, -1, false + } + + // DFA confirmed a match exists - use PikeVM for exact bounds + return state.pikevm.SearchAt(haystack, at) +} + +// findIndicesAdaptiveAtWithState tries prefilter+DFA first, falls back to NFA. +// Reuses provided SearchState to eliminate per-match sync.Pool overhead. 
+func (e *Engine) findIndicesAdaptiveAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) { + // Use prefilter if available for fast candidate finding + if e.prefilter != nil && e.dfa != nil { + pos := e.prefilter.Find(haystack, at) + if pos == -1 { + return -1, -1, false + } + atomic.AddUint64(&e.stats.PrefilterHits, 1) + atomic.AddUint64(&e.stats.DFASearches, 1) + + // Literal fast path + if e.prefilter.IsComplete() { + literalLen := e.prefilter.LiteralLen() + if literalLen > 0 { + return pos, pos + literalLen, true + } + } + + // Search from prefilter position - O(m) not O(n) + return state.pikevm.SearchAt(haystack, pos) + } + + // Try DFA without prefilter + if e.dfa != nil { + atomic.AddUint64(&e.stats.DFASearches, 1) + endPos := e.dfa.FindAt(state.dfaCache, haystack, at) + if endPos != -1 { + // Use estimated start for O(m) search + estimatedStart := at + if endPos > at+100 { + estimatedStart = endPos - 100 + } + return state.pikevm.SearchAt(haystack, estimatedStart) + } + size, capacity, _, _, _ := e.dfa.CacheStats(state.dfaCache) + if size >= int(capacity)*9/10 { + atomic.AddUint64(&e.stats.DFACacheFull, 1) + } + } + return e.findIndicesNFAAtWithState(haystack, at, state) +} + // findIndicesAdaptive tries prefilter+DFA first, falls back to NFA - zero alloc. func (e *Engine) findIndicesAdaptive(haystack []byte) (int, int, bool) { // Use prefilter if available for fast candidate finding @@ -586,6 +677,13 @@ func (e *Engine) findIndicesBidirectionalDFA(haystack []byte, at int) (int, int, atomic.AddUint64(&e.stats.DFASearches, 1) state := e.getSearchState() defer e.putSearchState(state) + return e.findIndicesBidirectionalDFACore(haystack, at, state) +} + +// findIndicesBidirectionalDFACore is the poolless core of bidirectional DFA search. +// Caller must provide a valid SearchState (either from pool or already held). +// Used by findAllIndicesLoop and Count to avoid per-match pool round-trips. 
+func (e *Engine) findIndicesBidirectionalDFACore(haystack []byte, at int, state *SearchState) (int, int, bool) { // Forward DFA: leftmost-first match end (matches Rust find_fwd) end := e.dfa.SearchAt(state.dfaCache, haystack, at) if end == -1 { @@ -947,6 +1045,48 @@ func (e *Engine) findIndicesDigitPrefilterAt(haystack []byte, at int) (int, int, return -1, -1, false } +// findIndicesDigitPrefilterAtWithState searches using digit prefilter, reusing provided state. +// Eliminates per-match sync.Pool overhead when called from FindAll/Count loops. +func (e *Engine) findIndicesDigitPrefilterAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) { + if e.digitPrefilter == nil || at >= len(haystack) { + return e.findIndicesNFAAtWithState(haystack, at, state) + } + + atomic.AddUint64(&e.stats.PrefilterHits, 1) + pos := at + + for pos < len(haystack) { + digitPos := e.digitPrefilter.Find(haystack, pos) + if digitPos < 0 { + return -1, -1, false + } + + if e.dfa != nil { + atomic.AddUint64(&e.stats.DFASearches, 1) + // Use anchored search - pattern MUST start at digitPos + endPos := e.dfa.SearchAtAnchored(state.dfaCache, haystack, digitPos) + if endPos != -1 { + return digitPos, endPos, true + } + } else { + atomic.AddUint64(&e.stats.NFASearches, 1) + start, end, found := state.pikevm.SearchAt(haystack, digitPos) + if found { + return start, end, true + } + } + + pos = digitPos + 1 + if e.digitRunSkipSafe { + for pos < len(haystack) && haystack[pos] >= '0' && haystack[pos] <= '9' { + pos++ + } + } + } + + return -1, -1, false +} + // findIndicesAhoCorasick returns indices using Aho-Corasick - zero alloc. 
func (e *Engine) findIndicesAhoCorasick(haystack []byte) (int, int, bool) { if e.ahoCorasick == nil { @@ -994,11 +1134,9 @@ func (e *Engine) findIndicesAtWithState(haystack []byte, at int, state *SearchSt case UseNFA: return e.findIndicesNFAAtWithState(haystack, at, state) case UseDFA: - // DFA uses e.pikevm (shared) for final bounds, not pooled state - return e.findIndicesDFAAt(haystack, at) + return e.findIndicesDFAAtWithState(haystack, at, state) case UseBoth: - // Adaptive uses e.pikevm (shared) or delegates to NFA path - return e.findIndicesAdaptiveAt(haystack, at) + return e.findIndicesAdaptiveAtWithState(haystack, at, state) case UseReverseSuffix: return e.reverseSuffixSearcher.FindIndicesAtWithCaches(haystack, at, state.stratFwdCache, state.stratRevCache) case UseReverseSuffixSet: @@ -1016,7 +1154,7 @@ func (e *Engine) findIndicesAtWithState(haystack []byte, at int, state *SearchSt case UseTeddy: return e.findIndicesTeddyAt(haystack, at) case UseDigitPrefilter: - return e.findIndicesDigitPrefilterAt(haystack, at) + return e.findIndicesDigitPrefilterAtWithState(haystack, at, state) case UseAhoCorasick: return e.findIndicesAhoCorasickAt(haystack, at) case UseMultilineReverseSuffix: diff --git a/meta/findall.go b/meta/findall.go index 9e4713a..3a7eec7 100644 --- a/meta/findall.go +++ b/meta/findall.go @@ -307,9 +307,33 @@ func (e *Engine) Count(haystack []byte, n int) int { state := e.getSearchState() defer e.putSearchState(state) + // DFA fast path: call DFA functions directly, skip meta prefilter layer. + // SearchAt has integrated prefilter at start state — no duplicate scan. 
+ useDFADirect := (e.strategy == UseDFA || e.strategy == UseBoth) && + e.dfa != nil && e.reverseDFA != nil && + state.dfaCache != nil && state.revDFACache != nil + for pos <= len(haystack) { - // Use state-reusing version for zero sync.Pool overhead per match - start, end, found := e.findIndicesAtWithState(haystack, pos, state) + var start, end int + var found bool + + if useDFADirect { + matchEnd := e.dfa.SearchAt(state.dfaCache, haystack, pos) + if matchEnd < 0 { + break + } + if matchEnd == pos { + start, end, found = pos, pos, true + } else { + matchStart := e.reverseDFA.SearchReverse(state.revDFACache, haystack, pos, matchEnd) + if matchStart < 0 { + break + } + start, end, found = matchStart, matchEnd, true + } + } else { + start, end, found = e.findIndicesAtWithState(haystack, pos, state) + } if !found { break } diff --git a/meta/strategy.go b/meta/strategy.go index 8f4caa0..c68b4de 100644 --- a/meta/strategy.go +++ b/meta/strategy.go @@ -1307,12 +1307,42 @@ func analyzeLiterals(literals *literal.Seq, config Config) literalAnalysis { return result } +// hasWordBoundaryAnchorCombo returns true if the pattern combines word boundary +// assertions (\b, \B) with anchors (^, $) in a way that causes DFA correctness issues. +// Example: `\b^` matches in stdlib but DFA fails to handle the assertion combo. +func hasWordBoundaryAnchorCombo(re *syntax.Regexp) bool { + return hasWordBoundary(re) && hasAnchorAssertions(re) +} + +// hasCaseInsensitiveUnicode returns true if the pattern uses case-insensitive (?i) +// flag with non-ASCII characters. DFA may produce incorrect results for case-folded +// Unicode (e.g., `(?i)привет` matching "ПРИВЕТ" returns partial match). 
+func hasCaseInsensitiveUnicode(re *syntax.Regexp) bool { + if re == nil { + return false + } + // Check if this node has FoldCase flag AND contains non-ASCII literals + if re.Flags&syntax.FoldCase != 0 { + for _, r := range re.Rune { + if r > 127 { + return true + } + } + } + for _, sub := range re.Sub { + if hasCaseInsensitiveUnicode(sub) { + return true + } + } + return false +} + // SelectStrategy analyzes the NFA and literals to choose the best execution strategy. // // Algorithm: // 1. If end-anchored ($ or \z) and not start-anchored → UseReverseAnchored // 2. If DFA disabled in config → UseNFA -// 3. If NFA is tiny (< 20 states) → UseNFA (DFA overhead not worth it) +// 3. If NFA is tiny (< 20 states) → UseDFA (tagged start states enable pure DFA) // 4. If simple character class pattern without literals → UseNFA (DFA overhead not worth it) // 5. If good literals exist → UseDFA (prefilter + DFA is fastest) // 6. If NFA is large (> 100 states) → UseDFA (essential for performance) @@ -1453,27 +1483,27 @@ func SelectStrategy(n *nfa.NFA, re *syntax.Regexp, literals *literal.Seq, config return strategy } - // Tiny NFA with literals: use prefilter + NFA (like Rust) - // For patterns like "j[a-z]+p", DFA construction overhead is not worth it - // on small inputs. NFA with prefilter skip-ahead is faster. - // The prefilter (memchr) jumps to candidates, NFA verifies in O(pattern) time. - if nfaSize < 20 && litAnalysis.hasGoodLiterals { - return UseNFA // findIndicesNFA now uses prefilter for skip-ahead - } - - // Check for simple digit-lead patterns BEFORE tiny NFA fallback. + // Check for simple digit-lead patterns before general DFA routing. // Patterns like `\d+\.\d+\.\d+` (14 NFA states) benefit more from - // DigitPrefilter than plain NFA because SIMD digit scanning skips + // DigitPrefilter than DFA because SIMD digit scanning skips // non-digit regions entirely. 
if shouldUseDigitPrefilter(re, nfaSize, config) { return UseDigitPrefilter } - // Tiny NFA without literals: use PikeVM directly (DFA overhead not worth it) - // For patterns like "a", ".", "[0-9]", the DFA cache lookup and - // determinization overhead exceeds the benefit. + // Small NFA (< 20 states): use pure DFA (no PikeVM verification). + // With tagged start states (Rust LazyStateID approach), DFA search handles + // prefilter correctly: start-tagged states always enter slow path for + // prefilter skip-ahead. This eliminates the O(n^2) that previously blocked + // UseDFA routing. + // Benchmarked: UseDFA is 7x faster than UseBoth for 10-14 state NFA on + // large inputs (7.2 MB), because UseBoth still uses PikeVM verification. + // Guards: some patterns have DFA issues — keep UseNFA for those. if nfaSize < 20 { - return UseNFA + if hasCaseInsensitiveUnicode(re) || hasWordBoundaryAnchorCombo(re) || canMatchEmpty(re) || hasMultilineLineAnchor(re) { + return UseNFA + } + return UseDFA } // Patterns that can match empty string (e.g., `.*`, `a*`, `(a|)`) must use @@ -1564,11 +1594,14 @@ func strategyReasonComplex(strategy Strategy, n *nfa.NFA, literals *literal.Seq, return "DFA disabled in configuration" } if nfaSize < 20 { - return "tiny NFA (< 20 states), DFA overhead not worth it" + return "tiny NFA (< 20 states) with empty-match or special guards" } return "no good literals and small NFA" case UseDFA: + if nfaSize < 20 { + return "tiny NFA (< 20 states), pure DFA with tagged start states" + } if literals != nil && !literals.IsEmpty() { lcp := literals.LongestCommonPrefix() if len(lcp) >= config.MinLiteralLen { diff --git a/meta/strategy_selection_test.go b/meta/strategy_selection_test.go index 9e72c70..030a625 100644 --- a/meta/strategy_selection_test.go +++ b/meta/strategy_selection_test.go @@ -14,10 +14,10 @@ func TestStrategySelectionComprehensive(t *testing.T) { pattern string want Strategy }{ - // ========== UseNFA: tiny patterns without useful literals 
========== - {"nfa_single_char", "a", UseNFA}, - {"nfa_single_char_b", "b", UseNFA}, - {"nfa_two_char_literal", "ab", UseNFA}, + // ========== UseDFA: tiny patterns — pure DFA with tagged start states ========== + {"dfa_single_char", "a", UseDFA}, + {"dfa_single_char_b", "b", UseDFA}, + {"dfa_two_char_literal", "ab", UseDFA}, // ========== UseReverseSuffix: .*suffix patterns ========== {"rsuffix_dot_star_txt", `.*\.txt`, UseReverseSuffix}, diff --git a/meta/strategy_test.go b/meta/strategy_test.go index dbb7da0..268352b 100644 --- a/meta/strategy_test.go +++ b/meta/strategy_test.go @@ -222,7 +222,7 @@ func TestDigitPrefilterStrategySelection(t *testing.T) { {`\d+`, UseCharClassSearcher, "simple \\d+ uses CharClassSearcher"}, // Patterns with good prefix literals use UseDFA - {`123\d+`, UseNFA, "literal prefix uses NFA (tiny pattern with literals)"}, + {`123\d+`, UseDigitPrefilter, "digit-lead literal uses DigitPrefilter"}, // Non-digit patterns should NOT use UseDigitPrefilter {`[a-z]+`, UseCharClassSearcher, "letter class uses CharClassSearcher"}, diff --git a/regex.go b/regex.go index b170659..61c6373 100644 --- a/regex.go +++ b/regex.go @@ -47,6 +47,7 @@ package coregex import ( "io" + "iter" "regexp/syntax" "strings" "unsafe" @@ -696,132 +697,72 @@ func (r *Regex) FindAllIndex(b []byte, n int) [][]int { return nil } - // Fast path: CharClassSearcher uses streaming state machine (single-pass, no per-match overhead) - // This is 2-3x faster than the loop below for patterns like \w+, \d+, [a-z]+ - if r.engine.Strategy() == meta.UseCharClassSearcher { - return r.findAllIndexStreaming(b, n) - } - - var indices [][]int - pos := 0 - lastMatchEnd := -1 // Track where the last non-empty match ended - - for { - // Use zero-allocation FindIndicesAt instead of FindAt (avoids Match object creation) - start, end, found := r.engine.FindIndicesAt(b, pos) - if !found { - break - } - - // Lazy allocation: only allocate once we find the first match - if indices == nil { - // 
Pre-allocate with estimated capacity - // Heuristic: for typical patterns, estimate ~10 matches per 1KB - estimatedCap := len(b) / 100 - if estimatedCap < 4 { - estimatedCap = 4 - } - if n > 0 && estimatedCap > n { - estimatedCap = n - } - indices = make([][]int, 0, estimatedCap) - } - - // Skip empty matches that start exactly where the previous non-empty match ended. - // This matches Go's stdlib behavior: - // - "a*" on "ab" returns [[0 1] [2 2]], not [[0 1] [1 1] [2 2]] - // - After matching "a" at [0,1], an empty match at [1,1] is skipped - // - But empty matches at [2,2] (after the 'b') are allowed - //nolint:gocritic // badCond: intentional - checking empty match (start==end) at lastMatchEnd - if start == end && start == lastMatchEnd { - // Skip this empty match and try at the next position - pos++ - if pos > len(b) { - break - } - continue - } - - indices = append(indices, []int{start, end}) - - // Track non-empty match ends for the skip rule - if start != end { - lastMatchEnd = end - } - - // Move position past this match - switch { - case start == end: - // Empty match: advance by 1 to avoid infinite loop - pos = end + 1 - case end > pos: - pos = end - default: - // Fallback (shouldn't normally happen) - pos++ - } - - if pos > len(b) { - break - } - - // Check limit - if n > 0 && len(indices) >= n { - break - } - } - - return indices + // Use compact [][2]int internally, convert at the boundary. + // This reduces allocations from N+1 (one []int per match) to 2 (one flat buffer + one slice header). + compact := r.engine.FindAllIndicesStreaming(b, n, nil) + return compactToSliceOfSlice(compact) } -// findAllIndexStreaming uses single-pass streaming state machine for CharClassSearcher patterns. -// This avoids per-match function call overhead (2-3x faster than the loop approach). -// CharClassSearcher patterns like \w+, \d+, [a-z]+ cannot produce empty matches (minMatch=1), -// so the empty match handling logic is not needed here. 
-func (r *Regex) findAllIndexStreaming(b []byte, n int) [][]int { - // Get streaming results ([][2]int format) - streamResults := r.engine.FindAllIndicesStreaming(b, n, nil) - - if len(streamResults) == 0 { +// compactToSliceOfSlice converts [][2]int to [][]int using a flat buffer. +// This reduces allocations from N+1 (one []int heap alloc per match) to exactly 2: +// one flat []int buffer for all indices, one [][]int for slice headers. +// Each result[i] is a length-2/capacity-2 slice into the flat buffer. +func compactToSliceOfSlice(compact [][2]int) [][]int { + if len(compact) == 0 { return nil } - // Convert [][2]int to [][]int for stdlib-compatible API - // This allocation is necessary for API compatibility, but still faster than per-match overhead - indices := make([][]int, len(streamResults)) - for i, m := range streamResults { - indices[i] = []int{m[0], m[1]} + buf := make([]int, len(compact)*2) + result := make([][]int, len(compact)) + for i, m := range compact { + buf[i*2] = m[0] + buf[i*2+1] = m[1] + result[i] = buf[i*2 : i*2+2 : i*2+2] } - - return indices + return result } -// FindAllIndexCompact returns all successive matches as a compact [][2]int slice. -// This is a zero-allocation API (single allocation for the result slice). -// Unlike FindAllIndex which returns [][]int (N allocations for N matches), -// this method pre-allocates the entire result in one contiguous block. +// AppendAllIndex appends all successive match index pairs to dst and returns +// the extended slice. This follows the Go stdlib append pattern (like +// strconv.AppendInt) with dst as the first parameter. // -// Performance: ~2x fewer allocations than FindAllIndex for high match counts. +// Zero-allocation when dst has sufficient capacity. Unlike FindAllIndex which +// returns [][]int (N heap allocations for N matches), AppendAllIndex uses a +// flat [][2]int layout requiring at most one allocation for the backing array. // -// If n > 0, it returns at most n matches. 
If n <= 0, it returns all matches. -// The optional 'results' slice can be provided for reuse (set to nil for fresh allocation). +// If n > 0, it appends at most n matches. If n <= 0, it appends all matches. +// Pass nil as dst for a fresh allocation. // // Example: // // re := coregex.MustCompile(`\d+`) -// indices := re.FindAllIndexCompact([]byte("a1b2c3"), -1, nil) +// indices := re.AppendAllIndex(nil, []byte("a1b2c3"), -1) // // indices = [[1,2], [3,4], [5,6]] -func (r *Regex) FindAllIndexCompact(b []byte, n int, results [][2]int) [][2]int { +// +// // Reuse buffer across calls: +// buf := make([][2]int, 0, 64) +// buf = re.AppendAllIndex(buf[:0], data1, -1) +// process(buf) +// buf = re.AppendAllIndex(buf[:0], data2, -1) +// process(buf) +func (r *Regex) AppendAllIndex(dst [][2]int, b []byte, n int) [][2]int { if n == 0 { return nil } - return r.engine.FindAllIndicesStreaming(b, n, results) + return r.engine.FindAllIndicesStreaming(b, n, dst) } -// FindAllStringIndexCompact returns all successive matches as a compact [][2]int slice. -// This is the string version of FindAllIndexCompact. -func (r *Regex) FindAllStringIndexCompact(s string, n int, results [][2]int) [][2]int { - return r.FindAllIndexCompact(stringToBytes(s), n, results) +// AppendAllStringIndex appends all successive match index pairs for the string +// s to dst and returns the extended slice. This is the string version of +// AppendAllIndex. +// +// Example: +// +// re := coregex.MustCompile(`\d+`) +// indices := re.AppendAllStringIndex(nil, "a1b2c3", -1) +// // indices = [[1,2], [3,4], [5,6]] +func (r *Regex) AppendAllStringIndex(dst [][2]int, s string, n int) [][2]int { + return r.AppendAllIndex(dst, stringToBytes(s), n) } // FindAllStringIndex returns a slice of all successive matches of the pattern in s, @@ -1485,21 +1426,25 @@ func (r *Regex) FindAllSubmatchIndex(b []byte, n int) [][]int { return nil } + // All matches have same number of capture groups, so use a flat buffer. 
+ // This reduces allocations from N+1 to exactly 2 (flat buffer + slice headers). + numGroups := matches[0].NumCaptures() + stride := numGroups * 2 + buf := make([]int, len(matches)*stride) result := make([][]int, len(matches)) for i, m := range matches { - numGroups := m.NumCaptures() - indices := make([]int, numGroups*2) + base := i * stride for j := 0; j < numGroups; j++ { idx := m.GroupIndex(j) if len(idx) >= 2 { - indices[j*2] = idx[0] - indices[j*2+1] = idx[1] + buf[base+j*2] = idx[0] + buf[base+j*2+1] = idx[1] } else { - indices[j*2] = -1 - indices[j*2+1] = -1 + buf[base+j*2] = -1 + buf[base+j*2+1] = -1 } } - result[i] = indices + result[i] = buf[base : base+stride : base+stride] } return result } @@ -1516,6 +1461,119 @@ func (r *Regex) FindAllStringSubmatchIndex(s string, n int) [][]int { return r.FindAllSubmatchIndex(stringToBytes(s), n) } +// AllIndex returns an iterator over all successive non-overlapping match index +// pairs in b. Each yielded [2]int contains the start and end byte offsets of a +// match. Matches are returned left-to-right. +// +// Zero allocation: the iterator uses FindIndicesAt internally and yields +// stack-allocated [2]int values. No slice is allocated for the results. +// +// Empty match handling follows Go stdlib regexp semantics: an empty match at a +// position where a non-empty match just ended is skipped, and the search +// advances by one byte after each empty match. +// +// Example: +// +// re := coregex.MustCompile(`\d+`) +// for m := range re.AllIndex([]byte("a1b22c333")) { +// fmt.Printf("match at [%d, %d]\n", m[0], m[1]) +// } +// // Output: +// // match at [1, 2] +// // match at [3, 5] +// // match at [6, 9] +func (r *Regex) AllIndex(b []byte) iter.Seq[[2]int] { + return func(yield func([2]int) bool) { + pos := 0 + lastMatchEnd := -1 + for pos <= len(b) { + start, end, found := r.engine.FindIndicesAt(b, pos) + if !found { + return + } + // Skip empty matches at the position where a non-empty match just ended. 
+			// This matches Go stdlib behavior.
+			//nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd
+			if start == end && start == lastMatchEnd {
+				pos++
+				if pos > len(b) {
+					return
+				}
+				continue
+			}
+			if !yield([2]int{start, end}) {
+				return
+			}
+			if start != end {
+				lastMatchEnd = end
+			}
+			if start == end {
+				pos = end + 1
+			} else {
+				pos = end
+			}
+		}
+	}
+}
+
+// AllStringIndex returns an iterator over all successive non-overlapping match
+// index pairs in s. This is the string version of AllIndex.
+//
+// Example:
+//
+//	re := coregex.MustCompile(`\w+`)
+//	for m := range re.AllStringIndex("hello world") {
+//		fmt.Printf("[%d, %d]\n", m[0], m[1])
+//	}
+func (r *Regex) AllStringIndex(s string) iter.Seq[[2]int] {
+	return r.AllIndex(stringToBytes(s))
+}
+
+// All returns an iterator over all successive non-overlapping matches in b.
+// Each yielded []byte is a sub-slice of b (no copy, no allocation).
+//
+// Example:
+//
+//	re := coregex.MustCompile(`\d+`)
+//	for m := range re.All([]byte("a1b22c333")) {
+//		fmt.Println(string(m))
+//	}
+//	// Output:
+//	// 1
+//	// 22
+//	// 333
+func (r *Regex) All(b []byte) iter.Seq[[]byte] {
+	return func(yield func([]byte) bool) {
+		for m := range r.AllIndex(b) {
+			if !yield(b[m[0]:m[1]]) {
+				return
+			}
+		}
+	}
+}
+
+// AllString returns an iterator over all successive non-overlapping matches in s.
+// Each yielded string is a substring of s (no copy, no allocation).
+//
+// Example:
+//
+//	re := coregex.MustCompile(`\w+`)
+//	for word := range re.AllString("hello world") {
+//		fmt.Println(word)
+//	}
+//	// Output:
+//	// hello
+//	// world
+func (r *Regex) AllString(s string) iter.Seq[string] {
+	return func(yield func(string) bool) {
+		for m := range r.AllStringIndex(s) {
+			if !yield(s[m[0]:m[1]]) {
+				return
+			}
+		}
+	}
+}
+
 // Copy returns a new Regex object copied from re.
 // Calling Longest on one copy does not affect another. 
// diff --git a/word_digit_bench_test.go b/word_digit_bench_test.go index 584ebdf..bfe0daf 100644 --- a/word_digit_bench_test.go +++ b/word_digit_bench_test.go @@ -59,22 +59,22 @@ func BenchmarkAlphaDigit_1MB_Coregex(b *testing.B) { } } -// Compact API benchmarks - zero per-match allocations -func BenchmarkWordDigit_1MB_CoregexCompact(b *testing.B) { +// AppendAllIndex benchmarks - zero per-match allocations +func BenchmarkWordDigit_1MB_CoregexAppend(b *testing.B) { re := MustCompile(`\w+[0-9]+`) b.SetBytes(int64(len(benchData))) b.ResetTimer() for i := 0; i < b.N; i++ { - re.FindAllIndexCompact(benchData, -1, nil) + re.AppendAllIndex(nil, benchData, -1) } } -func BenchmarkWordDigit_1MB_CoregexCompactReuse(b *testing.B) { +func BenchmarkWordDigit_1MB_CoregexAppendReuse(b *testing.B) { re := MustCompile(`\w+[0-9]+`) results := make([][2]int, 0, 65536) b.SetBytes(int64(len(benchData))) b.ResetTimer() for i := 0; i < b.N; i++ { - results = re.FindAllIndexCompact(benchData, -1, results) + results = re.AppendAllIndex(results[:0], benchData, -1) } }