diff --git a/CHANGELOG.md b/CHANGELOG.md index a9060b6..38c7532 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.17] - 2026-03-23 + +### Fixed +- **Remove false DFA downgrade for `(?m)^` patterns** — `adjustForAnchors()` + incorrectly routed `(?m)^` multiline patterns from UseDFA to UseNFA, claiming + "DFA can't verify multiline line anchors". This is false — the lazy DFA already + handles `(?m)^` correctly via StartByteMap/StartLineLF (identical to Rust regex). + The downgrade caused 4 LangArena patterns (`api_calls`, `post_requests`, + `passwords`, `sessions`) to fall back to byte-by-byte NFA scan — catastrophic + on ARM64 without SIMD. LangArena total: 2335ms → **185ms** (12.6x faster). + +- **Restore partial prefilter for `(?i)` alternation overflow** — literal + extractor returned empty Seq on cross-product overflow (>250 variants), + killing all prefilter literals for patterns like `(?i)(eval|system|exec|...)`. + Now trims to 3-byte prefixes + dedup (Rust approach) and marks inexact. + Also guards NFA candidate loop with `IsComplete()` check — incomplete + prefilters skip candidate loop (NFA scans full input), preventing + correctness bugs from partial branch coverage. + `suspicious` pattern: UseNFA without prefilter (113ms) → UseNFA with + FatTeddy skip-ahead (**1ms**). + +- **Restore UseTeddy for `(?m)^` multiline patterns** — `selectLiteralStrategy` + blocked UseTeddy for any pattern with anchors. But `adjustForAnchors()` already + wraps the prefilter with `WrapLineAnchor` for `(?m)^`, making Teddy safe. + Now allows UseTeddy when anchors are only `(?m)^` (no \b, $, etc). + `http_methods` on macOS ARM64: 89ms → **<1ms** (restored to v0.12.14 level). 
+ ## [0.12.16] - 2026-03-21 ### Performance diff --git a/README.md b/README.md index 4a4c1e0..355e187 100644 --- a/README.md +++ b/README.md @@ -66,14 +66,14 @@ Cross-language benchmarks on 6MB input, AMD EPYC ([source](https://github.com/ko |---------|-----------|---------|------------|-----------|---------| | Literal alternation | 475 ms | 4.4 ms | 0.6 ms | **108x** | 7.1x slower | | Multi-literal | 1412 ms | 12.8 ms | 4.7 ms | **110x** | 2.7x slower | -| Inner `.*keyword.*` | 234 ms | 0.35 ms | 0.28 ms | **667x** | 1.2x slower | -| Suffix `.*\.txt` | 236 ms | 1.83 ms | 1.08 ms | **128x** | 1.6x slower | -| Multiline `(?m)^/.*\.php` | 104 ms | 0.50 ms | 0.68 ms | **207x** | **1.3x faster** | -| Email validation | 262 ms | 0.50 ms | 0.23 ms | **523x** | 2.1x slower | -| URL extraction | 258 ms | 0.61 ms | 0.35 ms | **422x** | 1.7x slower | -| IP address | 497 ms | 2.2 ms | 12.0 ms | **229x** | **5.5x faster** | -| Char class `[\w]+` | 579 ms | 41.0 ms | 50.1 ms | **14x** | **1.2x faster** | -| Word repeat `(\w{2,8})+` | 652 ms | 186 ms | 48.3 ms | **3x** | 3.8x slower | +| Inner `.*keyword.*` | 232 ms | 0.30 ms | 0.27 ms | **774x** | 1.1x slower | +| Suffix `.*\.txt` | 236 ms | 1.82 ms | 1.13 ms | **129x** | 1.6x slower | +| Multiline `(?m)^/.*\.php` | 103 ms | 0.50 ms | 0.67 ms | **206x** | **1.3x faster** | +| Email validation | 265 ms | 0.62 ms | 0.27 ms | **428x** | 2.2x slower | +| URL extraction | 353 ms | 0.65 ms | 0.35 ms | **543x** | 1.8x slower | +| IP address | 496 ms | 2.1 ms | 12.1 ms | **231x** | **5.6x faster** | +| Char class `[\w]+` | 581 ms | 51.2 ms | 50.2 ms | **11x** | ~parity | +| Word repeat `(\w{2,8})+` | 712 ms | 186 ms | 48.7 ms | **3x** | 3.8x slower | **Where coregex excels:** - Multiline patterns (`(?m)^/.*\.php`) — near Rust parity, 100x+ vs stdlib diff --git a/literal/extractor.go b/literal/extractor.go index d7a117d..f3d0cad 100644 --- a/literal/extractor.go +++ b/literal/extractor.go @@ -264,20 +264,17 @@ func (e *Extractor) 
extractPrefixesAlternate(re *syntax.Regexp, depth int) *Seq } } - // If overflow occurred, NOT all alternation branches are represented. - // A partial prefilter would miss matches for unrepresented branches. - // Return empty Seq so no prefilter is built — NFA handles all branches. - // This matches Rust's approach: overflowed literal sets → no prefilter. - if overflowed { - return NewSeq() - } - result := NewSeq(allLits...) - if result.Len() > e.config.MaxLiterals { - // Too many literals but all branches represented: trim to 3-byte - // prefixes, dedup, mark inexact. After trim, all alternation branches - // have at least one prefix in the set (unlike overflow truncation). + if overflowed || result.Len() > e.config.MaxLiterals { + // Either not all branches are represented (overflow) or too many literals. + // Trim to 3-byte prefixes + dedup to fit prefilter capacity. + // Mark ALL as inexact — every hit is only a candidate for DFA/NFA to verify. + // After overflow, coverage is partial: never rely on it to rule out matches. + // + // Rust does the same: optimize_for_prefix_by_preference trims and deduplicates. + // A partial prefilter is much better than no prefilter — DFA with skip-ahead + // vs NFA byte-by-byte on 549 states is 100x+ difference on ARM64. result.KeepFirstBytes(3) e.markAllInexact(result) result.Dedup() diff --git a/meta/compile.go b/meta/compile.go index 92b6853..921866f 100644 --- a/meta/compile.go +++ b/meta/compile.go @@ -630,9 +630,13 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { }, nil } -// adjustForAnchors fixes prefilter and strategy for patterns with anchors. +// adjustForAnchors fixes prefilter for patterns with anchors. // Anchors (^, $, \b) require verification that Teddy/AC prefilter can't provide. -// Multiline line anchors ((?m)^) need NFA because DFA doesn't verify line positions. 
+// +// Note: the lazy DFA correctly handles (?m)^ via StartByteMap — after \n it +// selects StartLineLF which includes LookStartLine in the epsilon closure. +// Verified with direct DFA tests and Rust source analysis (identical approach). +// See docs/dev/research/v01216-arm64-regression.md for details. func adjustForAnchors(pf prefilter.Prefilter, strategy Strategy, re *syntax.Regexp) (prefilter.Prefilter, Strategy) { if !hasAnchorAssertions(re) { return pf, strategy @@ -654,10 +658,6 @@ func adjustForAnchors(pf prefilter.Prefilter, strategy Strategy, re *syntax.Rege } } - // DFA can't verify (?m)^ multiline line anchors — use NFA - if strategy == UseDFA && hasMultilineAnchor { - strategy = UseNFA - } return pf, strategy } diff --git a/meta/find_indices.go b/meta/find_indices.go index 6e07856..80ff8c9 100644 --- a/meta/find_indices.go +++ b/meta/find_indices.go @@ -119,8 +119,11 @@ func (e *Engine) findIndicesNFA(haystack []byte) (int, int, bool) { state := e.getSearchState() defer e.putSearchState(state) - // Use prefilter for skip-ahead if available - if e.prefilter != nil { + // Use prefilter candidate loop for skip-ahead — but ONLY when the prefilter + // reports IsComplete(), which is the sole condition checked below. + // Incomplete prefilters (partial case-fold coverage) cannot be used as + // correctness gates — they'd miss branches whose literals were truncated. 
+ if e.prefilter != nil && e.prefilter.IsComplete() { at := 0 for at < len(haystack) { // Find next candidate position via prefilter @@ -172,17 +175,15 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) { state := e.getSearchState() defer e.putSearchState(state) - // Use prefilter for skip-ahead if available - if e.prefilter != nil { + // Use prefilter candidate loop — only safe with complete prefilter + if e.prefilter != nil && e.prefilter.IsComplete() { for at < len(haystack) { - // Find next candidate position via prefilter pos := e.prefilter.Find(haystack, at) if pos == -1 { - return -1, -1, false // No more candidates + return -1, -1, false } atomic.AddUint64(&e.stats.PrefilterHits, 1) - // Try to match at candidate position var start, end int var found bool if useBT && e.boundedBacktracker.CanHandle(len(haystack)-pos) { @@ -194,14 +195,13 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) { return start, end, true } - // Move past this position atomic.AddUint64(&e.stats.PrefilterMisses, 1) at = pos + 1 } return -1, -1, false } - // No prefilter: use BoundedBacktracker if available and safe + // No prefilter or incomplete: use BoundedBacktracker if available and safe if useBT && e.boundedBacktracker.CanHandle(len(haystack)-at) { return e.boundedBacktracker.SearchAtWithState(haystack, at, state.backtracker) } @@ -1028,17 +1028,16 @@ func (e *Engine) findIndicesNFAAtWithState(haystack []byte, at int, state *Searc // BoundedBacktracker can be used for Find operations only when safe useBT := e.boundedBacktracker != nil && !e.canMatchEmpty - // Use prefilter for skip-ahead if available - if e.prefilter != nil { + // Use prefilter candidate loop — only safe with complete prefilter. + // Incomplete prefilters (partial case-fold coverage) would miss branches. 
+ if e.prefilter != nil && e.prefilter.IsComplete() { for at < len(haystack) { - // Find next candidate position via prefilter pos := e.prefilter.Find(haystack, at) if pos == -1 { - return -1, -1, false // No more candidates + return -1, -1, false } atomic.AddUint64(&e.stats.PrefilterHits, 1) - // Try to match at candidate position var start, end int var found bool if useBT && e.boundedBacktracker.CanHandle(len(haystack)-pos) { @@ -1050,14 +1049,13 @@ func (e *Engine) findIndicesNFAAtWithState(haystack []byte, at int, state *Searc return start, end, true } - // Move past this position atomic.AddUint64(&e.stats.PrefilterMisses, 1) at = pos + 1 } return -1, -1, false } - // No prefilter: use BoundedBacktracker if available and safe + // No prefilter or incomplete: use BoundedBacktracker if available and safe if useBT && e.boundedBacktracker.CanHandle(len(haystack)-at) { return e.boundedBacktracker.SearchAtWithState(haystack, at, state.backtracker) } diff --git a/meta/strategy.go b/meta/strategy.go index 92b22c9..8f4caa0 100644 --- a/meta/strategy.go +++ b/meta/strategy.go @@ -1133,7 +1133,8 @@ type literalAnalysis struct { hasGoodLiterals bool // Good prefix literal (LCP >= MinLiteralLen) hasTeddyLiterals bool // Suitable for Teddy (2-32 patterns, each >= 3 bytes) hasAhoCorasickLiterals bool // Suitable for Aho-Corasick (>32 patterns, each >= 1 byte) - hasAnchors bool // Pattern has anchors (^, $, \b) that Teddy can't verify + hasAnchors bool // Pattern has any anchors (^, $, \b) + hasNonLineAnchors bool // Pattern has anchors other than (?m)^ (\b, $, \A, \z) } // selectLiteralStrategy selects strategy based on literal analysis. @@ -1148,9 +1149,12 @@ func selectLiteralStrategy(literals *literal.Seq, litAnalysis literalAnalysis) S // Patterns like "(foo|bar|baz)" where all literals are complete don't need // DFA verification - Teddy.Find() returns exact matches. // Speedup: 50-250x by skipping all DFA/NFA construction overhead. 
- // BUT: patterns with anchors (e.g., (?m)^GET|POST) need DFA to verify - // that the match position satisfies the anchor constraint. - if litAnalysis.hasTeddyLiterals && literals.AllComplete() && !litAnalysis.hasAnchors { + // + // For (?m)^ multiline anchors: adjustForAnchors() wraps the prefilter with + // WrapLineAnchor which adds O(1) line-start verification. This makes Teddy + // safe for (?m)^ patterns — no DFA needed. + // Only block Teddy for non-line anchors (\b, $, \A, \z) that need DFA verify. + if litAnalysis.hasTeddyLiterals && literals.AllComplete() && !litAnalysis.hasNonLineAnchors { return UseTeddy } @@ -1418,6 +1422,7 @@ func SelectStrategy(n *nfa.NFA, re *syntax.Regexp, literals *literal.Seq, config nfaSize := n.States() litAnalysis := analyzeLiterals(literals, config) litAnalysis.hasAnchors = hasAnchorAssertions(re) + litAnalysis.hasNonLineAnchors = litAnalysis.hasAnchors && hasNonLineAnchors(re) // Check for simple char_class+ patterns (HIGHEST priority for character class patterns) // Patterns like [\w]+, [a-z]+, \d+ use CharClassSearcher: 14-17x faster than BoundedBacktracker