Merged
27 changes: 27 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120))
- SIMD prefilter for CompositeSequenceDFA (#83)

## [0.12.17] - 2026-03-23

### Fixed
- **Remove false DFA downgrade for `(?m)^` patterns** — `adjustForAnchors()`
incorrectly routed `(?m)^` multiline patterns from UseDFA to UseNFA, claiming
"DFA can't verify multiline line anchors". This is false — the lazy DFA already
handles `(?m)^` correctly via StartByteMap/StartLineLF (identical to Rust regex).
The downgrade caused 4 LangArena patterns (`api_calls`, `post_requests`,
`passwords`, `sessions`) to fall back to byte-by-byte NFA scan — catastrophic
on ARM64 without SIMD. LangArena total: 2335ms → **185ms** (12.6x faster).

- **Restore partial prefilter for `(?i)` alternation overflow** — literal
extractor returned empty Seq on cross-product overflow (>250 variants),
killing all prefilter literals for patterns like `(?i)(eval|system|exec|...)`.
Now trims to 3-byte prefixes + dedup (Rust approach) and marks inexact.
Also guards NFA candidate loop with `IsComplete()` check — incomplete
prefilters skip candidate loop (NFA scans full input), preventing
correctness bugs from partial branch coverage.
`suspicious` pattern: UseNFA without prefilter (113ms) → UseNFA with
FatTeddy skip-ahead (**1ms**).
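  The trim-and-dedup step described above can be sketched as follows. This is an
  illustrative stand-alone function, not coregex's actual `Seq` API; the point is
  that every branch keeps at least one (now inexact) prefix, so branch coverage
  survives the trim:

  ```go
  package main

  import (
  	"fmt"
  	"sort"
  )

  // trimAndDedup cuts each literal to at most n leading bytes and removes
  // duplicates. After trimming, every original alternation branch still
  // contributes at least one prefix, unlike overflow truncation which can
  // drop branches entirely. The result is inexact: the prefilter only
  // proposes candidates, and the DFA/NFA must verify each one.
  func trimAndDedup(lits []string, n int) []string {
  	seen := make(map[string]bool)
  	var out []string
  	for _, lit := range lits {
  		if len(lit) > n {
  			lit = lit[:n]
  		}
  		if !seen[lit] {
  			seen[lit] = true
  			out = append(out, lit)
  		}
  	}
  	sort.Strings(out)
  	return out
  }

  func main() {
  	// Case-folded variants collapse heavily after a 3-byte trim.
  	lits := []string{"system", "SYSTEM", "syscall", "eval", "EVAL", "exec"}
  	fmt.Println(trimAndDedup(lits, 3)) // → [EVA SYS eva exe sys]
  }
  ```

  Six input literals collapse to five 3-byte prefixes here; for a real `(?i)`
  cross-product with hundreds of variants the reduction is far larger, which is
  what lets the set fit prefilter capacity again.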

- **Restore UseTeddy for `(?m)^` multiline patterns** — `selectLiteralStrategy`
blocked UseTeddy for any pattern with anchors. But `adjustForAnchors()` already
wraps the prefilter with `WrapLineAnchor` for `(?m)^`, making Teddy safe.
Now allows UseTeddy when the only anchors present are `(?m)^` (no `\b`, `$`, etc.).
`http_methods` on macOS ARM64: 89ms → **<1ms** (restored to v0.12.14 level).
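  The O(1) verification that makes Teddy safe for `(?m)^` amounts to a
  line-start check per candidate. A minimal sketch of that check (the name
  `atLineStart` is illustrative; `WrapLineAnchor`'s internals are not shown in
  this diff):

  ```go
  package main

  import "fmt"

  // atLineStart reports whether pos is a valid (?m)^ match position: the
  // start of the haystack, or the byte immediately after a '\n'. A line-anchor
  // wrapper can apply this constant-time check to each Teddy candidate and
  // discard candidates that fall mid-line, with no DFA involvement.
  func atLineStart(haystack []byte, pos int) bool {
  	return pos == 0 || haystack[pos-1] == '\n'
  }

  func main() {
  	h := []byte("GET /a\nPOST /b")
  	// pos 0: haystack start; pos 7: right after '\n'; pos 4: mid-line.
  	fmt.Println(atLineStart(h, 0), atLineStart(h, 7), atLineStart(h, 4)) // → true true false
  }
  ```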

## [0.12.16] - 2026-03-21

### Performance
16 changes: 8 additions & 8 deletions README.md
@@ -66,14 +66,14 @@ Cross-language benchmarks on 6MB input, AMD EPYC ([source](https://github.com/ko
|---------|-----------|---------|------------|-----------|---------|
| Literal alternation | 475 ms | 4.4 ms | 0.6 ms | **108x** | 7.1x slower |
| Multi-literal | 1412 ms | 12.8 ms | 4.7 ms | **110x** | 2.7x slower |
| Inner `.*keyword.*` | 234 ms | 0.35 ms | 0.28 ms | **667x** | 1.2x slower |
| Suffix `.*\.txt` | 236 ms | 1.83 ms | 1.08 ms | **128x** | 1.6x slower |
| Multiline `(?m)^/.*\.php` | 104 ms | 0.50 ms | 0.68 ms | **207x** | **1.3x faster** |
| Email validation | 262 ms | 0.50 ms | 0.23 ms | **523x** | 2.1x slower |
| URL extraction | 258 ms | 0.61 ms | 0.35 ms | **422x** | 1.7x slower |
| IP address | 497 ms | 2.2 ms | 12.0 ms | **229x** | **5.5x faster** |
| Char class `[\w]+` | 579 ms | 41.0 ms | 50.1 ms | **14x** | **1.2x faster** |
| Word repeat `(\w{2,8})+` | 652 ms | 186 ms | 48.3 ms | **3x** | 3.8x slower |
| Inner `.*keyword.*` | 232 ms | 0.30 ms | 0.27 ms | **774x** | 1.1x slower |
| Suffix `.*\.txt` | 236 ms | 1.82 ms | 1.13 ms | **129x** | 1.6x slower |
| Multiline `(?m)^/.*\.php` | 103 ms | 0.50 ms | 0.67 ms | **206x** | **1.3x faster** |
| Email validation | 265 ms | 0.62 ms | 0.27 ms | **428x** | 2.2x slower |
| URL extraction | 353 ms | 0.65 ms | 0.35 ms | **543x** | 1.8x slower |
| IP address | 496 ms | 2.1 ms | 12.1 ms | **231x** | **5.6x faster** |
| Char class `[\w]+` | 581 ms | 51.2 ms | 50.2 ms | **11x** | ~parity |
| Word repeat `(\w{2,8})+` | 712 ms | 186 ms | 48.7 ms | **3x** | 3.8x slower |

**Where coregex excels:**
- Multiline patterns (`(?m)^/.*\.php`) — near Rust parity, 100x+ vs stdlib
21 changes: 9 additions & 12 deletions literal/extractor.go
@@ -264,20 +264,17 @@ func (e *Extractor) extractPrefixesAlternate(re *syntax.Regexp, depth int) *Seq
}
}

// If overflow occurred, NOT all alternation branches are represented.
// A partial prefilter would miss matches for unrepresented branches.
// Return empty Seq so no prefilter is built — NFA handles all branches.
// This matches Rust's approach: overflowed literal sets → no prefilter.
if overflowed {
return NewSeq()
}

result := NewSeq(allLits...)

if result.Len() > e.config.MaxLiterals {
// Too many literals but all branches represented: trim to 3-byte
// prefixes, dedup, mark inexact. After trim, all alternation branches
// have at least one prefix in the set (unlike overflow truncation).
if overflowed || result.Len() > e.config.MaxLiterals {
// Either not all branches are represented (overflow) or too many literals.
// Trim to 3-byte prefixes + dedup to fit prefilter capacity.
// Mark ALL as inexact — prefilter is used for skip-ahead only,
// DFA/NFA verifies each candidate (safe with partial coverage).
//
// Rust does the same: optimize_for_prefix_by_preference trims and deduplicates.
// A partial prefilter is much better than no prefilter — DFA with skip-ahead
// vs NFA byte-by-byte on 549 states is 100x+ difference on ARM64.
result.KeepFirstBytes(3)
e.markAllInexact(result)
result.Dedup()
12 changes: 6 additions & 6 deletions meta/compile.go
@@ -630,9 +630,13 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
}, nil
}

// adjustForAnchors fixes prefilter and strategy for patterns with anchors.
// adjustForAnchors fixes prefilter for patterns with anchors.
// Anchors (^, $, \b) require verification that Teddy/AC prefilter can't provide.
// Multiline line anchors ((?m)^) need NFA because DFA doesn't verify line positions.
//
// Note: the lazy DFA correctly handles (?m)^ via StartByteMap — after \n it
// selects StartLineLF which includes LookStartLine in the epsilon closure.
// Verified with direct DFA tests and Rust source analysis (identical approach).
// See docs/dev/research/v01216-arm64-regression.md for details.
func adjustForAnchors(pf prefilter.Prefilter, strategy Strategy, re *syntax.Regexp) (prefilter.Prefilter, Strategy) {
if !hasAnchorAssertions(re) {
return pf, strategy
@@ -654,10 +658,6 @@ func adjustForAnchors(pf prefilter.Prefilter, strategy Strategy, re *syntax.Rege
}
}

// DFA can't verify (?m)^ multiline line anchors — use NFA
if strategy == UseDFA && hasMultilineAnchor {
strategy = UseNFA
}
return pf, strategy
}

30 changes: 14 additions & 16 deletions meta/find_indices.go
@@ -119,8 +119,11 @@ func (e *Engine) findIndicesNFA(haystack []byte) (int, int, bool) {
state := e.getSearchState()
defer e.putSearchState(state)

// Use prefilter for skip-ahead if available
if e.prefilter != nil {
// Use prefilter candidate loop for skip-ahead — but ONLY when prefilter
// covers all possible match positions (IsComplete or all branches represented).
// Incomplete prefilters (partial case-fold coverage) cannot be used as
// correctness gates — they'd miss branches whose literals were truncated.
if e.prefilter != nil && e.prefilter.IsComplete() {
at := 0
for at < len(haystack) {
// Find next candidate position via prefilter
@@ -172,17 +175,15 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) {
state := e.getSearchState()
defer e.putSearchState(state)

// Use prefilter for skip-ahead if available
if e.prefilter != nil {
// Use prefilter candidate loop — only safe with complete prefilter
if e.prefilter != nil && e.prefilter.IsComplete() {
for at < len(haystack) {
// Find next candidate position via prefilter
pos := e.prefilter.Find(haystack, at)
if pos == -1 {
return -1, -1, false // No more candidates
return -1, -1, false
}
atomic.AddUint64(&e.stats.PrefilterHits, 1)

// Try to match at candidate position
var start, end int
var found bool
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-pos) {
Expand All @@ -194,14 +195,13 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) {
return start, end, true
}

// Move past this position
atomic.AddUint64(&e.stats.PrefilterMisses, 1)
at = pos + 1
}
return -1, -1, false
}

// No prefilter: use BoundedBacktracker if available and safe
// No prefilter or incomplete: use BoundedBacktracker if available and safe
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-at) {
return e.boundedBacktracker.SearchAtWithState(haystack, at, state.backtracker)
}
@@ -1028,17 +1028,16 @@ func (e *Engine) findIndicesNFAAtWithState(haystack []byte, at int, state *Searc
// BoundedBacktracker can be used for Find operations only when safe
useBT := e.boundedBacktracker != nil && !e.canMatchEmpty

// Use prefilter for skip-ahead if available
if e.prefilter != nil {
// Use prefilter candidate loop — only safe with complete prefilter.
// Incomplete prefilters (partial case-fold coverage) would miss branches.
if e.prefilter != nil && e.prefilter.IsComplete() {
for at < len(haystack) {
// Find next candidate position via prefilter
pos := e.prefilter.Find(haystack, at)
if pos == -1 {
return -1, -1, false // No more candidates
return -1, -1, false
}
atomic.AddUint64(&e.stats.PrefilterHits, 1)

// Try to match at candidate position
var start, end int
var found bool
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-pos) {
Expand All @@ -1050,14 +1049,13 @@ func (e *Engine) findIndicesNFAAtWithState(haystack []byte, at int, state *Searc
return start, end, true
}

// Move past this position
atomic.AddUint64(&e.stats.PrefilterMisses, 1)
at = pos + 1
}
return -1, -1, false
}

// No prefilter: use BoundedBacktracker if available and safe
// No prefilter or incomplete: use BoundedBacktracker if available and safe
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-at) {
return e.boundedBacktracker.SearchAtWithState(haystack, at, state.backtracker)
}
13 changes: 9 additions & 4 deletions meta/strategy.go
@@ -1133,7 +1133,8 @@ type literalAnalysis struct {
hasGoodLiterals bool // Good prefix literal (LCP >= MinLiteralLen)
hasTeddyLiterals bool // Suitable for Teddy (2-32 patterns, each >= 3 bytes)
hasAhoCorasickLiterals bool // Suitable for Aho-Corasick (>32 patterns, each >= 1 byte)
hasAnchors bool // Pattern has anchors (^, $, \b) that Teddy can't verify
hasAnchors bool // Pattern has any anchors (^, $, \b)
hasNonLineAnchors bool // Pattern has anchors other than (?m)^ (\b, $, \A, \z)
}

// selectLiteralStrategy selects strategy based on literal analysis.
Expand All @@ -1148,9 +1149,12 @@ func selectLiteralStrategy(literals *literal.Seq, litAnalysis literalAnalysis) S
// Patterns like "(foo|bar|baz)" where all literals are complete don't need
// DFA verification - Teddy.Find() returns exact matches.
// Speedup: 50-250x by skipping all DFA/NFA construction overhead.
// BUT: patterns with anchors (e.g., (?m)^GET|POST) need DFA to verify
// that the match position satisfies the anchor constraint.
if litAnalysis.hasTeddyLiterals && literals.AllComplete() && !litAnalysis.hasAnchors {
//
// For (?m)^ multiline anchors: adjustForAnchors() wraps the prefilter with
// WrapLineAnchor which adds O(1) line-start verification. This makes Teddy
// safe for (?m)^ patterns — no DFA needed.
// Only block Teddy for non-line anchors (\b, $, \A, \z) that need DFA verify.
if litAnalysis.hasTeddyLiterals && literals.AllComplete() && !litAnalysis.hasNonLineAnchors {
return UseTeddy
}

@@ -1418,6 +1422,7 @@ func SelectStrategy(n *nfa.NFA, re *syntax.Regexp, literals *literal.Seq, config
nfaSize := n.States()
litAnalysis := analyzeLiterals(literals, config)
litAnalysis.hasAnchors = hasAnchorAssertions(re)
litAnalysis.hasNonLineAnchors = litAnalysis.hasAnchors && hasNonLineAnchors(re)

// Check for simple char_class+ patterns (HIGHEST priority for character class patterns)
// Patterns like [\w]+, [a-z]+, \d+ use CharClassSearcher: 14-17x faster than BoundedBacktracker