
Commit ab4039b

perf: v0.12.19 — zero-alloc captures, 95% less memory (#152)
* perf: remove dual transition storage — State.transitions eliminated

  Remove transitions []StateID and transitionCount from State struct.
  Transitions now stored exclusively in DFACache.flatTrans flat table.

  - Remove State.AddTransition(), Transition(), Stride(), TransitionCount()
  - Remove Builder.move() (unused after DetectAcceleration simplification)
  - Simplify DetectAcceleration/DetectAccelerationFromCached to return nil
  - Add DetectAccelerationFromFlat() reading from flat table
  - Simplify tryDetectAccelerationWithCache (flatTrans-only path)
  - Remove 3 redundant AddTransition calls from determinize
  - Update tests: add TestDetectAccelerationFromFlat, remove State transition tests

  Memory: ~222MB -> ~150MB (eliminates redundant per-state transition slices)

* perf: Rust-aligned BT visited limit for UseNFA — 72% less memory

  Add NewBoundedBacktrackerSmall() with 128K entries (256KB) visited capacity,
  matching Rust regex's default visited_capacity. UseNFA path now creates BT
  with the small limit. When the haystack exceeds BT capacity, falls back to
  PikeVM (correct for leftmost-first). UseBoundedBacktracker strategy retains
  the 32M limit for POSIX longest-match.

  LangArena LogParser (7MB log, 13 patterns, 10 iterations):
  - Total alloc: 89MB -> 25MB (-72%)
  - RSS (Sys): 353MB -> 41MB (-88%)
  - errors pattern: 66MB -> 2.4MB (-96%)
  - Speed: no regression (113-126ms per iter)

* perf: byte-based DFA cache limit — 2MB default like Rust

  Replace MaxStates (count) with CacheCapacityBytes (bytes). Default: 2MB,
  matching Rust regex's hybrid_cache_capacity.

  - Add DFACache.MemoryUsage() (mirrors Rust Cache::memory_usage)
  - Insert checks MemoryUsage() >= capacityBytes instead of state count
  - Config: CacheCapacityBytes (new), MaxStates (deprecated, backward compat)
  - Self-adjusting: fewer states for large stride, more for small
  - effectiveCapacityBytes() bridges legacy MaxStates to bytes (~100B/state)

* wip: SlotTable-based capture search — greedy loop capture bug

  SearchWithSlotTableCapturesAt now uses SlotTable instead of legacy COW.
  Works for simple patterns like (foo)(bar), but greedy repetitions (a+)(b+)
  lose group start positions during loop iterations. Root cause:
  addSearchThread CopySlots overwrites capture slots on each loop iteration.
  Need stack-based epsilon closure with RestoreCapture frames (Rust approach)
  to preserve capture context through loops.

  TODO: Convert recursive addSearchThread to stack-based with save/restore
  Status: 2 NFA unit test failures, all meta tests pass (meta still on COW)

* wip: stack-based epsilon closure with RestoreCapture

  Converted addSearchThread and addSearchThreadToNext from recursive to
  stack-based with captureFrame (Explore + RestoreCapture frames). Mirrors
  Rust pikevm.rs FollowEpsilon::RestoreCapture pattern.

  Still failing: greedy loop captures (a+)(b+) — per-state SlotTable
  overwrites group start on each loop iteration (state visited again in the
  next generation). Per-thread COW preserves all variants. Root issue:
  per-state storage loses capture history across byte transitions in greedy
  loops. Need either per-thread indexing or generation-aware slot
  preservation.

  Status: 2 NFA unit tests fail, all meta tests pass

* feat: dual SlotTable capture tracking — zero-alloc FindSubmatch

  Implement Rust-style dual SlotTable (curr/next) for capture propagation
  across byte transitions. Stack-based epsilon closure with RestoreCapture
  frames preserves capture context through greedy loops.

  Key changes:
  - Add NextSlotTable + captureStack + currSlots to PikeVMState
  - addSearchThread: stack-based with captureFrame (Explore + RestoreCapture)
  - addSearchThreadToNext: loads from curr SlotTable, writes to next
  - Swap SlotTable/NextSlotTable after each byte (Rust mem::swap pattern)
  - Don't clear Visited before seed — prevents SlotTable row overwrite
  - Wire meta FindSubmatch to use SlotTable path
  - Fix empty match capture groups (buildCapturesFromSlots)

  FindAllSubmatch (5 patterns, 50K matches, 800KB input):
  - Alloc: 554MB -> 26MB (-95%)
  - Mallocs: 12.5M -> 440K (-96%)
  - Time: 1.48s -> 0.45s (3.3x faster)

* docs: update CHANGELOG, OPTIMIZATIONS, add ARCHITECTURE.md for v0.12.19

  - CHANGELOG: add SlotTable capture tracking entry
  - OPTIMIZATIONS: add #10 Dual SlotTable (95% less memory), update version
  - ARCHITECTURE.md: new file documenting engine architecture, memory model,
    thread safety, and Rust alignment
1 parent 921d193 commit ab4039b

18 files changed

Lines changed: 935 additions & 550 deletions

CHANGELOG.md

Lines changed: 29 additions & 0 deletions
@@ -12,6 +12,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120))
 - SIMD prefilter for CompositeSequenceDFA (#83)
 
+## [0.12.19] - 2026-03-24
+
+### Performance
+
+- **Zero-alloc FindSubmatch via dual SlotTable** (Rust approach) — replaced per-thread
+  COW capture allocation with Rust-style flat SlotTable. Two SlotTables (curr/next)
+  swap between byte generations. Stack-based epsilon closure with RestoreCapture
+  frames preserves capture context through greedy loops. FindAllSubmatch (5 patterns,
+  50K matches, 800KB input): alloc **554MB → 26MB** (-95%), mallocs **12.5M → 440K**
+  (-96%), time **1.48s → 0.45s** (3.3x faster). Reference: Rust `pikevm.rs`
+  `ActiveStates` + `SlotTable` + `FollowEpsilon::RestoreCapture`.
+
+- **Rust-aligned BoundedBacktracker visited limit for UseNFA** — reduced visited
+  table capacity from 32M entries (64MB) to 128K entries (256KB) for UseNFA paths,
+  matching Rust regex's `visited_capacity` default. On Kostya's LangArena LogParser
+  (7MB log, 13 patterns): total alloc **89MB → 25MB** (-72%), RSS **353MB → 41MB**
+  (-88%). `errors` pattern: **66MB → 2.4MB** (-96%). No speed regression.
+  `UseBoundedBacktracker` strategy retains the full 32M limit for POSIX longest-match
+  correctness (Go stdlib compatibility).
+
+- **Byte-based DFA cache limit** (Rust approach) — replaced the `MaxStates` count limit
+  with `CacheCapacityBytes` (default 2MB, matching Rust's `hybrid_cache_capacity`).
+  The cache limit is now self-adjusting: fewer states for large alphabets, more for small.
+  Added `MemoryUsage()` method for runtime cache introspection.
+
+- **Remove dual transition storage** — eliminated `transitions []StateID` and
+  `transitionCount` from the `State` struct. Transitions are now stored exclusively in
+  `DFACache.flatTrans`. Acceleration detection migrated to `DetectAccelerationFromFlat()`
+  reading directly from the flat table.
+
 ## [0.12.18] - 2026-03-24
 
 ### Performance

README.md

Lines changed: 14 additions & 11 deletions
@@ -20,7 +20,7 @@ High-performance regex engine for Go. Drop-in replacement for `regexp` with **3-
 Go's stdlib `regexp` is intentionally simple — single NFA engine, no optimizations. This guarantees O(n) time but leaves performance on the table.
 
 coregex brings Rust regex-crate architecture to Go:
-- **Multi-engine**: Lazy DFA, PikeVM, OnePass, BoundedBacktracker
+- **Multi-engine**: 17 strategies — Lazy DFA, PikeVM, OnePass, BoundedBacktracker, and more
 - **SIMD prefilters**: AVX2/SSSE3 for fast candidate rejection
 - **Reverse search**: Suffix/inner literal patterns run 1000x+ faster
 - **O(n) guarantee**: No backtracking, no ReDoS vulnerabilities
@@ -187,20 +187,23 @@ Uses Go's `regexp/syntax` parser:
 ```
 Pattern → Parse → NFA → Literal Extract → Strategy Select
 
-┌─────────────────────────────────┐
-│ Engines (17 strategies):        │
-│ LazyDFA, PikeVM, OnePass,       │
-│ BoundedBacktracker,             │
-│ ReverseInner, ReverseSuffix,    │
-│ ReverseSuffixSet, AnchoredLiteral, │
-│ CharClassSearcher, Teddy,       │
-│ DigitPrefilter, AhoCorasick,    │
-│ CompositeSearcher, BranchDispatch │
-└─────────────────────────────────┘
+┌────────────────────────────────────────────┐
+│ Engines (17 strategies):                   │
+│ LazyDFA, PikeVM, OnePass,                  │
+│ BoundedBacktracker, ReverseAnchored,       │
+│ ReverseInner, ReverseSuffix,               │
+│ ReverseSuffixSet, MultilineReverseSuffix,  │
+│ AnchoredLiteral, CharClassSearcher,        │
+│ Teddy, DigitPrefilter, AhoCorasick,        │
+│ CompositeSearcher, BranchDispatch, Both    │
+└────────────────────────────────────────────┘
 
 Input → Prefilter (SIMD) → Engine → Match Result
 ```
 
+> For detailed architecture documentation, see [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
+> For optimization details, see [docs/OPTIMIZATIONS.md](docs/OPTIMIZATIONS.md).
+
 **SIMD Primitives** (AMD64):
 - `memchr` — single byte search (AVX2)
 - `memmem` — substring search (SSSE3)

dfa/lazy/accel_test.go

Lines changed: 20 additions & 20 deletions
@@ -87,37 +87,37 @@ func TestDetectAcceleration(t *testing.T) {
 }
 
 func TestDetectAccelerationFromCached(t *testing.T) {
-	// Test the lazy detection that only uses cached transitions
+	// State no longer stores transitions — DetectAccelerationFromCached returns nil.
+	// Acceleration is now detected via DetectAccelerationFromFlat using flatTrans.
 	state := NewState(StateID(1), []nfa.StateID{0}, false)
-
-	// Initially no cached transitions - should return nil
 	exitBytes := DetectAccelerationFromCached(state)
 	if exitBytes != nil {
-		t.Errorf("Expected nil with no cached transitions, got %v", exitBytes)
+		t.Errorf("Expected nil (State has no transitions), got %v", exitBytes)
 	}
+}
+
+func TestDetectAccelerationFromFlat(t *testing.T) {
+	// Test acceleration detection via flat transition table
+	stride := 256
+	sid := StateID(1)
+	flatTrans := make([]StateID, 2*stride) // 2 states
 
-	// Add 250 self-loop transitions
+	// State 1: 250 self-loops, 3 exits to state 2, 3 dead
+	base := int(sid) * stride
 	for i := 0; i < 250; i++ {
-		state.AddTransition(byte(i), StateID(1)) // Self-loop
+		flatTrans[base+i] = sid // Self-loop
 	}
+	flatTrans[base+250] = StateID(2)
+	flatTrans[base+251] = StateID(2)
+	flatTrans[base+252] = StateID(2)
+	flatTrans[base+253] = DeadState
+	flatTrans[base+254] = DeadState
+	flatTrans[base+255] = DeadState
 
-	// Add 3 exit bytes
-	state.AddTransition(byte(250), StateID(2)) // Exit to state 2
-	state.AddTransition(byte(251), StateID(2)) // Exit to state 2
-	state.AddTransition(byte(252), StateID(2)) // Exit to state 2
-
-	// Add 3 dead transitions
-	state.AddTransition(byte(253), DeadState)
-	state.AddTransition(byte(254), DeadState)
-	state.AddTransition(byte(255), DeadState)
-
-	// Now should detect as accelerable
-	exitBytes = DetectAccelerationFromCached(state)
+	exitBytes := DetectAccelerationFromFlat(sid, flatTrans, stride, nil)
 	if len(exitBytes) != 3 {
 		t.Errorf("Expected 3 exit bytes, got %v", exitBytes)
 	}
-
-	// Verify the exit bytes are correct
 	expected := map[byte]bool{250: true, 251: true, 252: true}
 	for _, b := range exitBytes {
 		if !expected[b] {

dfa/lazy/anchored_search_prefilter_test.go

Lines changed: 10 additions & 53 deletions
@@ -73,82 +73,39 @@ func TestDetectAccelFromCachedWithClassesByteMapping(t *testing.T) {
 
 // TestDetectAccelFromCachedWithClassesNilClasses verifies the nil byteClasses fallback.
 func TestDetectAccelFromCachedWithClassesNilClasses(t *testing.T) {
-	// Create a state with known transitions (stride=256, no compression)
+	// State no longer stores transitions — DetectAccelerationFromCachedWithClasses returns nil.
+	// Use DetectAccelerationFromFlat for flat table detection.
 	state := NewState(StateID(1), []nfa.StateID{0}, false)
-
-	// Fill 253 self-loop transitions
-	for i := 0; i < 253; i++ {
-		state.AddTransition(byte(i), StateID(1))
-	}
-	// Add 3 exit transitions to a different state
-	state.AddTransition(253, StateID(2))
-	state.AddTransition(254, StateID(2))
-	state.AddTransition(255, StateID(2))
-
-	// nil byteClasses -> exit class indices ARE the bytes (identity)
 	result := DetectAccelerationFromCachedWithClasses(state, nil)
-	if len(result) != 3 {
-		t.Fatalf("expected 3 exit bytes with nil classes, got %v", result)
-	}
-	expected := map[byte]bool{253: true, 254: true, 255: true}
-	for _, b := range result {
-		if !expected[b] {
-			t.Errorf("unexpected exit byte %d", b)
-		}
+	if result != nil {
+		t.Errorf("expected nil (State has no transitions), got %v", result)
 	}
 }
 
-// TestDetectAccelFromCachedInsufficientTransitions tests that when too few
-// transitions are cached, acceleration detection returns nil.
+// TestDetectAccelFromCachedInsufficientTransitions tests that State-based detection returns nil.
 func TestDetectAccelFromCachedInsufficientTransitions(t *testing.T) {
 	state := NewState(StateID(1), []nfa.StateID{0}, false)
-	// Only add a few transitions (way below 94% threshold)
-	state.AddTransition(0, StateID(1))
-	state.AddTransition(1, StateID(2))
-
 	result := DetectAccelerationFromCachedWithClasses(state, nil)
 	if result != nil {
-		t.Errorf("expected nil for insufficient cached transitions, got %v", result)
+		t.Errorf("expected nil (State has no transitions), got %v", result)
 	}
 }
 
-// TestDetectAccelFromCachedTooManyExitClasses tests that >3 exit classes returns nil.
+// TestDetectAccelFromCachedTooManyExitClasses tests that State-based detection returns nil.
 func TestDetectAccelFromCachedTooManyExitClasses(t *testing.T) {
 	state := NewState(StateID(1), []nfa.StateID{0}, false)
-	// Fill 250 self-loops
-	for i := 0; i < 250; i++ {
-		state.AddTransition(byte(i), StateID(1))
-	}
-	// Add 4 distinct exit transitions (> 3 limit)
-	state.AddTransition(250, StateID(2))
-	state.AddTransition(251, StateID(3))
-	state.AddTransition(252, StateID(4))
-	state.AddTransition(253, StateID(5))
-	// Fill remaining with dead
-	state.AddTransition(254, DeadState)
-	state.AddTransition(255, DeadState)
-
 	result := DetectAccelerationFromCachedWithClasses(state, nil)
 	if result != nil {
-		t.Errorf("expected nil for >3 exit classes, got %v", result)
+		t.Errorf("expected nil (State has no transitions), got %v", result)
 	}
 }
 
-// TestDetectAccelFromCachedZeroExitClasses tests that 0 exit classes returns nil.
+// TestDetectAccelFromCachedZeroExitClasses tests that State-based detection returns nil.
 func TestDetectAccelFromCachedZeroExitClasses(t *testing.T) {
 	state := NewState(StateID(1), []nfa.StateID{0}, false)
-	// All transitions are self-loops or dead
-	for i := 0; i < 256; i++ {
-		if i < 200 {
-			state.AddTransition(byte(i), StateID(1)) // self-loop
-		} else {
-			state.AddTransition(byte(i), DeadState) // dead
-		}
-	}
-
 	result := DetectAccelerationFromCachedWithClasses(state, nil)
 	if result != nil {
-		t.Errorf("expected nil for 0 exit classes, got %v", result)
+		t.Errorf("expected nil (State has no transitions), got %v", result)
 	}
 }
