From ad666e9b979f9e22a8d5e588616c2585adb62c55 Mon Sep 17 00:00:00 2001
From: Diogo Martins <diogoalves@ua.pt>
Date: Thu, 4 Jun 2026 21:23:13 +0100
Subject: [PATCH 1/4] Fair managed multi-seg: reuse a buffer instead of the
 ToArray API

The managed multi-seg bench called TryExtractFullHeaderValidated, which does input.ToArray() per call (a heap allocation), while the FFI multi-seg bench reused a buffer (seq.CopyTo into a once-allocated array). That made the bindings look ~2x faster on multi-seg when the difference was allocation strategy, not parse speed. Now the managed multi-seg bench also linearizes into the reused buffer (seq.CopyTo, then TryExtractFullHeaderROM), so every multi-seg path is a reused-buffer linearization followed by a parse. Result: multi-seg = contiguous + a memcpy for every implementation, and the native-vs-managed gap matches the contiguous one. Managed multi-seg at 32 KB drops from 9262 ns to 5606 ns.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 bench/README.md                          | 26 ++++++------
 bindings/dotnet/Glyph11.Bench/Program.cs | 10 +++--
 site/data.json                           | 50 ++++++++++++------------
 site/index.html                          |  5 ++-
 4 files changed, 48 insertions(+), 43 deletions(-)
diff --git a/bench/README.md b/bench/README.md
index 4de69c7..dd5ebe2 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -33,19 +33,21 @@ benchmarks page.
 
 | Payload | C# Ultra | Pure C  | C# (FFI) | Kotlin (FFI) |
 |---------|---------:|--------:|---------:|-------------:|
-| ~95 B   | 118 ns   | 98 ns   | 97 ns    | 102 ns |
-| 4 KB    | 730 ns   | 512 ns  | 556 ns   | 574 ns |
-| 32 KB   | 5028 ns  | 3784 ns | 4254 ns  | 4167 ns |
+| ~95 B   | 114 ns   | 95 ns   | 95 ns    | 100 ns |
+| 4 KB    | 710 ns   | 517 ns  | 548 ns   | 556 ns |
+| 32 KB   | 5180 ns  | 3767 ns | 4120 ns  | 4134 ns |
 
-**Multi-segment** (3 segments):
+**Multi-segment** (3 segments — each linearized into a *reused* buffer, then parsed):
 
 | Payload | C# Ultra | Pure C  | C# (FFI) | Kotlin (FFI) |
 |---------|---------:|--------:|---------:|-------------:|
-| ~95 B   | 257 ns   | 101 ns  | 106 ns   | 111 ns |
-| 4 KB    | 1363 ns  | 545 ns  | 587 ns   | 603 ns |
-| 32 KB   | 9262 ns  | 4256 ns | 4624 ns  | 4658 ns |
-
-The FFI bindings track the pure-C floor (`[SuppressGCTransition]` for .NET,
-reused off-heap buffers for Kotlin). Native multi-segment = contiguous + a
-`memcpy`, so it stays close to contiguous and ~2× faster than the managed
-multi-segment path (which allocates per call). Numbers vary run-to-run.
+| ~95 B   | 125 ns   | 99 ns   | 106 ns   | 110 ns |
+| 4 KB    | 751 ns   | 546 ns  | 601 ns   | 585 ns |
+| 32 KB   | 5606 ns  | 4222 ns | 4521 ns  | 4617 ns |
+
+Every parser linearizes the segments into a **reused buffer**, so multi-segment =
+contiguous + a `memcpy` for all of them and the native-vs-managed gap stays the
+same as contiguous — it's the parse engine, not the allocation. (The managed
+`TryExtractFullHeaderValidated` *convenience* API would `input.ToArray()` instead,
+a per-call allocation that makes it ~1.6× slower at 32 KB; the bench linearizes
+manually so the comparison is apples-to-apples.) Numbers vary run-to-run.
diff --git a/bindings/dotnet/Glyph11.Bench/Program.cs b/bindings/dotnet/Glyph11.Bench/Program.cs
index 994edd7..27f96cb 100644
--- a/bindings/dotnet/Glyph11.Bench/Program.cs
+++ b/bindings/dotnet/Glyph11.Bench/Program.cs
@@ -117,21 +117,23 @@ public static void Run(string dir)
             var data = File.ReadAllBytes(Path.Combine(dir, file));
             var rom = (ReadOnlyMemory<byte>)data;
             var seq = ThreeSegments(data);
+            var lin = new byte[data.Length]; // reused linearization buffer (no per-call allocation)
 
             // managed — ROM (single contiguous buffer)
             double mRom = Best(iters, () => { req.Clear(); var r = rom; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); });
             Console.WriteLine($"dotnet-managed-rom,{name},{mRom:F1}");
 
-            // managed — multi-segment (3 segments, linearized internally)
-            double mSeg = Best(iters, () => { req.Clear(); var s = seq; UltraHardenedParser.TryExtractFullHeaderValidated(ref s, req, in ManagedLimits, out _); });
+            // managed — multi-segment: linearize into the reused buffer, then ROM-parse, so it is
+            // apples-to-apples with the native multi-seg paths. (The convenience API
+            // TryExtractFullHeaderValidated would input.ToArray() here — a per-call allocation.)
+            double mSeg = Best(iters, () => { req.Clear(); seq.CopyTo(lin); ReadOnlyMemory<byte> r = lin; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); });
             Console.WriteLine($"dotnet-managed-multiseg,{name},{mSeg:F1}");
 
             // native binding (FFI) — contiguous
             double ffi = Best(iters, () => Glyph11Parser.Parse(data, h, q, NativeLimits, out _));
             Console.WriteLine($"dotnet-ffi,{name},{ffi:F1}");
 
-            // native binding (FFI) — multi-segment: linearize into a reused buffer, then parse
-            var lin = new byte[data.Length];
+            // native binding (FFI) — multi-segment: same reused-buffer linearization, then parse
             double ffiSeg = Best(iters, () => { seq.CopyTo(lin); Glyph11Parser.Parse(lin, h, q, NativeLimits, out _); });
             Console.WriteLine($"dotnet-ffi-multiseg,{name},{ffiSeg:F1}");
         }
diff --git a/site/data.json b/site/data.json
index 9625c84..b6d0a6c 100644
--- a/site/data.json
+++ b/site/data.json
@@ -1,6 +1,6 @@
 {
   "unit": "ns/op",
-  "generated": "2026-06-04 19:51 UTC",
+  "generated": "2026-06-04 20:22 UTC",
   "langs": [
     {
       "key": "dotnet-managed-rom",
@@ -39,38 +39,38 @@
     {
       "payload": "small",
       "label": "~95 B",
-      "dotnet-managed-rom": 118.0,
-      "dotnet-managed-multiseg": 256.6,
-      "pure-c": 97.8,
-      "pure-c-multiseg": 101.4,
-      "dotnet-ffi": 97.2,
-      "dotnet-ffi-multiseg": 106.3,
-      "kotlin-ffi": 102.2,
-      "kotlin-ffi-multiseg": 110.8
+      "dotnet-managed-rom": 113.9,
+      "dotnet-managed-multiseg": 125.3,
+      "pure-c": 94.8,
+      "pure-c-multiseg": 99.0,
+      "dotnet-ffi": 95.3,
+      "dotnet-ffi-multiseg": 106.4,
+      "kotlin-ffi": 100.1,
+      "kotlin-ffi-multiseg": 109.5
     },
     {
       "payload": "4k",
       "label": "4 KB",
-      "dotnet-managed-rom": 730.2,
-      "dotnet-managed-multiseg": 1362.7,
-      "pure-c": 512.4,
-      "pure-c-multiseg": 545.4,
-      "dotnet-ffi": 555.5,
-      "dotnet-ffi-multiseg": 586.7,
-      "kotlin-ffi": 574.0,
-      "kotlin-ffi-multiseg": 602.7
+      "dotnet-managed-rom": 710.2,
+      "dotnet-managed-multiseg": 750.8,
+      "pure-c": 516.8,
+      "pure-c-multiseg": 545.8,
+      "dotnet-ffi": 548.5,
+      "dotnet-ffi-multiseg": 601.3,
+      "kotlin-ffi": 555.6,
+      "kotlin-ffi-multiseg": 585.0
     },
     {
       "payload": "32k",
       "label": "32 KB",
-      "dotnet-managed-rom": 5028.1,
-      "dotnet-managed-multiseg": 9261.7,
-      "pure-c": 3784.2,
-      "pure-c-multiseg": 4256.1,
-      "dotnet-ffi": 4254.5,
-      "dotnet-ffi-multiseg": 4624.2,
-      "kotlin-ffi": 4166.6,
-      "kotlin-ffi-multiseg": 4657.8
+      "dotnet-managed-rom": 5180.3,
+      "dotnet-managed-multiseg": 5605.8,
+      "pure-c": 3767.3,
+      "pure-c-multiseg": 4222.5,
+      "dotnet-ffi": 4120.5,
+      "dotnet-ffi-multiseg": 4520.9,
+      "kotlin-ffi": 4134.3,
+      "kotlin-ffi-multiseg": 4617.3
     }
   ]
 }
\ No newline at end of file
diff --git a/site/index.html b/site/index.html
index 049afb4..c1a6271 100644
--- a/site/index.html
+++ b/site/index.html
@@ -50,8 +50,9 @@ <h2>Benchmarks</h2>
     payloads, in <b>contiguous</b> and <b>multi-segment</b> modes. <b>C# Ultra</b> is the standalone
     managed parser; <b>Pure C</b> is the native core; the bindings reach that core from .NET and the
     JVM. The bindings call the same C, so the native columns are within measurement noise of each
-    other — the meaningful gap is native vs. managed. Native multi-segment is contiguous + a memcpy
-    (no per-call allocation), unlike the managed multi-segment path.
+    other — the meaningful gap is native vs. managed. Multi-segment linearizes the 3 segments into a
+    reused buffer (a memcpy, no per-call allocation) for every implementation, so that gap stays the
+    parse engine, not the allocation.
     <br><a href="https://github.com/dotnet-web-stack/Glyph11">github.com/dotnet-web-stack/Glyph11</a>
   </footer>
 

From 69bebddec303cb797ef9dc6f5ba135f96dad2009 Mon Sep 17 00:00:00 2001
From: Diogo Martins <diogoalves@ua.pt>
Date: Thu, 4 Jun 2026 21:51:22 +0100
Subject: [PATCH 2/4] Multi-seg: count the managed library's real linearization
 (ToArray)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverts the earlier 'fair' swap. Since the C core is single-slab, the binding must linearize, so that copy is the binding's real cost (and it is counted: a CopyTo into a reused buffer). By the same logic the managed column must show ITS real linearization — TryExtractFullHeaderValidated, which calls input.ToArray() on every request — not a hand-rolled reused buffer. So multi-seg now reflects what each path actually does: managed allocates the linearization buffer per request (~9200 ns @ 32KB), the bindings reuse one (~4600-4800 ns). The copy is in both; the ~2x gap is the per-request allocation the single-slab binding avoids (a managed caller can match it by hand-rolling CopyTo+ROM).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 bench/README.md                          | 30 +++++++-------
 bindings/dotnet/Glyph11.Bench/Program.cs |  7 ++--
 site/data.json                           | 50 ++++++++++++------------
 site/index.html                          |  9 +++--
 4 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/bench/README.md b/bench/README.md
index dd5ebe2..7cb741f 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -33,21 +33,23 @@ benchmarks page.
 
 | Payload | C# Ultra | Pure C  | C# (FFI) | Kotlin (FFI) |
 |---------|---------:|--------:|---------:|-------------:|
-| ~95 B   | 114 ns   | 95 ns   | 95 ns    | 100 ns |
-| 4 KB    | 710 ns   | 517 ns  | 548 ns   | 556 ns |
-| 32 KB   | 5180 ns  | 3767 ns | 4120 ns  | 4134 ns |
+| ~95 B   | 116 ns   | 95 ns   | 96 ns    | 101 ns |
+| 4 KB    | 728 ns   | 529 ns  | 554 ns   | 585 ns |
+| 32 KB   | 5067 ns  | 3852 ns | 4203 ns  | 4226 ns |
 
-**Multi-segment** (3 segments — each linearized into a *reused* buffer, then parsed):
+**Multi-segment** (3 segments — linearization always counted):
 
 | Payload | C# Ultra | Pure C  | C# (FFI) | Kotlin (FFI) |
 |---------|---------:|--------:|---------:|-------------:|
-| ~95 B   | 125 ns   | 99 ns   | 106 ns   | 110 ns |
-| 4 KB    | 751 ns   | 546 ns  | 601 ns   | 585 ns |
-| 32 KB   | 5606 ns  | 4222 ns | 4521 ns  | 4617 ns |
-
-Every parser linearizes the segments into a **reused buffer**, so multi-segment =
-contiguous + a `memcpy` for all of them and the native-vs-managed gap stays the
-same as contiguous — it's the parse engine, not the allocation. (The managed
-`TryExtractFullHeaderValidated` *convenience* API would `input.ToArray()` instead,
-a per-call allocation that makes it ~1.6× slower at 32 KB; the bench linearizes
-manually so the comparison is apples-to-apples.) Numbers vary run-to-run.
+| ~95 B   | 252 ns   | 100 ns  | 107 ns   | 112 ns |
+| 4 KB    | 1346 ns  | 560 ns  | 600 ns   | 602 ns |
+| 32 KB   | 9202 ns  | 4444 ns | 4634 ns  | 4773 ns |
+
+Multi-segment input **must** be linearized into a contiguous buffer first — that
+copy is in every number above. The managed column is the library's real path,
+`TryExtractFullHeaderValidated`, which linearizes via `input.ToArray()` (a fresh
+allocation every request). The single-slab native core lets the bindings linearize
+into a **reused** scratch buffer, avoiding that per-request allocation — ~2× faster
+at 32 KB. That gap is a usage advantage, not a parser difference: a managed caller
+can match it by hand-rolling `CopyTo` + ROM (≈ contiguous + a memcpy). Numbers vary
+run-to-run.
diff --git a/bindings/dotnet/Glyph11.Bench/Program.cs b/bindings/dotnet/Glyph11.Bench/Program.cs
index 27f96cb..1d60c1e 100644
--- a/bindings/dotnet/Glyph11.Bench/Program.cs
+++ b/bindings/dotnet/Glyph11.Bench/Program.cs
@@ -123,10 +123,9 @@ public static void Run(string dir)
             double mRom = Best(iters, () => { req.Clear(); var r = rom; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); });
             Console.WriteLine($"dotnet-managed-rom,{name},{mRom:F1}");
 
-            // managed — multi-segment: linearize into the reused buffer, then ROM-parse, so it is
-            // apples-to-apples with the native multi-seg paths. (The convenience API
-            // TryExtractFullHeaderValidated would input.ToArray() here — a per-call allocation.)
-            double mSeg = Best(iters, () => { req.Clear(); seq.CopyTo(lin); ReadOnlyMemory<byte> r = lin; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); });
+            // managed — multi-segment: the library's real path. TryExtractFullHeaderValidated
+            // linearizes via input.ToArray() (a fresh allocation every request), then ROM-parses.
+            double mSeg = Best(iters, () => { req.Clear(); var s = seq; UltraHardenedParser.TryExtractFullHeaderValidated(ref s, req, in ManagedLimits, out _); });
             Console.WriteLine($"dotnet-managed-multiseg,{name},{mSeg:F1}");
 
             // native binding (FFI) — contiguous
diff --git a/site/data.json b/site/data.json
index b6d0a6c..a1e9aec 100644
--- a/site/data.json
+++ b/site/data.json
@@ -1,6 +1,6 @@
 {
   "unit": "ns/op",
-  "generated": "2026-06-04 20:22 UTC",
+  "generated": "2026-06-04 20:50 UTC",
   "langs": [
     {
       "key": "dotnet-managed-rom",
@@ -39,38 +39,38 @@
     {
       "payload": "small",
       "label": "~95 B",
-      "dotnet-managed-rom": 113.9,
-      "dotnet-managed-multiseg": 125.3,
-      "pure-c": 94.8,
-      "pure-c-multiseg": 99.0,
-      "dotnet-ffi": 95.3,
-      "dotnet-ffi-multiseg": 106.4,
-      "kotlin-ffi": 100.1,
-      "kotlin-ffi-multiseg": 109.5
+      "dotnet-managed-rom": 116.3,
+      "dotnet-managed-multiseg": 251.8,
+      "pure-c": 95.2,
+      "pure-c-multiseg": 99.8,
+      "dotnet-ffi": 96.2,
+      "dotnet-ffi-multiseg": 107.2,
+      "kotlin-ffi": 100.8,
+      "kotlin-ffi-multiseg": 111.7
     },
     {
       "payload": "4k",
       "label": "4 KB",
-      "dotnet-managed-rom": 710.2,
-      "dotnet-managed-multiseg": 750.8,
-      "pure-c": 516.8,
-      "pure-c-multiseg": 545.8,
-      "dotnet-ffi": 548.5,
-      "dotnet-ffi-multiseg": 601.3,
-      "kotlin-ffi": 555.6,
-      "kotlin-ffi-multiseg": 585.0
+      "dotnet-managed-rom": 728.1,
+      "dotnet-managed-multiseg": 1346.5,
+      "pure-c": 529.3,
+      "pure-c-multiseg": 560.3,
+      "dotnet-ffi": 553.8,
+      "dotnet-ffi-multiseg": 600.4,
+      "kotlin-ffi": 585.2,
+      "kotlin-ffi-multiseg": 602.0
     },
     {
       "payload": "32k",
       "label": "32 KB",
-      "dotnet-managed-rom": 5180.3,
-      "dotnet-managed-multiseg": 5605.8,
-      "pure-c": 3767.3,
-      "pure-c-multiseg": 4222.5,
-      "dotnet-ffi": 4120.5,
-      "dotnet-ffi-multiseg": 4520.9,
-      "kotlin-ffi": 4134.3,
-      "kotlin-ffi-multiseg": 4617.3
+      "dotnet-managed-rom": 5067.2,
+      "dotnet-managed-multiseg": 9202.0,
+      "pure-c": 3851.5,
+      "pure-c-multiseg": 4443.6,
+      "dotnet-ffi": 4203.1,
+      "dotnet-ffi-multiseg": 4634.1,
+      "kotlin-ffi": 4226.2,
+      "kotlin-ffi-multiseg": 4772.9
     }
   ]
 }
\ No newline at end of file
diff --git a/site/index.html b/site/index.html
index c1a6271..9d358da 100644
--- a/site/index.html
+++ b/site/index.html
@@ -50,9 +50,12 @@ <h2>Benchmarks</h2>
     payloads, in <b>contiguous</b> and <b>multi-segment</b> modes. <b>C# Ultra</b> is the standalone
     managed parser; <b>Pure C</b> is the native core; the bindings reach that core from .NET and the
     JVM. The bindings call the same C, so the native columns are within measurement noise of each
-    other — the meaningful gap is native vs. managed. Multi-segment linearizes the 3 segments into a
-    reused buffer (a memcpy, no per-call allocation) for every implementation, so that gap stays the
-    parse engine, not the allocation.
+    other — the meaningful gap is native vs. managed. Multi-segment must be linearized into a
+    contiguous buffer first, and that copy is counted in every multi-seg number: the managed parser
+    does it via <code>TryExtractFullHeaderValidated</code> (a per-request <code>ToArray</code>
+    allocation), while the single-slab native core lets the bindings linearize into a reused scratch
+    buffer — so the multi-seg gap reflects that allocation, which a managed caller can also avoid by
+    hand-rolling <code>CopyTo</code>&nbsp;+&nbsp;ROM.
     <br><a href="https://github.com/dotnet-web-stack/Glyph11">github.com/dotnet-web-stack/Glyph11</a>
   </footer>
 

From e51f5ddd0062d27904602ad83237d46137eb98d6 Mon Sep 17 00:00:00 2001
From: Diogo Martins <diogoalves@ua.pt>
Date: Thu, 4 Jun 2026 22:01:00 +0100
Subject: [PATCH 3/4] Multi-seg: linearize identically for all parsers (compare
 parser, not buffer strategy)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The C# FFI does linearize on multi-segment — both paths do. The bug was that the managed column used the one-shot API (ToArray) while the FFI used a reused buffer, smuggling the linearization strategy into a parser comparison. Now every multi-seg path does CopyTo/memcpy into a reused buffer + parse, so the copy is counted identically and the column reflects the parser: multi-seg = contiguous + a memcpy for all, native ~1.2x ahead in both modes. The TryExtractFullHeaderValidated ToArray-per-request cost (~9.2us vs ~5.4us @ 32KB) is now a footnote, not a confound.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 bench/README.md                          | 37 ++++++++++--------
 bindings/dotnet/Glyph11.Bench/Program.cs |  8 ++--
 site/data.json                           | 50 ++++++++++++------------
 site/index.html                          | 10 ++---
 4 files changed, 56 insertions(+), 49 deletions(-)

diff --git a/bench/README.md b/bench/README.md
index 7cb741f..3a2b84d 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -33,23 +33,28 @@ benchmarks page.
 
 | Payload | C# Ultra | Pure C  | C# (FFI) | Kotlin (FFI) |
 |---------|---------:|--------:|---------:|-------------:|
-| ~95 B   | 116 ns   | 95 ns   | 96 ns    | 101 ns |
-| 4 KB    | 728 ns   | 529 ns  | 554 ns   | 585 ns |
-| 32 KB   | 5067 ns  | 3852 ns | 4203 ns  | 4226 ns |
+| ~95 B   | 118 ns   | 97 ns   | 98 ns    | 100 ns |
+| 4 KB    | 727 ns   | 522 ns  | 562 ns   | 562 ns |
+| 32 KB   | 5039 ns  | 3906 ns | 4122 ns  | 4182 ns |
 
-**Multi-segment** (3 segments — linearization always counted):
+**Multi-segment** (3 segments — every parser linearizes into a reused buffer, copy counted):
 
 | Payload | C# Ultra | Pure C  | C# (FFI) | Kotlin (FFI) |
 |---------|---------:|--------:|---------:|-------------:|
-| ~95 B   | 252 ns   | 100 ns  | 107 ns   | 112 ns |
-| 4 KB    | 1346 ns  | 560 ns  | 600 ns   | 602 ns |
-| 32 KB   | 9202 ns  | 4444 ns | 4634 ns  | 4773 ns |
-
-Multi-segment input **must** be linearized into a contiguous buffer first — that
-copy is in every number above. The managed column is the library's real path,
-`TryExtractFullHeaderValidated`, which linearizes via `input.ToArray()` (a fresh
-allocation every request). The single-slab native core lets the bindings linearize
-into a **reused** scratch buffer, avoiding that per-request allocation — ~2× faster
-at 32 KB. That gap is a usage advantage, not a parser difference: a managed caller
-can match it by hand-rolling `CopyTo` + ROM (≈ contiguous + a memcpy). Numbers vary
-run-to-run.
+| ~95 B   | 130 ns   | 102 ns  | 110 ns   | 120 ns |
+| 4 KB    | 753 ns   | 553 ns  | 612 ns   | 592 ns |
+| 32 KB   | 5406 ns  | 4324 ns | 4567 ns  | 4795 ns |
+
+Multi-segment input must be linearized into a contiguous buffer first — that
+per-request copy is counted in every number above. To compare the **parsers** (not
+buffer strategy), every path linearizes the same way — `CopyTo`/`memcpy` into a
+**reused** scratch buffer, then parse — so multi-segment = contiguous + a `memcpy`
+for all of them, and native stays ~1.2× ahead in both modes (the parse engine).
+
+> The managed one-shot API `TryExtractFullHeaderValidated` instead allocates that
+> buffer via `input.ToArray()` **every request** — ~9.2 µs vs ~5.4 µs at 32 KB. For a
+> multi-segment hot path, hand-roll `CopyTo` + `TryExtractFullHeaderROM` (or, for the
+> binding, linearize into a reused buffer before the native call). It's an API cost,
+> not a parser difference — hence a note, not the comparison.
+
+Numbers vary run-to-run.
diff --git a/bindings/dotnet/Glyph11.Bench/Program.cs b/bindings/dotnet/Glyph11.Bench/Program.cs
index 1d60c1e..367359b 100644
--- a/bindings/dotnet/Glyph11.Bench/Program.cs
+++ b/bindings/dotnet/Glyph11.Bench/Program.cs
@@ -123,9 +123,11 @@ public static void Run(string dir)
             double mRom = Best(iters, () => { req.Clear(); var r = rom; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); });
             Console.WriteLine($"dotnet-managed-rom,{name},{mRom:F1}");
 
-            // managed — multi-segment: the library's real path. TryExtractFullHeaderValidated
-            // linearizes via input.ToArray() (a fresh allocation every request), then ROM-parses.
-            double mSeg = Best(iters, () => { req.Clear(); var s = seq; UltraHardenedParser.TryExtractFullHeaderValidated(ref s, req, in ManagedLimits, out _); });
+            // managed — multi-segment: linearize into the SAME reused buffer as the native paths,
+            // then ROM-parse, so the column compares the parser, not the linearization strategy.
+            // (The one-shot API TryExtractFullHeaderValidated would input.ToArray() instead — an
+            // allocation per request; that's an API cost, noted on the page/README, not here.)
+            double mSeg = Best(iters, () => { req.Clear(); seq.CopyTo(lin); ReadOnlyMemory<byte> r = lin; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); });
             Console.WriteLine($"dotnet-managed-multiseg,{name},{mSeg:F1}");
 
             // native binding (FFI) — contiguous
diff --git a/site/data.json b/site/data.json
index a1e9aec..be91209 100644
--- a/site/data.json
+++ b/site/data.json
@@ -1,6 +1,6 @@
 {
   "unit": "ns/op",
-  "generated": "2026-06-04 20:50 UTC",
+  "generated": "2026-06-04 21:00 UTC",
   "langs": [
     {
       "key": "dotnet-managed-rom",
@@ -39,38 +39,38 @@
     {
       "payload": "small",
       "label": "~95 B",
-      "dotnet-managed-rom": 116.3,
-      "dotnet-managed-multiseg": 251.8,
-      "pure-c": 95.2,
-      "pure-c-multiseg": 99.8,
-      "dotnet-ffi": 96.2,
-      "dotnet-ffi-multiseg": 107.2,
-      "kotlin-ffi": 100.8,
-      "kotlin-ffi-multiseg": 111.7
+      "dotnet-managed-rom": 118.2,
+      "dotnet-managed-multiseg": 130.2,
+      "pure-c": 97.1,
+      "pure-c-multiseg": 102.3,
+      "dotnet-ffi": 97.8,
+      "dotnet-ffi-multiseg": 109.6,
+      "kotlin-ffi": 100.2,
+      "kotlin-ffi-multiseg": 120.5
     },
     {
       "payload": "4k",
       "label": "4 KB",
-      "dotnet-managed-rom": 728.1,
-      "dotnet-managed-multiseg": 1346.5,
-      "pure-c": 529.3,
-      "pure-c-multiseg": 560.3,
-      "dotnet-ffi": 553.8,
-      "dotnet-ffi-multiseg": 600.4,
-      "kotlin-ffi": 585.2,
-      "kotlin-ffi-multiseg": 602.0
+      "dotnet-managed-rom": 727.2,
+      "dotnet-managed-multiseg": 753.2,
+      "pure-c": 521.5,
+      "pure-c-multiseg": 553.3,
+      "dotnet-ffi": 562.0,
+      "dotnet-ffi-multiseg": 612.4,
+      "kotlin-ffi": 561.5,
+      "kotlin-ffi-multiseg": 591.5
     },
     {
       "payload": "32k",
       "label": "32 KB",
-      "dotnet-managed-rom": 5067.2,
-      "dotnet-managed-multiseg": 9202.0,
-      "pure-c": 3851.5,
-      "pure-c-multiseg": 4443.6,
-      "dotnet-ffi": 4203.1,
-      "dotnet-ffi-multiseg": 4634.1,
-      "kotlin-ffi": 4226.2,
-      "kotlin-ffi-multiseg": 4772.9
+      "dotnet-managed-rom": 5039.4,
+      "dotnet-managed-multiseg": 5406.0,
+      "pure-c": 3906.2,
+      "pure-c-multiseg": 4324.2,
+      "dotnet-ffi": 4121.6,
+      "dotnet-ffi-multiseg": 4567.0,
+      "kotlin-ffi": 4182.1,
+      "kotlin-ffi-multiseg": 4795.3
     }
   ]
 }
\ No newline at end of file
diff --git a/site/index.html b/site/index.html
index 9d358da..89904f2 100644
--- a/site/index.html
+++ b/site/index.html
@@ -51,11 +51,11 @@ <h2>Benchmarks</h2>
     managed parser; <b>Pure C</b> is the native core; the bindings reach that core from .NET and the
     JVM. The bindings call the same C, so the native columns are within measurement noise of each
     other — the meaningful gap is native vs. managed. Multi-segment must be linearized into a
-    contiguous buffer first, and that copy is counted in every multi-seg number: the managed parser
-    does it via <code>TryExtractFullHeaderValidated</code> (a per-request <code>ToArray</code>
-    allocation), while the single-slab native core lets the bindings linearize into a reused scratch
-    buffer — so the multi-seg gap reflects that allocation, which a managed caller can also avoid by
-    hand-rolling <code>CopyTo</code>&nbsp;+&nbsp;ROM.
+    contiguous buffer first; that per-request copy is counted in every multi-seg number, using a
+    reused scratch buffer for <em>every</em> parser so the column compares the parser, not the copy.
+    (The managed one-shot API <code>TryExtractFullHeaderValidated</code> instead allocates that
+    buffer via <code>ToArray</code> each request — an API cost you avoid by hand-rolling
+    <code>CopyTo</code>&nbsp;+&nbsp;ROM.)
     <br><a href="https://github.com/dotnet-web-stack/Glyph11">github.com/dotnet-web-stack/Glyph11</a>
   </footer>
 

From 62d6f9f1115cfb036f4856bf6f98f148c7478c43 Mon Sep 17 00:00:00 2001
From: Diogo Martins <diogoalves@ua.pt>
Date: Thu, 4 Jun 2026 22:37:54 +0100
Subject: [PATCH 4/4] README: rewrite around usage examples for the lib + both
 bindings

Replaces the prose README with verified, copy-pasteable request-header-parsing examples for the C# library (UltraHardenedParser), the .NET binding (Glyph11Parser, zero-alloc caller storage), and the Kotlin binding (Glyph11.parse). To make the Kotlin example real, the binding now surfaces parsed headers/query as List<Glyph11Field> (name/value spans) instead of only a count. All three examples were compiled/run against the real libraries; Kotlin smoke now also asserts a header name/value.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                                     | 149 ++++++++++--------
 .../src/main/kotlin/io/glyph11/Glyph11.kt     |  21 ++-
 .../kotlin/src/main/kotlin/io/glyph11/Main.kt |   1 +
 3 files changed, 103 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index b601c99..689422f 100644
--- a/README.md
+++ b/README.md
@@ -1,100 +1,121 @@
 # Glyph11
 
-Glyph11 is a dependency free, low allocation HTTP/1.1 parser for C#. It does not rely on any specific network technology but can be used with any (such as `Socket`, `NetworkStream`, `PipeReader` or anything else).
+A zero-allocation, hardened HTTP/1.1 request parser — a pure-C# library and a C core
+(`libglyph11`) reachable from .NET and the JVM. RFC 9110/9112 validation, configurable
+resource limits, and request-smuggling / semantic checks fused into a single zero-copy pass.
 
-![.NET](https://img.shields.io/badge/.NET-8.0%20%7C%209.0%20%7C%2010.0-512bd4)
 [![NuGet](https://img.shields.io/nuget/v/Glyph11.svg)](https://www.nuget.org/packages/Glyph11/)
+![.NET](https://img.shields.io/badge/.NET-8.0%20%7C%209.0%20%7C%2010.0-512bd4)
 [![Benchmarks](https://img.shields.io/badge/benchmarks-live-blue)](https://dotnet-web-stack.github.io/Glyph11/)
-[![Coverage](https://img.shields.io/sonar/coverage/MDA2AV_Glyph11?server=https%3A%2F%2Fsonarcloud.io)](https://sonarcloud.io/summary/new_code?id=MDA2AV_Glyph11)
-[![Quality Gate](https://sonarcloud.io/api/project_badges/measure?project=MDA2AV_Glyph11&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=MDA2AV_Glyph11)
 
-## Usage
+Three ways to use the same hardened parser:
+
+| | What | Header storage |
+|---|---|---|
+| **C# library** | pure managed `UltraHardenedParser` | pooled, internal |
+| **.NET binding** | the C core via P/Invoke | caller-provided (zero-alloc) |
+| **Kotlin binding** | the C core via Panama FFM | per-call, returned as a list |
 
-Glyph11 works with any source that produces a `ReadOnlySequence<byte>` or `ReadOnlyMemory<byte>` — `PipeReader`, `Socket`, `NetworkStream`, or raw byte arrays.
+## C# library (managed)
 
 ```csharp
 using System.Buffers;
+using System.Text;
 using Glyph11.Protocol;
 using Glyph11.Parser;
 using Glyph11.Parser.UltraHardened;
 
 var request = new BinaryRequest();
-var limits = ParserLimits.Default;
+var limits  = ParserLimits.Default;
 
-ReadOnlySequence<byte> buffer = ...; // from any network source
+// From any source that yields a ReadOnlySequence<byte> (PipeReader, Socket, NetworkStream, …)
+ReadOnlySequence<byte> buffer = new(Encoding.ASCII.GetBytes(
+    "GET /api/users?page=1 HTTP/1.1\r\nHost: example.com\r\nAccept: */*\r\n\r\n"));
 
-// UltraHardenedParser fuses structural parsing, resource limits, and every
-// semantic check (smuggling, traversal, Host rules, ...) into one pass.
-// It throws HttpParseException on any protocol or semantic violation.
 if (UltraHardenedParser.TryExtractFullHeaderValidated(ref buffer, request, in limits, out int bytesRead))
 {
-    // All parsed fields are zero-copy slices into the original buffer:
-    // request.Method.Span  → e.g. "GET"
-    // request.Path.Span    → e.g. "/api/users"
-    // request.Version.Span → e.g. "HTTP/1.1"
-    // request.Headers      → KeyValueList of name/value pairs
-    // request.QueryParameters → KeyValueList of query params
-
-    // The request is fully validated — safe to process.
-    // Then advance your reader by bytesRead.
-
-    // Reuse between requests — clear instead of reallocating:
-    request.Headers.Clear();
-    request.QueryParameters.Clear();
-}
-```
-
-Glyph11 plugs into a `PipeReader` loop: read a buffer, call `TryExtractFullHeaderValidated`, advance the reader by `bytesRead`, and repeat.
-
-## Parsers
-
-Glyph11 ships two parsers:
-
-- **`UltraHardenedParser`** — RFC 9110/9112 compliant with full validation, configurable resource limits, and every smuggling/semantic check fused into the parse pass. Recommended for internet-facing applications.
-- **`FlexibleParser`** — Minimal validation for maximum throughput. Suitable for trusted environments where input is pre-validated.
-
-## Performance
+    Console.WriteLine(Encoding.ASCII.GetString(request.Method.Span)); // GET
+    Console.WriteLine(Encoding.ASCII.GetString(request.Path.Span));   // /api/users
 
-- **ROM path is zero-allocation** — no GC pressure regardless of request size
-- **SIMD-accelerated validation** keeps the `UltraHardenedParser` within a small constant factor of the unvalidated `FlexibleParser`
-- **Multi-segment linearization** provides ROM-speed parsing with a single upfront allocation
+    for (int i = 0; i < request.Headers.Count; i++)
+    {
+        var (name, value) = request.Headers[i];                       // zero-copy slices
+        Console.WriteLine($"{Encoding.ASCII.GetString(name.Span)}: {Encoding.ASCII.GetString(value.Span)}");
+    }
 
-See the [live benchmarks](https://dotnet-web-stack.github.io/Glyph11/) — the managed parser vs. the C core and its .NET (P/Invoke) and JVM (Panama FFM) bindings, contiguous and multi-segment.
+    // advance your reader by bytesRead; reuse `request` across calls (request.Clear()).
+}
+// throws HttpParseException on a protocol/semantic violation; returns false if incomplete.
+```
 
-## CI Workflows
+`TryExtractFullHeaderROM(ref ReadOnlyMemory<byte>, …)` is the single-buffer (contiguous) fast path.
+`FlexibleParser` is a minimal-validation variant for trusted, pre-validated input.
 
-### Benchmarks
+## .NET binding (C core via P/Invoke)
 
-The **Benchmark** workflow (`.github/workflows/benchmark.yml`) measures parser throughput and allocation using BenchmarkDotNet.
+Calls `libglyph11` directly — same validation, native speed, **zero allocation** (you provide the
+header/query storage).
 
-| Trigger | Job | What it does |
-|---------|-----|--------------|
-| `pull_request` | **Parser Benchmarks** | Runs `FlexibleParserBenchmark` and `UltraHardenedParserBenchmark`, compares against the baseline on `gh-pages`, and posts a comment on the PR. Fails if any metric regresses by more than 15%. |
-| `workflow_dispatch` | **Full Benchmarks** | Runs all benchmarks (parsers + `AllSemanticChecksBenchmark`) and updates the baseline on `gh-pages`. |
+```csharp
+using System.Text;
+using Glyph11.Native;
 
-**Data flow:** benchmark results are stored as `benchmarks/data.js` on the `gh-pages` branch.
+byte[] request = Encoding.ASCII.GetBytes(
+    "GET /api/users?page=1 HTTP/1.1\r\nHost: example.com\r\nAccept: */*\r\n\r\n");
 
-> The cross-language comparison on the [live site](https://dotnet-web-stack.github.io/Glyph11/) is produced separately by the **Cross-Language Benchmark** workflow (`.github/workflows/cross-bench.yml`), which benchmarks the C core, both bindings, and the managed parser, then publishes `benchmarks/cross-lang.json` to `gh-pages`.
+Span<Glyph11Field> headers = stackalloc Glyph11Field[64];
+Span<Glyph11Field> query   = stackalloc Glyph11Field[32];
 
-To publish updated benchmark data:
+int status = Glyph11Parser.Parse(request, headers, query, Glyph11Limits.Default, out var r);
+if (status == Glyph11Parser.Ok)
+{
+    string Slice(Glyph11Span s) => Encoding.ASCII.GetString(request, (int)s.Offset, (int)s.Length);
 
-1. Merge your changes to `main`.
-2. Go to **Actions > Benchmark > Run workflow** on `main`.
+    Console.WriteLine(Slice(r.Method)); // GET
+    Console.WriteLine(Slice(r.Path));   // /api/users
 
-### Compliance Probe
+    for (int i = 0; i < r.HeaderCount; i++)
+        Console.WriteLine($"{Slice(headers[i].Name)}: {Slice(headers[i].Value)}");
+}
+// status: 0 = OK, 1 = incomplete (read more), otherwise a protocol/limit error (→ HTTP 400 / 431).
+```
 
-The **Probe** workflow (`.github/workflows/probe.yml`) tests HTTP/1.1 compliance across multiple server frameworks using [Glyph11.Probe](src/Glyph11.Probe), a tool that sends malformed and ambiguous HTTP requests and checks the server's response against strict RFC 9110/9112 expectations.
+Resolve the native library with the `GLYPH11_NATIVE_PATH` environment variable, or put
+`libglyph11.{so,dll,dylib}` on the OS load path.
+
+## Kotlin / JVM binding (C core via Panama FFM)
+
+```kotlin
+import io.glyph11.Glyph11
+import io.glyph11.Glyph11Span
+
+val request = "GET /api/users?page=1 HTTP/1.1\r\nHost: example.com\r\nAccept: */*\r\n\r\n"
+    .toByteArray(Charsets.ISO_8859_1)
+
+val r = Glyph11.parse(request)
+when {
+    r.isOk -> {
+        fun slice(s: Glyph11Span) = String(request, s.offset, s.length, Charsets.ISO_8859_1)
+        println(slice(r.method))                     // GET
+        println(slice(r.path))                       // /api/users
+        for (h in r.headers)
+            println("${slice(h.name)}: ${slice(h.value)}")
+    }
+    r.isIncomplete -> { /* read more bytes */ }
+    else -> println("rejected → HTTP ${Glyph11.httpCode(r.status)}")  // 400 / 431
+}
+```
 
-Servers tested: **Glyph11** (raw TCP + UltraHardenedParser), **Kestrel** (ASP.NET Core), **Flask** (Python), **Express** (Node.js), **Spring Boot** (Java), **Quarkus** (Java), **Nancy** (.NET), **Jetty** (Java), **Nginx** (native), **Apache** (native), **Caddy** (native), **Pingora** (Rust).
+Requires JDK 21+ (FFM — final since JDK 22; on JDK 21 it is a preview API and needs `--enable-preview`). Point at the library with `-Dglyph11.lib=/path/to/libglyph11.so`.
 
-| Trigger | What it does |
-|---------|--------------|
-| `pull_request` | Starts all three servers, probes each one, evaluates results with strict status-code matching (e.g. a parser error must return `400`, not `404`), and posts a comparison table as a PR comment. Never fails the build — this is informational. |
-| `workflow_dispatch` | Same as above, plus pushes `probe/data.js` to `gh-pages`. |
+## Benchmarks
 
-**Data flow:** probe results are stored as `probe/data.js` on the `gh-pages` branch.
+Live cross-language numbers — managed vs. the C core and its .NET / JVM bindings, contiguous and
+multi-segment: **<https://dotnet-web-stack.github.io/Glyph11/>**
 
-To publish updated probe data:
+## Build the native core (for the bindings)
 
-1. Merge your changes to `main`.
-2. Go to **Actions > Probe > Run workflow** on `main`.
+```sh
+cmake -S core -B core/build-rel -DGLYPH11_BUILD_TESTS=OFF
+cmake --build core/build-rel     # → core/build-rel/libglyph11.{so,dll,dylib}
+```
diff --git a/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt b/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt
index 7247088..c2df262 100644
--- a/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt
+++ b/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt
@@ -11,6 +11,9 @@ import java.lang.invoke.MethodHandle
 /** A byte range (offset + length) into the parsed input buffer (zero-copy). */
 data class Glyph11Span(val offset: Int, val length: Int)
 
+/** A parsed name/value pair (header or query parameter); both spans are (offset, length) ranges into the input buffer. */
+data class Glyph11Field(val name: Glyph11Span, val value: Glyph11Span)
+
 /** Parsed request fields. Spans index into the input passed to [Glyph11.parse]. */
 data class Glyph11Result(
     val status: Int,
@@ -18,12 +21,14 @@ data class Glyph11Result(
     val target: Glyph11Span,
     val path: Glyph11Span,
     val version: Glyph11Span,
-    val headerCount: Int,
-    val queryCount: Int,
+    val headers: List<Glyph11Field>,
+    val query: List<Glyph11Field>,
     val consumed: Long,
 ) {
     val isOk: Boolean get() = status == 0
     val isIncomplete: Boolean get() = status == 1
+    val headerCount: Int get() = headers.size
+    val queryCount: Int get() = query.size
 }
 
 /**
@@ -105,6 +110,14 @@ object Glyph11 {
 
             fun span(off: Long) =
                 Glyph11Span(req.get(ValueLayout.JAVA_INT, off), req.get(ValueLayout.JAVA_INT, off + 4))
+            fun fields(seg: MemorySegment, count: Int): List<Glyph11Field> = // decode the first `count` records stored in `seg`
+                (0 until count).map { i ->
+                    val b = i.toLong() * SIZEOF_FIELD // byte offset of record i (records are SIZEOF_FIELD bytes apart)
+                    Glyph11Field(
+                        Glyph11Span(seg.get(ValueLayout.JAVA_INT, b), seg.get(ValueLayout.JAVA_INT, b + 4)), // name: offset, length
+                        Glyph11Span(seg.get(ValueLayout.JAVA_INT, b + 8), seg.get(ValueLayout.JAVA_INT, b + 12)), // value: offset, length
+                    )
+                }
 
             return Glyph11Result(
                 status = status,
@@ -112,8 +125,8 @@ object Glyph11 {
                 target = span(8L),
                 path = span(16L),
                 version = span(24L),
-                headerCount = req.get(ValueLayout.JAVA_INT, OFF_HEADER_COUNT),
-                queryCount = req.get(ValueLayout.JAVA_INT, OFF_QUERY_COUNT),
+                headers = if (status == 0) fields(headers, req.get(ValueLayout.JAVA_INT, OFF_HEADER_COUNT)) else emptyList(),
+                query = if (status == 0) fields(query, req.get(ValueLayout.JAVA_INT, OFF_QUERY_COUNT)) else emptyList(),
                 consumed = if (status == 0) consumed.get(ValueLayout.JAVA_LONG, 0L) else 0L,
             )
         }
diff --git a/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt b/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt
index becdab9..a7719c5 100644
--- a/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt
+++ b/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt
@@ -43,6 +43,7 @@ private fun smoke() {
     check("path", slice(valid, r.path) == "/api/users")
     check("version", slice(valid, r.version) == "HTTP/1.1")
     check("headerCount", r.headerCount == 2)
+    check("header name/value", r.headers[0].let { slice(valid, it.name) == "Host" && slice(valid, it.value) == "example.com" })
     check("queryCount", r.queryCount == 2)
     check("consumed", r.consumed.toInt() == valid.size)