From ad666e9b979f9e22a8d5e588616c2585adb62c55 Mon Sep 17 00:00:00 2001 From: Diogo Martins Date: Thu, 4 Jun 2026 21:23:13 +0100 Subject: [PATCH 1/4] Fair managed multi-seg: reuse a buffer instead of the ToArray API The managed multi-seg bench called TryExtractFullHeaderValidated, which does input.ToArray() per call (a heap allocation), while the FFI multi-seg bench reused a buffer (seq.CopyTo into a once-allocated array). That made the bindings look ~2x faster on multi-seg when the difference was allocation strategy, not parse speed. Now the managed multi-seg also linearizes into the reused buffer (seq.CopyTo + ROM parse), so every multi-seg path = reused-buffer linearize + parse. Result: multi-seg = contiguous + a memcpy for all, and the native-vs-managed gap matches contiguous. Managed multi-seg 32KB drops 9262 -> 5606. Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/README.md | 26 ++++++------ bindings/dotnet/Glyph11.Bench/Program.cs | 10 +++-- site/data.json | 50 ++++++++++++------------ site/index.html | 5 ++- 4 files changed, 48 insertions(+), 43 deletions(-) diff --git a/bench/README.md b/bench/README.md index 4de69c7..dd5ebe2 100644 --- a/bench/README.md +++ b/bench/README.md @@ -33,19 +33,21 @@ benchmarks page. 
| Payload | C# Ultra | Pure C | C# (FFI) | Kotlin (FFI) | |---------|---------:|--------:|---------:|-------------:| -| ~95 B | 118 ns | 98 ns | 97 ns | 102 ns | -| 4 KB | 730 ns | 512 ns | 556 ns | 574 ns | -| 32 KB | 5028 ns | 3784 ns | 4254 ns | 4167 ns | +| ~95 B | 114 ns | 95 ns | 95 ns | 100 ns | +| 4 KB | 710 ns | 517 ns | 548 ns | 556 ns | +| 32 KB | 5180 ns | 3767 ns | 4120 ns | 4134 ns | -**Multi-segment** (3 segments): +**Multi-segment** (3 segments — each linearized into a *reused* buffer, then parsed): | Payload | C# Ultra | Pure C | C# (FFI) | Kotlin (FFI) | |---------|---------:|--------:|---------:|-------------:| -| ~95 B | 257 ns | 101 ns | 106 ns | 111 ns | -| 4 KB | 1363 ns | 545 ns | 587 ns | 603 ns | -| 32 KB | 9262 ns | 4256 ns | 4624 ns | 4658 ns | - -The FFI bindings track the pure-C floor (`[SuppressGCTransition]` for .NET, -reused off-heap buffers for Kotlin). Native multi-segment = contiguous + a -`memcpy`, so it stays close to contiguous and ~2× faster than the managed -multi-segment path (which allocates per call). Numbers vary run-to-run. +| ~95 B | 125 ns | 99 ns | 106 ns | 110 ns | +| 4 KB | 751 ns | 546 ns | 601 ns | 585 ns | +| 32 KB | 5606 ns | 4222 ns | 4521 ns | 4617 ns | + +Every parser linearizes the segments into a **reused buffer**, so multi-segment = +contiguous + a `memcpy` for all of them and the native-vs-managed gap stays the +same as contiguous — it's the parse engine, not the allocation. (The managed +`TryExtractFullHeaderValidated` *convenience* API would `input.ToArray()` instead, +a per-call allocation that makes it ~1.6× slower at 32 KB; the bench linearizes +manually so the comparison is apples-to-apples.) Numbers vary run-to-run. 
diff --git a/bindings/dotnet/Glyph11.Bench/Program.cs b/bindings/dotnet/Glyph11.Bench/Program.cs index 994edd7..27f96cb 100644 --- a/bindings/dotnet/Glyph11.Bench/Program.cs +++ b/bindings/dotnet/Glyph11.Bench/Program.cs @@ -117,21 +117,23 @@ public static void Run(string dir) var data = File.ReadAllBytes(Path.Combine(dir, file)); var rom = (ReadOnlyMemory<byte>)data; var seq = ThreeSegments(data); + var lin = new byte[data.Length]; // reused linearization buffer (no per-call allocation) // managed — ROM (single contiguous buffer) double mRom = Best(iters, () => { req.Clear(); var r = rom; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); }); Console.WriteLine($"dotnet-managed-rom,{name},{mRom:F1}"); - // managed — multi-segment (3 segments, linearized internally) - double mSeg = Best(iters, () => { req.Clear(); var s = seq; UltraHardenedParser.TryExtractFullHeaderValidated(ref s, req, in ManagedLimits, out _); }); + // managed — multi-segment: linearize into the reused buffer, then ROM-parse, so it is + // apples-to-apples with the native multi-seg paths. (The convenience API + // TryExtractFullHeaderValidated would input.ToArray() here — a per-call allocation.) 
+ double mSeg = Best(iters, () => { req.Clear(); seq.CopyTo(lin); ReadOnlyMemory r = lin; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); }); Console.WriteLine($"dotnet-managed-multiseg,{name},{mSeg:F1}"); // native binding (FFI) — contiguous double ffi = Best(iters, () => Glyph11Parser.Parse(data, h, q, NativeLimits, out _)); Console.WriteLine($"dotnet-ffi,{name},{ffi:F1}"); - // native binding (FFI) — multi-segment: linearize into a reused buffer, then parse - var lin = new byte[data.Length]; + // native binding (FFI) — multi-segment: same reused-buffer linearization, then parse double ffiSeg = Best(iters, () => { seq.CopyTo(lin); Glyph11Parser.Parse(lin, h, q, NativeLimits, out _); }); Console.WriteLine($"dotnet-ffi-multiseg,{name},{ffiSeg:F1}"); } diff --git a/site/data.json b/site/data.json index 9625c84..b6d0a6c 100644 --- a/site/data.json +++ b/site/data.json @@ -1,6 +1,6 @@ { "unit": "ns/op", - "generated": "2026-06-04 19:51 UTC", + "generated": "2026-06-04 20:22 UTC", "langs": [ { "key": "dotnet-managed-rom", @@ -39,38 +39,38 @@ { "payload": "small", "label": "~95 B", - "dotnet-managed-rom": 118.0, - "dotnet-managed-multiseg": 256.6, - "pure-c": 97.8, - "pure-c-multiseg": 101.4, - "dotnet-ffi": 97.2, - "dotnet-ffi-multiseg": 106.3, - "kotlin-ffi": 102.2, - "kotlin-ffi-multiseg": 110.8 + "dotnet-managed-rom": 113.9, + "dotnet-managed-multiseg": 125.3, + "pure-c": 94.8, + "pure-c-multiseg": 99.0, + "dotnet-ffi": 95.3, + "dotnet-ffi-multiseg": 106.4, + "kotlin-ffi": 100.1, + "kotlin-ffi-multiseg": 109.5 }, { "payload": "4k", "label": "4 KB", - "dotnet-managed-rom": 730.2, - "dotnet-managed-multiseg": 1362.7, - "pure-c": 512.4, - "pure-c-multiseg": 545.4, - "dotnet-ffi": 555.5, - "dotnet-ffi-multiseg": 586.7, - "kotlin-ffi": 574.0, - "kotlin-ffi-multiseg": 602.7 + "dotnet-managed-rom": 710.2, + "dotnet-managed-multiseg": 750.8, + "pure-c": 516.8, + "pure-c-multiseg": 545.8, + "dotnet-ffi": 548.5, + "dotnet-ffi-multiseg": 
601.3, + "kotlin-ffi": 555.6, + "kotlin-ffi-multiseg": 585.0 }, { "payload": "32k", "label": "32 KB", - "dotnet-managed-rom": 5028.1, - "dotnet-managed-multiseg": 9261.7, - "pure-c": 3784.2, - "pure-c-multiseg": 4256.1, - "dotnet-ffi": 4254.5, - "dotnet-ffi-multiseg": 4624.2, - "kotlin-ffi": 4166.6, - "kotlin-ffi-multiseg": 4657.8 + "dotnet-managed-rom": 5180.3, + "dotnet-managed-multiseg": 5605.8, + "pure-c": 3767.3, + "pure-c-multiseg": 4222.5, + "dotnet-ffi": 4120.5, + "dotnet-ffi-multiseg": 4520.9, + "kotlin-ffi": 4134.3, + "kotlin-ffi-multiseg": 4617.3 } ] } \ No newline at end of file diff --git a/site/index.html b/site/index.html index 049afb4..c1a6271 100644 --- a/site/index.html +++ b/site/index.html @@ -50,8 +50,9 @@

Benchmarks

payloads, in contiguous and multi-segment modes. C# Ultra is the standalone managed parser; Pure C is the native core; the bindings reach that core from .NET and the JVM. The bindings call the same C, so the native columns are within measurement noise of each - other — the meaningful gap is native vs. managed. Native multi-segment is contiguous + a memcpy - (no per-call allocation), unlike the managed multi-segment path. + other — the meaningful gap is native vs. managed. Multi-segment linearizes the 3 segments into a + reused buffer (a memcpy, no per-call allocation) for every implementation, so that gap stays the + parse engine, not the allocation.
github.com/dotnet-web-stack/Glyph11 From 69bebddec303cb797ef9dc6f5ba135f96dad2009 Mon Sep 17 00:00:00 2001 From: Diogo Martins Date: Thu, 4 Jun 2026 21:51:22 +0100 Subject: [PATCH 2/4] Multi-seg: count the managed library's real linearization (ToArray) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the earlier 'fair' swap. Since the C core is single-slab, the binding must linearize, so that copy is the binding's real cost (counted: reused-buffer CopyTo). By the same logic the managed column must show ITS real linearization — TryExtractFullHeaderValidated, which input.ToArray()s every request — not a hand-rolled reused buffer. So multi-seg now reflects what each actually does: managed allocates the linearization buffer per request (~9200 ns @ 32KB), the bindings reuse one (~4500 ns). The copy is in both; the ~2x gap is the per-request allocation the single-slab binding avoids (a managed caller can match it by hand-rolling CopyTo+ROM). Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/README.md | 30 +++++++------- bindings/dotnet/Glyph11.Bench/Program.cs | 7 ++-- site/data.json | 50 ++++++++++++------------ site/index.html | 9 +++-- 4 files changed, 50 insertions(+), 46 deletions(-) diff --git a/bench/README.md b/bench/README.md index dd5ebe2..7cb741f 100644 --- a/bench/README.md +++ b/bench/README.md @@ -33,21 +33,23 @@ benchmarks page. 
| Payload | C# Ultra | Pure C | C# (FFI) | Kotlin (FFI) | |---------|---------:|--------:|---------:|-------------:| -| ~95 B | 114 ns | 95 ns | 95 ns | 100 ns | -| 4 KB | 710 ns | 517 ns | 548 ns | 556 ns | -| 32 KB | 5180 ns | 3767 ns | 4120 ns | 4134 ns | +| ~95 B | 116 ns | 95 ns | 96 ns | 101 ns | +| 4 KB | 728 ns | 529 ns | 554 ns | 585 ns | +| 32 KB | 5067 ns | 3852 ns | 4203 ns | 4226 ns | -**Multi-segment** (3 segments — each linearized into a *reused* buffer, then parsed): +**Multi-segment** (3 segments — linearization always counted): | Payload | C# Ultra | Pure C | C# (FFI) | Kotlin (FFI) | |---------|---------:|--------:|---------:|-------------:| -| ~95 B | 125 ns | 99 ns | 106 ns | 110 ns | -| 4 KB | 751 ns | 546 ns | 601 ns | 585 ns | -| 32 KB | 5606 ns | 4222 ns | 4521 ns | 4617 ns | - -Every parser linearizes the segments into a **reused buffer**, so multi-segment = -contiguous + a `memcpy` for all of them and the native-vs-managed gap stays the -same as contiguous — it's the parse engine, not the allocation. (The managed -`TryExtractFullHeaderValidated` *convenience* API would `input.ToArray()` instead, -a per-call allocation that makes it ~1.6× slower at 32 KB; the bench linearizes -manually so the comparison is apples-to-apples.) Numbers vary run-to-run. +| ~95 B | 252 ns | 100 ns | 107 ns | 112 ns | +| 4 KB | 1346 ns | 560 ns | 600 ns | 602 ns | +| 32 KB | 9202 ns | 4444 ns | 4634 ns | 4773 ns | + +Multi-segment input **must** be linearized into a contiguous buffer first — that +copy is in every number above. The managed column is the library's real path, +`TryExtractFullHeaderValidated`, which linearizes via `input.ToArray()` (a fresh +allocation every request). The single-slab native core lets the bindings linearize +into a **reused** scratch buffer, avoiding that per-request allocation — ~2× faster +at 32 KB. 
That gap is a usage advantage, not a parser difference: a managed caller +can match it by hand-rolling `CopyTo` + ROM (≈ contiguous + a memcpy). Numbers vary +run-to-run. diff --git a/bindings/dotnet/Glyph11.Bench/Program.cs b/bindings/dotnet/Glyph11.Bench/Program.cs index 27f96cb..1d60c1e 100644 --- a/bindings/dotnet/Glyph11.Bench/Program.cs +++ b/bindings/dotnet/Glyph11.Bench/Program.cs @@ -123,10 +123,9 @@ public static void Run(string dir) double mRom = Best(iters, () => { req.Clear(); var r = rom; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); }); Console.WriteLine($"dotnet-managed-rom,{name},{mRom:F1}"); - // managed — multi-segment: linearize into the reused buffer, then ROM-parse, so it is - // apples-to-apples with the native multi-seg paths. (The convenience API - // TryExtractFullHeaderValidated would input.ToArray() here — a per-call allocation.) - double mSeg = Best(iters, () => { req.Clear(); seq.CopyTo(lin); ReadOnlyMemory r = lin; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); }); + // managed — multi-segment: the library's real path. TryExtractFullHeaderValidated + // linearizes via input.ToArray() (a fresh allocation every request), then ROM-parses. 
+ double mSeg = Best(iters, () => { req.Clear(); var s = seq; UltraHardenedParser.TryExtractFullHeaderValidated(ref s, req, in ManagedLimits, out _); }); Console.WriteLine($"dotnet-managed-multiseg,{name},{mSeg:F1}"); // native binding (FFI) — contiguous diff --git a/site/data.json b/site/data.json index b6d0a6c..a1e9aec 100644 --- a/site/data.json +++ b/site/data.json @@ -1,6 +1,6 @@ { "unit": "ns/op", - "generated": "2026-06-04 20:22 UTC", + "generated": "2026-06-04 20:50 UTC", "langs": [ { "key": "dotnet-managed-rom", @@ -39,38 +39,38 @@ { "payload": "small", "label": "~95 B", - "dotnet-managed-rom": 113.9, - "dotnet-managed-multiseg": 125.3, - "pure-c": 94.8, - "pure-c-multiseg": 99.0, - "dotnet-ffi": 95.3, - "dotnet-ffi-multiseg": 106.4, - "kotlin-ffi": 100.1, - "kotlin-ffi-multiseg": 109.5 + "dotnet-managed-rom": 116.3, + "dotnet-managed-multiseg": 251.8, + "pure-c": 95.2, + "pure-c-multiseg": 99.8, + "dotnet-ffi": 96.2, + "dotnet-ffi-multiseg": 107.2, + "kotlin-ffi": 100.8, + "kotlin-ffi-multiseg": 111.7 }, { "payload": "4k", "label": "4 KB", - "dotnet-managed-rom": 710.2, - "dotnet-managed-multiseg": 750.8, - "pure-c": 516.8, - "pure-c-multiseg": 545.8, - "dotnet-ffi": 548.5, - "dotnet-ffi-multiseg": 601.3, - "kotlin-ffi": 555.6, - "kotlin-ffi-multiseg": 585.0 + "dotnet-managed-rom": 728.1, + "dotnet-managed-multiseg": 1346.5, + "pure-c": 529.3, + "pure-c-multiseg": 560.3, + "dotnet-ffi": 553.8, + "dotnet-ffi-multiseg": 600.4, + "kotlin-ffi": 585.2, + "kotlin-ffi-multiseg": 602.0 }, { "payload": "32k", "label": "32 KB", - "dotnet-managed-rom": 5180.3, - "dotnet-managed-multiseg": 5605.8, - "pure-c": 3767.3, - "pure-c-multiseg": 4222.5, - "dotnet-ffi": 4120.5, - "dotnet-ffi-multiseg": 4520.9, - "kotlin-ffi": 4134.3, - "kotlin-ffi-multiseg": 4617.3 + "dotnet-managed-rom": 5067.2, + "dotnet-managed-multiseg": 9202.0, + "pure-c": 3851.5, + "pure-c-multiseg": 4443.6, + "dotnet-ffi": 4203.1, + "dotnet-ffi-multiseg": 4634.1, + "kotlin-ffi": 4226.2, + 
"kotlin-ffi-multiseg": 4772.9 } ] } \ No newline at end of file diff --git a/site/index.html b/site/index.html index c1a6271..9d358da 100644 --- a/site/index.html +++ b/site/index.html @@ -50,9 +50,12 @@

Benchmarks

payloads, in contiguous and multi-segment modes. C# Ultra is the standalone managed parser; Pure C is the native core; the bindings reach that core from .NET and the JVM. The bindings call the same C, so the native columns are within measurement noise of each - other — the meaningful gap is native vs. managed. Multi-segment linearizes the 3 segments into a - reused buffer (a memcpy, no per-call allocation) for every implementation, so that gap stays the - parse engine, not the allocation. + other — the meaningful gap is native vs. managed. Multi-segment must be linearized into a + contiguous buffer first, and that copy is counted in every multi-seg number: the managed parser + does it via TryExtractFullHeaderValidated (a per-request ToArray + allocation), while the single-slab native core lets the bindings linearize into a reused scratch + buffer — so the multi-seg gap reflects that allocation, which a managed caller can also avoid by + hand-rolling CopyTo + ROM.
github.com/dotnet-web-stack/Glyph11 From e51f5ddd0062d27904602ad83237d46137eb98d6 Mon Sep 17 00:00:00 2001 From: Diogo Martins Date: Thu, 4 Jun 2026 22:01:00 +0100 Subject: [PATCH 3/4] Multi-seg: linearize identically for all parsers (compare parser, not buffer strategy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The C# FFI does linearize on multi-segment — both paths do. The bug was that the managed column used the one-shot API (ToArray) while the FFI used a reused buffer, smuggling the linearization strategy into a parser comparison. Now every multi-seg path does CopyTo/memcpy into a reused buffer + parse, so the copy is counted identically and the column reflects the parser: multi-seg = contiguous + a memcpy for all, native ~1.2x ahead in both modes. The TryExtractFullHeaderValidated ToArray-per-request cost (~9.2us vs ~5.4us @ 32KB) is now a footnote, not a confound. Co-Authored-By: Claude Opus 4.8 (1M context) --- bench/README.md | 37 ++++++++++-------- bindings/dotnet/Glyph11.Bench/Program.cs | 8 ++-- site/data.json | 50 ++++++++++++------------ site/index.html | 10 ++--- 4 files changed, 56 insertions(+), 49 deletions(-) diff --git a/bench/README.md b/bench/README.md index 7cb741f..3a2b84d 100644 --- a/bench/README.md +++ b/bench/README.md @@ -33,23 +33,28 @@ benchmarks page. 
| Payload | C# Ultra | Pure C | C# (FFI) | Kotlin (FFI) | |---------|---------:|--------:|---------:|-------------:| -| ~95 B | 116 ns | 95 ns | 96 ns | 101 ns | -| 4 KB | 728 ns | 529 ns | 554 ns | 585 ns | -| 32 KB | 5067 ns | 3852 ns | 4203 ns | 4226 ns | +| ~95 B | 118 ns | 97 ns | 98 ns | 100 ns | +| 4 KB | 727 ns | 522 ns | 562 ns | 562 ns | +| 32 KB | 5039 ns | 3906 ns | 4122 ns | 4182 ns | -**Multi-segment** (3 segments — linearization always counted): +**Multi-segment** (3 segments — every parser linearizes into a reused buffer, copy counted): | Payload | C# Ultra | Pure C | C# (FFI) | Kotlin (FFI) | |---------|---------:|--------:|---------:|-------------:| -| ~95 B | 252 ns | 100 ns | 107 ns | 112 ns | -| 4 KB | 1346 ns | 560 ns | 600 ns | 602 ns | -| 32 KB | 9202 ns | 4444 ns | 4634 ns | 4773 ns | - -Multi-segment input **must** be linearized into a contiguous buffer first — that -copy is in every number above. The managed column is the library's real path, -`TryExtractFullHeaderValidated`, which linearizes via `input.ToArray()` (a fresh -allocation every request). The single-slab native core lets the bindings linearize -into a **reused** scratch buffer, avoiding that per-request allocation — ~2× faster -at 32 KB. That gap is a usage advantage, not a parser difference: a managed caller -can match it by hand-rolling `CopyTo` + ROM (≈ contiguous + a memcpy). Numbers vary -run-to-run. +| ~95 B | 130 ns | 102 ns | 110 ns | 120 ns | +| 4 KB | 753 ns | 553 ns | 612 ns | 592 ns | +| 32 KB | 5406 ns | 4324 ns | 4567 ns | 4795 ns | + +Multi-segment input must be linearized into a contiguous buffer first — that +per-request copy is counted in every number above. To compare the **parsers** (not +buffer strategy), every path linearizes the same way — `CopyTo`/`memcpy` into a +**reused** scratch buffer, then parse — so multi-segment = contiguous + a `memcpy` +for all of them, and native stays ~1.2× ahead in both modes (the parse engine). 
+ +> The managed one-shot API `TryExtractFullHeaderValidated` instead allocates that +> buffer via `input.ToArray()` **every request** — ~9.2 µs vs ~5.4 µs at 32 KB. For a +> multi-segment hot path, hand-roll `CopyTo` + `TryExtractFullHeaderROM` (or, for the +> binding, linearize into a reused buffer before the native call). It's an API cost, +> not a parser difference — hence a note, not the comparison. + +Numbers vary run-to-run. diff --git a/bindings/dotnet/Glyph11.Bench/Program.cs b/bindings/dotnet/Glyph11.Bench/Program.cs index 1d60c1e..367359b 100644 --- a/bindings/dotnet/Glyph11.Bench/Program.cs +++ b/bindings/dotnet/Glyph11.Bench/Program.cs @@ -123,9 +123,11 @@ public static void Run(string dir) double mRom = Best(iters, () => { req.Clear(); var r = rom; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); }); Console.WriteLine($"dotnet-managed-rom,{name},{mRom:F1}"); - // managed — multi-segment: the library's real path. TryExtractFullHeaderValidated - // linearizes via input.ToArray() (a fresh allocation every request), then ROM-parses. - double mSeg = Best(iters, () => { req.Clear(); var s = seq; UltraHardenedParser.TryExtractFullHeaderValidated(ref s, req, in ManagedLimits, out _); }); + // managed — multi-segment: linearize into the SAME reused buffer as the native paths, + // then ROM-parse, so the column compares the parser, not the linearization strategy. + // (The one-shot API TryExtractFullHeaderValidated would input.ToArray() instead — an + // allocation per request; that's an API cost, noted on the page/README, not here.) 
+ double mSeg = Best(iters, () => { req.Clear(); seq.CopyTo(lin); ReadOnlyMemory r = lin; UltraHardenedParser.TryExtractFullHeaderROM(ref r, req, in ManagedLimits, out _); }); Console.WriteLine($"dotnet-managed-multiseg,{name},{mSeg:F1}"); // native binding (FFI) — contiguous diff --git a/site/data.json b/site/data.json index a1e9aec..be91209 100644 --- a/site/data.json +++ b/site/data.json @@ -1,6 +1,6 @@ { "unit": "ns/op", - "generated": "2026-06-04 20:50 UTC", + "generated": "2026-06-04 21:00 UTC", "langs": [ { "key": "dotnet-managed-rom", @@ -39,38 +39,38 @@ { "payload": "small", "label": "~95 B", - "dotnet-managed-rom": 116.3, - "dotnet-managed-multiseg": 251.8, - "pure-c": 95.2, - "pure-c-multiseg": 99.8, - "dotnet-ffi": 96.2, - "dotnet-ffi-multiseg": 107.2, - "kotlin-ffi": 100.8, - "kotlin-ffi-multiseg": 111.7 + "dotnet-managed-rom": 118.2, + "dotnet-managed-multiseg": 130.2, + "pure-c": 97.1, + "pure-c-multiseg": 102.3, + "dotnet-ffi": 97.8, + "dotnet-ffi-multiseg": 109.6, + "kotlin-ffi": 100.2, + "kotlin-ffi-multiseg": 120.5 }, { "payload": "4k", "label": "4 KB", - "dotnet-managed-rom": 728.1, - "dotnet-managed-multiseg": 1346.5, - "pure-c": 529.3, - "pure-c-multiseg": 560.3, - "dotnet-ffi": 553.8, - "dotnet-ffi-multiseg": 600.4, - "kotlin-ffi": 585.2, - "kotlin-ffi-multiseg": 602.0 + "dotnet-managed-rom": 727.2, + "dotnet-managed-multiseg": 753.2, + "pure-c": 521.5, + "pure-c-multiseg": 553.3, + "dotnet-ffi": 562.0, + "dotnet-ffi-multiseg": 612.4, + "kotlin-ffi": 561.5, + "kotlin-ffi-multiseg": 591.5 }, { "payload": "32k", "label": "32 KB", - "dotnet-managed-rom": 5067.2, - "dotnet-managed-multiseg": 9202.0, - "pure-c": 3851.5, - "pure-c-multiseg": 4443.6, - "dotnet-ffi": 4203.1, - "dotnet-ffi-multiseg": 4634.1, - "kotlin-ffi": 4226.2, - "kotlin-ffi-multiseg": 4772.9 + "dotnet-managed-rom": 5039.4, + "dotnet-managed-multiseg": 5406.0, + "pure-c": 3906.2, + "pure-c-multiseg": 4324.2, + "dotnet-ffi": 4121.6, + "dotnet-ffi-multiseg": 4567.0, + "kotlin-ffi": 
4182.1, + "kotlin-ffi-multiseg": 4795.3 } ] } \ No newline at end of file diff --git a/site/index.html b/site/index.html index 9d358da..89904f2 100644 --- a/site/index.html +++ b/site/index.html @@ -51,11 +51,11 @@

Benchmarks

managed parser; Pure C is the native core; the bindings reach that core from .NET and the JVM. The bindings call the same C, so the native columns are within measurement noise of each other — the meaningful gap is native vs. managed. Multi-segment must be linearized into a - contiguous buffer first, and that copy is counted in every multi-seg number: the managed parser - does it via TryExtractFullHeaderValidated (a per-request ToArray - allocation), while the single-slab native core lets the bindings linearize into a reused scratch - buffer — so the multi-seg gap reflects that allocation, which a managed caller can also avoid by - hand-rolling CopyTo + ROM. + contiguous buffer first; that per-request copy is counted in every multi-seg number, using a + reused scratch buffer for every parser so the column compares the parser, not the copy. + (The managed one-shot API TryExtractFullHeaderValidated instead allocates that + buffer via ToArray each request — an API cost you avoid by hand-rolling + CopyTo + ROM.)
github.com/dotnet-web-stack/Glyph11 From 62d6f9f1115cfb036f4856bf6f98f148c7478c43 Mon Sep 17 00:00:00 2001 From: Diogo Martins Date: Thu, 4 Jun 2026 22:37:54 +0100 Subject: [PATCH 4/4] README: rewrite around usage examples for the lib + both bindings Replaces the prose README with verified, copy-pasteable request-header-parsing examples for the C# library (UltraHardenedParser), the .NET binding (Glyph11Parser, zero-alloc caller storage), and the Kotlin binding (Glyph11.parse). To make the Kotlin example real, the binding now surfaces parsed headers/query as List (name/value spans) instead of only a count. All three examples were compiled/run against the real libraries; Kotlin smoke now also asserts a header name/value. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 149 ++++++++++-------- .../src/main/kotlin/io/glyph11/Glyph11.kt | 21 ++- .../kotlin/src/main/kotlin/io/glyph11/Main.kt | 1 + 3 files changed, 103 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index b601c99..689422f 100644 --- a/README.md +++ b/README.md @@ -1,100 +1,121 @@ # Glyph11 -Glyph11 is a dependency free, low allocation HTTP/1.1 parser for C#. It does not rely on any specific network technology but can be used with any (such as `Socket`, `NetworkStream`, `PipeReader` or anything else). +A zero-allocation, hardened HTTP/1.1 request parser — a pure-C# library and a C core +(`libglyph11`) reachable from .NET and the JVM. RFC 9110/9112 validation, configurable +resource limits, and request-smuggling / semantic checks fused into a single zero-copy pass. 
-![.NET](https://img.shields.io/badge/.NET-8.0%20%7C%209.0%20%7C%2010.0-512bd4) [![NuGet](https://img.shields.io/nuget/v/Glyph11.svg)](https://www.nuget.org/packages/Glyph11/) +![.NET](https://img.shields.io/badge/.NET-8.0%20%7C%209.0%20%7C%2010.0-512bd4) [![Benchmarks](https://img.shields.io/badge/benchmarks-live-blue)](https://dotnet-web-stack.github.io/Glyph11/) -[![Coverage](https://img.shields.io/sonar/coverage/MDA2AV_Glyph11?server=https%3A%2F%2Fsonarcloud.io)](https://sonarcloud.io/summary/new_code?id=MDA2AV_Glyph11) -[![Quality Gate](https://sonarcloud.io/api/project_badges/measure?project=MDA2AV_Glyph11&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=MDA2AV_Glyph11) -## Usage +Three ways to use the same hardened parser: + +| | What | Header storage | +|---|---|---| +| **C# library** | pure managed `UltraHardenedParser` | pooled, internal | +| **.NET binding** | the C core via P/Invoke | caller-provided (zero-alloc) | +| **Kotlin binding** | the C core via Panama FFM | per-call, returned as a list | -Glyph11 works with any source that produces a `ReadOnlySequence` or `ReadOnlyMemory` — `PipeReader`, `Socket`, `NetworkStream`, or raw byte arrays. +## C# library (managed) ```csharp using System.Buffers; +using System.Text; using Glyph11.Protocol; using Glyph11.Parser; using Glyph11.Parser.UltraHardened; var request = new BinaryRequest(); -var limits = ParserLimits.Default; +var limits = ParserLimits.Default; -ReadOnlySequence buffer = ...; // from any network source +// From any source that yields a ReadOnlySequence (PipeReader, Socket, NetworkStream, …) +ReadOnlySequence buffer = new(Encoding.ASCII.GetBytes( + "GET /api/users?page=1 HTTP/1.1\r\nHost: example.com\r\nAccept: */*\r\n\r\n")); -// UltraHardenedParser fuses structural parsing, resource limits, and every -// semantic check (smuggling, traversal, Host rules, ...) into one pass. -// It throws HttpParseException on any protocol or semantic violation. 
if (UltraHardenedParser.TryExtractFullHeaderValidated(ref buffer, request, in limits, out int bytesRead)) { - // All parsed fields are zero-copy slices into the original buffer: - // request.Method.Span → e.g. "GET" - // request.Path.Span → e.g. "/api/users" - // request.Version.Span → e.g. "HTTP/1.1" - // request.Headers → KeyValueList of name/value pairs - // request.QueryParameters → KeyValueList of query params - - // The request is fully validated — safe to process. - // Then advance your reader by bytesRead. - - // Reuse between requests — clear instead of reallocating: - request.Headers.Clear(); - request.QueryParameters.Clear(); -} -``` - -Glyph11 plugs into a `PipeReader` loop: read a buffer, call `TryExtractFullHeaderValidated`, advance the reader by `bytesRead`, and repeat. - -## Parsers - -Glyph11 ships two parsers: - -- **`UltraHardenedParser`** — RFC 9110/9112 compliant with full validation, configurable resource limits, and every smuggling/semantic check fused into the parse pass. Recommended for internet-facing applications. -- **`FlexibleParser`** — Minimal validation for maximum throughput. Suitable for trusted environments where input is pre-validated. - -## Performance + Console.WriteLine(Encoding.ASCII.GetString(request.Method.Span)); // GET + Console.WriteLine(Encoding.ASCII.GetString(request.Path.Span)); // /api/users -- **ROM path is zero-allocation** — no GC pressure regardless of request size -- **SIMD-accelerated validation** keeps the `UltraHardenedParser` within a small constant factor of the unvalidated `FlexibleParser` -- **Multi-segment linearization** provides ROM-speed parsing with a single upfront allocation + for (int i = 0; i < request.Headers.Count; i++) + { + var (name, value) = request.Headers[i]; // zero-copy slices + Console.WriteLine($"{Encoding.ASCII.GetString(name.Span)}: {Encoding.ASCII.GetString(value.Span)}"); + } -See the [live benchmarks](https://dotnet-web-stack.github.io/Glyph11/) — the managed parser vs. 
the C core and its .NET (P/Invoke) and JVM (Panama FFM) bindings, contiguous and multi-segment. + // advance your reader by bytesRead; reuse `request` across calls (request.Clear()). +} +// throws HttpParseException on a protocol/semantic violation; returns false if incomplete. +``` -## CI Workflows +`TryExtractFullHeaderROM(ref ReadOnlyMemory, …)` is the single-buffer (contiguous) fast path. +`FlexibleParser` is a minimal-validation variant for trusted, pre-validated input. -### Benchmarks +## .NET binding (C core via P/Invoke) -The **Benchmark** workflow (`.github/workflows/benchmark.yml`) measures parser throughput and allocation using BenchmarkDotNet. +Calls `libglyph11` directly — same validation, native speed, **zero allocation** (you provide the +header/query storage). -| Trigger | Job | What it does | -|---------|-----|--------------| -| `pull_request` | **Parser Benchmarks** | Runs `FlexibleParserBenchmark` and `UltraHardenedParserBenchmark`, compares against the baseline on `gh-pages`, and posts a comment on the PR. Fails if any metric regresses by more than 15%. | -| `workflow_dispatch` | **Full Benchmarks** | Runs all benchmarks (parsers + `AllSemanticChecksBenchmark`) and updates the baseline on `gh-pages`. | +```csharp +using System.Text; +using Glyph11.Native; -**Data flow:** benchmark results are stored as `benchmarks/data.js` on the `gh-pages` branch. +byte[] request = Encoding.ASCII.GetBytes( + "GET /api/users?page=1 HTTP/1.1\r\nHost: example.com\r\nAccept: */*\r\n\r\n"); -> The cross-language comparison on the [live site](https://dotnet-web-stack.github.io/Glyph11/) is produced separately by the **Cross-Language Benchmark** workflow (`.github/workflows/cross-bench.yml`), which benchmarks the C core, both bindings, and the managed parser, then publishes `benchmarks/cross-lang.json` to `gh-pages`. 
+Span<Glyph11Field> headers = stackalloc Glyph11Field[64]; +Span<Glyph11Field> query = stackalloc Glyph11Field[32]; -To publish updated benchmark data: +int status = Glyph11Parser.Parse(request, headers, query, Glyph11Limits.Default, out var r); +if (status == Glyph11Parser.Ok) +{ + string Slice(Glyph11Span s) => Encoding.ASCII.GetString(request, (int)s.Offset, (int)s.Length); -1. Merge your changes to `main`. -2. Go to **Actions > Benchmark > Run workflow** on `main`. + Console.WriteLine(Slice(r.Method)); // GET + Console.WriteLine(Slice(r.Path)); // /api/users -### Compliance Probe + for (int i = 0; i < r.HeaderCount; i++) + Console.WriteLine($"{Slice(headers[i].Name)}: {Slice(headers[i].Value)}"); +} +// status: 0 = OK, 1 = incomplete (read more), otherwise a protocol/limit error (→ HTTP 400 / 431). +``` -The **Probe** workflow (`.github/workflows/probe.yml`) tests HTTP/1.1 compliance across multiple server frameworks using [Glyph11.Probe](src/Glyph11.Probe), a tool that sends malformed and ambiguous HTTP requests and checks the server's response against strict RFC 9110/9112 expectations. +Resolve the native library with the `GLYPH11_NATIVE_PATH` environment variable, or put +`libglyph11.{so,dll,dylib}` on the OS load path. 
+ +## Kotlin / JVM binding (C core via Panama FFM) + +```kotlin +import io.glyph11.Glyph11 +import io.glyph11.Glyph11Span + +val request = "GET /api/users?page=1 HTTP/1.1\r\nHost: example.com\r\nAccept: */*\r\n\r\n" + .toByteArray(Charsets.ISO_8859_1) + +val r = Glyph11.parse(request) +when { + r.isOk -> { + fun slice(s: Glyph11Span) = String(request, s.offset, s.length, Charsets.ISO_8859_1) + println(slice(r.method)) // GET + println(slice(r.path)) // /api/users + for (h in r.headers) + println("${slice(h.name)}: ${slice(h.value)}") + } + r.isIncomplete -> { /* read more bytes */ } + else -> println("rejected → HTTP ${Glyph11.httpCode(r.status)}") // 400 / 431 +} +``` -Servers tested: **Glyph11** (raw TCP + UltraHardenedParser), **Kestrel** (ASP.NET Core), **Flask** (Python), **Express** (Node.js), **Spring Boot** (Java), **Quarkus** (Java), **Nancy** (.NET), **Jetty** (Java), **Nginx** (native), **Apache** (native), **Caddy** (native), **Pingora** (Rust). +Requires JDK 21+ (FFM). Point at the library with `-Dglyph11.lib=/path/to/libglyph11.so`. -| Trigger | What it does | -|---------|--------------| -| `pull_request` | Starts all three servers, probes each one, evaluates results with strict status-code matching (e.g. a parser error must return `400`, not `404`), and posts a comparison table as a PR comment. Never fails the build — this is informational. | -| `workflow_dispatch` | Same as above, plus pushes `probe/data.js` to `gh-pages`. | +## Benchmarks -**Data flow:** probe results are stored as `probe/data.js` on the `gh-pages` branch. +Live cross-language numbers — managed vs. the C core and its .NET / JVM bindings, contiguous and +multi-segment: **<https://dotnet-web-stack.github.io/Glyph11/>** -To publish updated probe data: +## Build the native core (for the bindings) -1. Merge your changes to `main`. -2. Go to **Actions > Probe > Run workflow** on `main`. 
+```sh +cmake -S core -B core/build-rel -DGLYPH11_BUILD_TESTS=OFF +cmake --build core/build-rel # → core/build-rel/libglyph11.{so,dll,dylib} +``` diff --git a/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt b/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt index 7247088..c2df262 100644 --- a/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt +++ b/bindings/kotlin/src/main/kotlin/io/glyph11/Glyph11.kt @@ -11,6 +11,9 @@ import java.lang.invoke.MethodHandle /** A byte range (offset + length) into the parsed input buffer (zero-copy). */ data class Glyph11Span(val offset: Int, val length: Int) +/** A parsed name/value pair (header or query parameter); spans index into the input. */ +data class Glyph11Field(val name: Glyph11Span, val value: Glyph11Span) + /** Parsed request fields. Spans index into the input passed to [Glyph11.parse]. */ data class Glyph11Result( val status: Int, @@ -18,12 +21,14 @@ data class Glyph11Result( val target: Glyph11Span, val path: Glyph11Span, val version: Glyph11Span, - val headerCount: Int, - val queryCount: Int, + val headers: List<Glyph11Field>, + val query: List<Glyph11Field>, val consumed: Long, ) { val isOk: Boolean get() = status == 0 val isIncomplete: Boolean get() = status == 1 + val headerCount: Int get() = headers.size + val queryCount: Int get() = query.size } /** @@ -105,6 +110,14 @@ object Glyph11 { fun span(off: Long) = Glyph11Span(req.get(ValueLayout.JAVA_INT, off), req.get(ValueLayout.JAVA_INT, off + 4)) + fun fields(seg: MemorySegment, count: Int): List<Glyph11Field> = + (0 until count).map { i -> + val b = i.toLong() * SIZEOF_FIELD + Glyph11Field( + Glyph11Span(seg.get(ValueLayout.JAVA_INT, b), seg.get(ValueLayout.JAVA_INT, b + 4)), + Glyph11Span(seg.get(ValueLayout.JAVA_INT, b + 8), seg.get(ValueLayout.JAVA_INT, b + 12)), + ) + } return Glyph11Result( status = status, @@ -112,8 +125,8 @@ object Glyph11 { target = span(8L), path = span(16L), version = span(24L), - headerCount = req.get(ValueLayout.JAVA_INT, OFF_HEADER_COUNT), - queryCount = 
req.get(ValueLayout.JAVA_INT, OFF_QUERY_COUNT), + headers = if (status == 0) fields(headers, req.get(ValueLayout.JAVA_INT, OFF_HEADER_COUNT)) else emptyList(), + query = if (status == 0) fields(query, req.get(ValueLayout.JAVA_INT, OFF_QUERY_COUNT)) else emptyList(), consumed = if (status == 0) consumed.get(ValueLayout.JAVA_LONG, 0L) else 0L, ) } diff --git a/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt b/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt index becdab9..a7719c5 100644 --- a/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt +++ b/bindings/kotlin/src/main/kotlin/io/glyph11/Main.kt @@ -43,6 +43,7 @@ private fun smoke() { check("path", slice(valid, r.path) == "/api/users") check("version", slice(valid, r.version) == "HTTP/1.1") check("headerCount", r.headerCount == 2) + check("header name/value", r.headers[0].let { slice(valid, it.name) == "Host" && slice(valid, it.value) == "example.com" }) check("queryCount", r.queryCount == 2) check("consumed", r.consumed.toInt() == valid.size)