From 917f3ce440b30b7a0d51f911f5e4b7fc0381ecb9 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 19:02:44 +0100
Subject: [PATCH 01/36] chore(benchmarks): remove three redundant benchmark
 mains

FullCvBenchmark duplicated the JMH TemplateCvJmhBenchmark (CV through
ModernProfessional) with a hand-rolled, JIT-noisier loop and no report.
GraphComposeBenchmark was an early-engine relic measuring the same
title+body+divider doc as CurrentSpeedBenchmark's engine-simple scenario.
ScalabilityBenchmark's thread-scaling sweep is folded into
CurrentSpeedBenchmark's full-profile throughput run (thread counts now
1,2,4,8,16).

Drop the matching run-benchmarks.ps1 steps and the benchmarks.md /
benchmarks/README.md entries. ComparativeBenchmark, the JMH benches, the
deterministic probes, and the soak/stress runners stay. Benchmark module
compiles; its 28 tests pass.
---
 CHANGELOG.md                                  |  7 ++
 benchmarks/README.md                          |  6 +-
 .../demcha/compose/CurrentSpeedBenchmark.java |  4 +-
 .../com/demcha/compose/FullCvBenchmark.java   | 84 ------------------
 .../demcha/compose/GraphComposeBenchmark.java | 79 -----------------
 .../demcha/compose/ScalabilityBenchmark.java  | 88 -------------------
 docs/operations/benchmarks.md                 |  9 +-
 scripts/run-benchmarks.ps1                    |  7 +-
 8 files changed, 15 insertions(+), 269 deletions(-)
 delete mode 100644 benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
 delete mode 100644 benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
 delete mode 100644 benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19c44ff5f..e9f7124c2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -337,6 +337,13 @@ Entries land here as they merge.
 
 ### Internal
 
+- **Benchmark suite cleanup (not shipped).** Removed three redundant
+  benchmark mains: `FullCvBenchmark` (superseded by the JMH
+  `TemplateCvJmhBenchmark`), `GraphComposeBenchmark` (early-engine relic
+  duplicating `CurrentSpeedBenchmark`'s `engine-simple` scenario), and
+  `ScalabilityBenchmark` (its thread-scaling sweep folded into
+  `CurrentSpeedBenchmark`'s full-profile throughput run, now `1,2,4,8,16`).
+  Dropped the matching `run-benchmarks.ps1` steps and doc entries.
 - **Removed the `java.awt.*` / `java.util.*` co-wildcard in four files.**
   `InvoiceTemplateComposer`, `ProposalTemplateComposer`,
   `WeeklyScheduleTemplateComposer`, and the engine `PdfRenderingSystemECS`
diff --git a/benchmarks/README.md b/benchmarks/README.md
index f6041365c..e232c6e21 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -62,15 +62,13 @@
 | File | Role |
 |---|---|
 | `CurrentSpeedBenchmark` | Default scenario runner — what CI's `perf-smoke` job exercises. Takes a `-Dgraphcompose.benchmark.profile=smoke\|full\|stress` switch. |
 | `ComparativeBenchmark` | Renders the same fixtures through GraphCompose, iText, openHTMLToPDF, JasperReports. **Rough local comparison only** — see "When not to use" above. |
-| `FullCvBenchmark`, `ScalabilityBenchmark` | Fixture-specific runners for CV and table-heavy scenarios. |
 | `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
 | `BenchmarkReportWriter` | Writes JSON / CSV / text reports under `benchmarks/target/benchmarks/`. |
 | `BenchmarkDiffTool` | Compares two JSON reports and prints a delta table. Useful for pre/post comparisons. |
 | `BenchmarkMedianTool` | Median + dispersion across N runs of the same scenario. |
 | `GraphComposeStressTest`, `EnduranceTest` | Long-running stress / endurance harnesses. |
-| `GraphComposeBenchmark` | Legacy entry point preserved for one downstream caller. New work should target `CurrentSpeedBenchmark`. |
 
 ## Running
 
 From the repo root:
diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index 2858d64a6..bbda30b8f 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -55,7 +55,9 @@ public final class CurrentSpeedBenchmark {
     private static final int DEFAULT_FULL_WARMUP_ITERATIONS = 12;
     private static final int DEFAULT_FULL_MEASUREMENT_ITERATIONS = 40;
     private static final int DEFAULT_FULL_DOCS_PER_THREAD = 12;
-    private static final String DEFAULT_FULL_THREAD_COUNTS = "1,2,4,8";
+    // The 16-thread tier is absorbed from the removed ScalabilityBenchmark so the
+    // full profile keeps a thread-scaling data point (smoke runs no throughput).
+    private static final String DEFAULT_FULL_THREAD_COUNTS = "1,2,4,8,16";
     // Bumped from 2/5 to 30/100 so smoke runs reach a steady JIT state and the
     // p95 calculation actually has enough samples to interpolate rather than
     // collapsing to the maximum observed time. The smoke profile remains the
diff --git a/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
deleted file mode 100644
index c035f96e3..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
+++ /dev/null
@@ -1,84 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.document.api.DocumentSession;
-import com.demcha.compose.document.templates.api.DocumentTemplate;
-import com.demcha.compose.document.templates.cv.presets.ModernProfessional;
-import com.demcha.compose.document.templates.cv.spec.CvSpec;
-import com.demcha.compose.document.theme.BusinessTheme;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.Arrays;
-
-public class FullCvBenchmark {
-
-    private static final int WARMUP_ITERATIONS = Integer.getInteger("graphcompose.benchmark.fullCv.warmup", 100);
-    private static final int MEASUREMENT_ITERATIONS = Integer.getInteger("graphcompose.benchmark.fullCv.iterations", 500);
-
-    public static void main(String[] args) {
-        BenchmarkSupport.configureQuietLogging();
-        System.out.println("Starting FullCvBenchmark...");
-
-        CvSpec cv = CanonicalBenchmarkSupport.canonicalCv();
-        DocumentTemplate<CvSpec> template = ModernProfessional.create(BusinessTheme.modern());
-
-        System.out.println("Warming up JVM (JIT compilation, font cache warmup)...");
-        for (int i = 0; i < WARMUP_ITERATIONS; i++) {
-            generateCvInMemory(template, cv);
-        }
-
-        System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
-        long[] durationsNs = new long[MEASUREMENT_ITERATIONS];
-
-        for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
-            long start = System.nanoTime();
-            generateCvInMemory(template, cv);
-            long end = System.nanoTime();
-            durationsNs[i] = end - start;
-        }
-
-        printStatistics(durationsNs);
-    }
-
-    private static void generateCvInMemory(DocumentTemplate<CvSpec> template, CvSpec cv) {
-        try (DocumentSession document = GraphCompose.document()
-                .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
-                .margin(15, 10, 15, 15)
-                .create()) {
-            template.compose(document, cv);
-            document.toPdfBytes();
-        } catch (Exception e) {
-            throw new RuntimeException("Failed to generate PDF", e);
-        }
-    }
-
-    private static void printStatistics(long[] durationsNs) {
-        Arrays.sort(durationsNs);
-
-        double[] durationsMs = Arrays.stream(durationsNs).mapToDouble(ns -> ns / 1_000_000.0).toArray();
-
-        double min = durationsMs[0];
-        double max = durationsMs[durationsMs.length - 1];
-        double avg = Arrays.stream(durationsMs).average().orElse(0.0);
-        double median = durationsMs[(int) (durationsMs.length * 0.5)];
-        double p95 = durationsMs[(int) (durationsMs.length * 0.95)];
-        double p99 = durationsMs[(int) (durationsMs.length * 0.99)];
-
-        System.out.println("\nBenchmark results (milliseconds):");
-        System.out.println("------------------------------------------------");
-        System.out.printf("Min time:           %.2f ms%n", min);
-        System.out.printf("Average time:       %.2f ms%n", avg);
-        System.out.printf("Median (50%%):       %.2f ms (typical response time)%n", median);
-        System.out.printf("95th percentile:    %.2f ms (95%% of runs finish within this)%n", p95);
-        System.out.printf("99th percentile:    %.2f ms (rare spikes or GC pressure)%n", p99);
-        System.out.printf("Max time:           %.2f ms%n", max);
-        System.out.println("------------------------------------------------");
-
-        if (median < 200) {
-            System.out.println("Verdict: Excellent. The engine is very fast for this scenario.");
-        } else if (median < 1000) {
-            System.out.println("Verdict: Good. This is a healthy speed for complex generation.");
-        } else {
-            System.out.println("Verdict: Slow enough to investigate with a profiler.");
-        }
-    }
-}
diff --git a/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
deleted file mode 100644
index f4717e66c..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.engine.components.style.Margin;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.Arrays;
-
-public class GraphComposeBenchmark {
-
-    private static final int WARMUP_ITERATIONS = Integer.getInteger("graphcompose.benchmark.coreEngine.warmup", 100);
-    private static final int MEASUREMENT_ITERATIONS = Integer.getInteger("graphcompose.benchmark.coreEngine.iterations", 500);
-
-    public static void main(String[] args) {
-        BenchmarkSupport.configureQuietLogging();
-        System.out.println("Starting GraphComposeBenchmark...");
-
-        System.out.println("Warming up JVM (JIT compilation, font cache warmup)...");
-        for (int i = 0; i < WARMUP_ITERATIONS; i++) {
-            generateCvInMemory();
-        }
-
-        System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
-        long[] durationsNs = new long[MEASUREMENT_ITERATIONS];
-
-        for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
-            long start = System.nanoTime();
-            generateCvInMemory();
-            long end = System.nanoTime();
-            durationsNs[i] = end - start;
-        }
-
-        printStatistics(durationsNs);
-    }
-
-    private static void generateCvInMemory() {
-        try {
-            CanonicalBenchmarkSupport.renderSimpleBenchmarkDocument(
-                    PDRectangle.A4,
-                    Margin.of(24),
-                    "CoreEngineRoot",
-                    "GraphCompose Core Benchmark",
-                    "Analytical engineer focused on reliable platform design. "
-                            + "Testing paragraph breaking and layout calculation engine.");
-        } catch (Exception e) {
-            throw new RuntimeException("Failed to generate PDF", e);
-        }
-    }
-
-    private static void printStatistics(long[] durationsNs) {
-        Arrays.sort(durationsNs);
-
-        double[] durationsMs = Arrays.stream(durationsNs).mapToDouble(ns -> ns / 1_000_000.0).toArray();
-
-        double min = durationsMs[0];
-        double max = durationsMs[durationsMs.length - 1];
-        double avg = Arrays.stream(durationsMs).average().orElse(0.0);
-        double median = durationsMs[(int) (durationsMs.length * 0.5)];
-        double p95 = durationsMs[(int) (durationsMs.length * 0.95)];
-        double p99 = durationsMs[(int) (durationsMs.length * 0.99)];
-
-        System.out.println("\nBenchmark results (milliseconds):");
-        System.out.println("------------------------------------------------");
-        System.out.printf("Min time:           %.2f ms%n", min);
-        System.out.printf("Average time:       %.2f ms%n", avg);
-        System.out.printf("Median (50%%):       %.2f ms (typical response time)%n", median);
-        System.out.printf("95th percentile:    %.2f ms (95%% of runs finish within this)%n", p95);
-        System.out.printf("99th percentile:    %.2f ms (rare spikes or GC pressure)%n", p99);
-        System.out.printf("Max time:           %.2f ms%n", max);
-        System.out.println("------------------------------------------------");
-
-        if (median < 100) {
-            System.out.println("Verdict: Excellent. The engine is very fast for this scenario.");
-        } else if (median < 500) {
-            System.out.println("Verdict: Good. This is a healthy speed for a synchronous REST API.");
-        } else {
-            System.out.println("Verdict: Slow enough to investigate with a profiler.");
-        }
-    }
-}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java
deleted file mode 100644
index b8e945ef6..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java
+++ /dev/null
@@ -1,88 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.engine.components.style.Margin;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.*;
-
-/**
- * Linear Scalability Test
- * Measures throughput (documents per second) as thread count increases.
- */
-public class ScalabilityBenchmark {
-
-    private static final int DOCUMENTS_PER_THREAD = Integer.getInteger("graphcompose.scalability.documentsPerThread", 100);
-    private static final int WARMUP_DOCS = Integer.getInteger("graphcompose.scalability.warmupDocs", 100);
-    private static final String THREAD_COUNTS = System.getProperty("graphcompose.scalability.threads", "1,2,4,8,16");
-
-    public static void main(String[] args) throws Exception {
-        BenchmarkSupport.configureQuietLogging();
-        System.out.println("Starting Scalability Benchmark: Linear Scalability");
-        System.out.println("------------------------------------------------------------");
-
-        // Warmup
-        for (int i = 0; i < WARMUP_DOCS; i++) {
-            generateOne();
-        }
-
-        int[] threadCounts = parseThreadCounts(THREAD_COUNTS);
-        System.out.println(String.format("%-10s | %-15s | %-12s", "Threads", "Total Docs", "Throughput (docs/sec)"));
-        System.out.println("------------------------------------------------------------");
-
-        for (int threads : threadCounts) {
-            runScalabilityTest(threads);
-        }
-    }
-
-    private static void runScalabilityTest(int threads) throws Exception {
-        int totalDocs = threads * DOCUMENTS_PER_THREAD;
-        ExecutorService executor = Executors.newFixedThreadPool(threads);
-        
-        long startTime = System.nanoTime();
-        
-        List<Future<?>> futures = new ArrayList<>();
-        for (int i = 0; i < totalDocs; i++) {
-            futures.add(executor.submit(() -> {
-                try {
-                    generateOne();
-                } catch (Exception e) {
-                    e.printStackTrace();
-                }
-            }));
-        }
-
-        for (Future<?> future : futures) {
-            future.get();
-        }
-
-        long endTime = System.nanoTime();
-        executor.shutdown();
-        executor.awaitTermination(1, TimeUnit.MINUTES);
-
-        double durationSec = (endTime - startTime) / 1_000_000_000.0;
-        double throughput = totalDocs / durationSec;
-
-        System.out.println(String.format("%-10d | %-15d | %12.2f", threads, totalDocs, throughput));
-    }
-
-    private static void generateOne() throws Exception {
-        CanonicalBenchmarkSupport.renderSimpleBenchmarkDocument(
-                PDRectangle.A4,
-                Margin.of(24),
-                "ScalabilityRoot",
-                "Scalability",
-                "Scalability test message.");
-    }
-
-    private static int[] parseThreadCounts(String raw) {
-        return Arrays.stream(raw.split(","))
-                .map(String::trim)
-                .filter(value -> !value.isEmpty())
-                .mapToInt(Integer::parseInt)
-                .filter(value -> value > 0)
-                .toArray();
-    }
-}
diff --git a/docs/operations/benchmarks.md b/docs/operations/benchmarks.md
index 315f4d523..775483384 100644
--- a/docs/operations/benchmarks.md
+++ b/docs/operations/benchmarks.md
@@ -36,15 +36,10 @@ The script prints numbered sections so you can map console output to the pipelin
 1. `01-build-classpath`
    Builds the test classpath once and writes `target/benchmark.classpath`.
 2. `02-current-speed`
-   Runs `CurrentSpeedBenchmark` in the selected profile.
+   Runs `CurrentSpeedBenchmark` in the selected profile. The full profile also
+   runs the thread-scaling throughput sweep (1 → 16 threads).
 3. `03-comparative`
    Runs the GraphCompose canonical vs iText 5 vs JasperReports comparison.
-4. `04-core-engine`
-   Runs `GraphComposeBenchmark`.
-5. `05-full-cv`
-   Runs `FullCvBenchmark`.
-6. `06-scalability`
-   Runs the thread-scaling throughput benchmark.
 7. `07-stress`
    Runs the concurrent stability stress test.
 8. `08-endurance`
diff --git a/scripts/run-benchmarks.ps1 b/scripts/run-benchmarks.ps1
index dbe162c08..e3d3947b6 100644
--- a/scripts/run-benchmarks.ps1
+++ b/scripts/run-benchmarks.ps1
@@ -5,8 +5,8 @@ Runs the local GraphCompose benchmark pipeline and stores timestamped logs and r
 
 .DESCRIPTION
 The wrapper performs a staged local run:
-01 build classpath, 02 current-speed, 03 comparative, 04 core engine, 05 full CV, 06 scalability,
-07 stress, optional 08 endurance, then 09/10 diff steps.
+01 build classpath, 02 current-speed, 03 comparative, 07 stress,
+optional 08 endurance, then 09/10 diff steps.
 
 Current-speed diffs are profile-aware. The wrapper only compares reports
 from the same current-speed profile (`smoke` or `full`) and skips the
@@ -368,9 +368,6 @@ try {
                 -InputPaths $comparativeRuns | Out-Null
         }
 
-        Invoke-JavaMain -Name "04-core-engine" -Classpath $javaClasspath -MainClass "com.demcha.compose.GraphComposeBenchmark"
-        Invoke-JavaMain -Name "05-full-cv" -Classpath $javaClasspath -MainClass "com.demcha.compose.FullCvBenchmark"
-        Invoke-JavaMain -Name "06-scalability" -Classpath $javaClasspath -MainClass "com.demcha.compose.ScalabilityBenchmark"
         Invoke-JavaMain -Name "07-stress" -Classpath $javaClasspath -MainClass "com.demcha.compose.GraphComposeStressTest"
 
         if ($IncludeEndurance) {

From 019f64b32cd23aa44a0694cd43604e11d2c88818 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 19:26:31 +0100
Subject: [PATCH 02/36] perf(benchmarks): persist compose/layout/render stages
 + a run summary.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The stage breakdown (per-template compose / layout / render medians) was
printed to the console and discarded. Promote it into the report:
runStageBreakdown returns a StageRow, CurrentSpeedReport carries a stages[]
array, and a stages CSV is written — so a diff can attribute a regression to
an engine stage, not just the blended total. Also write a per-run summary.md
(latency + stages + throughput tables) so a reviewer reads one file instead
of the JSON plus several CSVs.

Additive output only: diff/verdict/median read the report by field and ignore
the new array. Benchmark module compiles; 28 tests pass; verified on a smoke
run (stages[] present, summary.md readable, perf gate passes).
---
 .../demcha/compose/BenchmarkReportWriter.java |   8 +
 .../demcha/compose/CurrentSpeedBenchmark.java | 144 +++++++++++++++---
 2 files changed, 131 insertions(+), 21 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
index 73e061d3d..51d2b2e42 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
@@ -60,6 +60,14 @@ Path writeCsv(String tableName, List<String> headers, List<List<String>> rows) t
             return archived;
         }
 
+        Path writeMarkdown(String name, String content) throws IOException {
+            Path latest = directory.resolve("latest-" + name + ".md");
+            Path archived = directory.resolve(name + "-" + timestamp + ".md");
+            Files.writeString(latest, content, StandardCharsets.UTF_8);
+            Files.writeString(archived, content, StandardCharsets.UTF_8);
+            return archived;
+        }
+
         Path directory() {
             return directory;
         }
diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index bbda30b8f..e3d877943 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -143,20 +143,21 @@ private void run() throws Exception {
 
         // Stage breakdown: for each template scenario we time compose / layout
         // / render separately so consumers can attribute regressions to the
-        // engine vs. PDFBox. Engine-simple and feature-rich scenarios also
-        // use the canonical pipeline and benefit from the same probe.
+        // engine vs. PDFBox. Only the template scenarios are probed here; the
+        // latency table above still covers every scenario.
+        List<StageRow> stageRows = new ArrayList<>();
         if (profile != BenchmarkProfile.SMOKE || config.measurementIterations() >= 20) {
             System.out.println();
             System.out.println("Stage breakdown (median ms per stage)");
             System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
                     "Scenario", "Compose", "Layout", "Render", "Total");
             System.out.println("-".repeat(78));
-            runStageBreakdown("invoice-template", () -> openInvoiceSession(),
-                    s -> invoiceTemplate.compose(s, invoice), config.measurementIterations());
-            runStageBreakdown("cv-template", () -> openCvSession(),
-                    s -> cvTemplate.compose(s, cv), config.measurementIterations());
-            runStageBreakdown("proposal-template", () -> openProposalSession(),
-                    s -> proposalTemplate.compose(s, proposal), config.measurementIterations());
+            stageRows.add(runStageBreakdown("invoice-template", () -> openInvoiceSession(),
+                    s -> invoiceTemplate.compose(s, invoice), config.measurementIterations()));
+            stageRows.add(runStageBreakdown("cv-template", () -> openCvSession(),
+                    s -> cvTemplate.compose(s, cv), config.measurementIterations()));
+            stageRows.add(runStageBreakdown("proposal-template", () -> openProposalSession(),
+                    s -> proposalTemplate.compose(s, proposal), config.measurementIterations()));
         }
 
         List<ThroughputRow> throughputRows = new ArrayList<>();
@@ -201,10 +202,13 @@ private void run() throws Exception {
                 config.docsPerThread(),
                 config.threadCounts(),
                 latencyRows,
+                stageRows,
                 throughputRows,
                 totalBenchmarkBytes);
         System.out.println("Saved JSON benchmark report to " + summary.jsonPath());
-        System.out.println("Saved CSV benchmark reports to " + summary.latencyCsvPath() + " and " + summary.throughputCsvPath());
+        System.out.println("Saved CSV benchmark reports to " + summary.latencyCsvPath() + ", "
+                + summary.stagesCsvPath() + ", and " + summary.throughputCsvPath());
+        System.out.println("Saved markdown summary to " + summary.summaryMarkdownPath());
 
         if (enforceGate) {
             PerformanceGateResult gateResult = evaluatePerformanceGate(profile, latencyRows);
@@ -363,10 +367,10 @@ private interface SessionComposer {
      * median-ms-per-stage row so callers can attribute regressions to
      * compose / layout / render independently.
      */
-    private void runStageBreakdown(String scenario,
-                                   SessionFactory factory,
-                                   SessionComposer composer,
-                                   int iterations) throws Exception {
+    private StageRow runStageBreakdown(String scenario,
+                                       SessionFactory factory,
+                                       SessionComposer composer,
+                                       int iterations) throws Exception {
         int warmup = Math.max(2, Math.min(20, iterations / 5));
         for (int i = 0; i < warmup; i++) {
             try (DocumentSession session = factory.open()) {
@@ -398,12 +402,13 @@ private void runStageBreakdown(String scenario,
                 throw new AssertionError();
             }
         }
+        double composeMs = medianMs(composeNs);
+        double layoutMs = medianMs(layoutNs);
+        double renderMs = medianMs(renderNs);
+        double totalMs = medianMs(totalNs);
         System.out.printf("%-18s | %12.3f | %12.3f | %12.3f | %12.3f%n",
-                scenario,
-                medianMs(composeNs),
-                medianMs(layoutNs),
-                medianMs(renderNs),
-                medianMs(totalNs));
+                scenario, composeMs, layoutMs, renderMs, totalMs);
+        return new StageRow(scenario, round(composeMs), round(layoutMs), round(renderMs), round(totalMs));
     }
 
     private static double medianMs(long[] arr) {
@@ -677,16 +682,19 @@ private PathSummary writeReports(BenchmarkReportWriter.BenchmarkArtifacts artifa
                                      int docsPerThread,
                                      int[] threadCounts,
                                      List<LatencyRow> latencyRows,
+                                     List<StageRow> stageRows,
                                      List<ThroughputRow> throughputRows,
                                      long totalBenchmarkBytes) throws Exception {
+        String timestamp = LocalDateTime.now().format(TIMESTAMP_FORMAT);
         CurrentSpeedReport report = new CurrentSpeedReport(
-                LocalDateTime.now().format(TIMESTAMP_FORMAT),
+                timestamp,
                 profileId,
                 warmupIterations,
                 measurementIterations,
                 docsPerThread,
                 Arrays.stream(threadCounts).boxed().toList(),
                 latencyRows,
+                stageRows,
                 throughputRows,
                 totalBenchmarkBytes);
 
@@ -717,8 +725,88 @@ private PathSummary writeReports(BenchmarkReportWriter.BenchmarkArtifacts artifa
                                 format(row.docsPerSecond()),
                                 format(row.avgMillisPerDoc())))
                         .toList());
+        var stagesCsvPath = artifacts.writeCsv(
+                "stages",
+                List.of("scenario", "compose_ms", "layout_ms", "render_ms", "total_ms"),
+                stageRows.stream()
+                        .map(row -> List.of(
+                                row.scenario(),
+                                format(row.composeMillis()),
+                                format(row.layoutMillis()),
+                                format(row.renderMillis()),
+                                format(row.totalMillis())))
+                        .toList());
+        var summaryMarkdownPath = artifacts.writeMarkdown(
+                "summary",
+                buildSummaryMarkdown(timestamp, profileId, latencyRows, stageRows,
+                        throughputRows, totalBenchmarkBytes));
+
+        return new PathSummary(jsonPath.toString(), latencyCsvPath.toString(),
+                stagesCsvPath.toString(), throughputCsvPath.toString(),
+                summaryMarkdownPath.toString());
+    }
+
+    /**
+     * Renders a single human-readable summary of the run — the latency table,
+     * the per-stage compose/layout/render split (the only place the suite
+     * attributes time to engine stages vs. PDFBox), and the throughput table
+     * when present — so a reviewer reads one file instead of stitching the JSON
+     * and several CSVs together.
+     */
+    private static String buildSummaryMarkdown(String timestamp,
+                                               String profileId,
+                                               List<LatencyRow> latencyRows,
+                                               List<StageRow> stageRows,
+                                               List<ThroughputRow> throughputRows,
+                                               long totalBenchmarkBytes) {
+        StringBuilder md = new StringBuilder();
+        md.append("# Current-speed benchmark — ").append(profileId).append(" profile\n\n");
+        md.append('`').append(timestamp).append("`\n\n");
+
+        md.append("## Latency (ms)\n\n");
+        md.append("| Scenario | Avg | p50 | p95 | Max | Docs/s | Avg KB | Peak MB |\n");
+        md.append("|---|---:|---:|---:|---:|---:|---:|---:|\n");
+        for (LatencyRow row : latencyRows) {
+            md.append("| ").append(row.scenario())
+                    .append(" | ").append(format(row.avgMillis()))
+                    .append(" | ").append(format(row.p50Millis()))
+                    .append(" | ").append(format(row.p95Millis()))
+                    .append(" | ").append(format(row.maxMillis()))
+                    .append(" | ").append(format(row.docsPerSecond()))
+                    .append(" | ").append(format(row.avgKilobytes()))
+                    .append(" | ").append(format(row.peakHeapMb()))
+                    .append(" |\n");
+        }
 
-        return new PathSummary(jsonPath.toString(), latencyCsvPath.toString(), throughputCsvPath.toString());
+        if (!stageRows.isEmpty()) {
+            md.append("\n## Stages — template scenarios (median ms — compose / layout / render)\n\n");
+            md.append("| Scenario | Compose | Layout | Render | Total |\n");
+            md.append("|---|---:|---:|---:|---:|\n");
+            for (StageRow row : stageRows) {
+                md.append("| ").append(row.scenario())
+                        .append(" | ").append(format(row.composeMillis()))
+                        .append(" | ").append(format(row.layoutMillis()))
+                        .append(" | ").append(format(row.renderMillis()))
+                        .append(" | ").append(format(row.totalMillis()))
+                        .append(" |\n");
+            }
+        }
+
+        if (!throughputRows.isEmpty()) {
+            md.append("\n## Throughput\n\n");
+            md.append("| Threads | Total docs | Docs/s | Avg doc ms |\n");
+            md.append("|---:|---:|---:|---:|\n");
+            for (ThroughputRow row : throughputRows) {
+                md.append("| ").append(row.threads())
+                        .append(" | ").append(row.totalDocs())
+                        .append(" | ").append(format(row.docsPerSecond()))
+                        .append(" | ").append(format(row.avgMillisPerDoc()))
+                        .append(" |\n");
+            }
+        }
+
+        md.append("\nByte guard: ").append(totalBenchmarkBytes).append('\n');
+        return md.toString();
     }
 
     private static double round(double value) {
@@ -772,6 +860,18 @@ private record ThroughputRow(String scenario,
                                  double avgMillisPerDoc) {
     }
 
+    /**
+     * Per-scenario compose / layout / render split (median ms). Persisted so a
+     * diff can attribute a regression to an engine stage rather than only the
+     * blended total — previously this was printed to the console and discarded.
+     */
+    private record StageRow(String scenario,
+                            double composeMillis,
+                            double layoutMillis,
+                            double renderMillis,
+                            double totalMillis) {
+    }
+
     private record CurrentSpeedReport(String timestamp,
                                       String profile,
                                       int warmupIterations,
@@ -779,11 +879,13 @@ private record CurrentSpeedReport(String timestamp,
                                       int docsPerThread,
                                       List<Integer> threadCounts,
                                       List<LatencyRow> latency,
+                                      List<StageRow> stages,
                                       List<ThroughputRow> throughput,
                                       long totalBytes) {
     }
 
-    private record PathSummary(String jsonPath, String latencyCsvPath, String throughputCsvPath) {
+    private record PathSummary(String jsonPath, String latencyCsvPath, String stagesCsvPath,
+                               String throughputCsvPath, String summaryMarkdownPath) {
     }
 
     private record BenchmarkConfig(int warmupIterations,

From 2d2785208a73d5fd4a3337cf63d72b4a869be487 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 19:37:13 +0100
Subject: [PATCH 03/36] perf(benchmarks): diff consumes stages[] and reports
 added/removed scenarios
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BenchmarkDiffTool now (1) surfaces scenario set changes — addedScenarios /
removedScenarios — instead of silently intersecting, so a newly-added (or
dropped) scenario can no longer vanish from a diff unnoticed; and (2) diffs
the stages[] array, emitting per-scenario compose/layout/render/total percent
deltas (console block + stages-diff CSV) so a regression can be attributed to
an engine stage.

Backward-compatible: a report without stages[] yields an empty stage diff
(MissingNode iterates empty); latency/throughput delta rows stay
intersection-only; the diff report is terminal (median/verdict read producer
reports, not diffs). Adds a DiffToolTest case; 29 bench tests pass.
---
 .../com/demcha/compose/BenchmarkDiffTool.java | 100 +++++++++++++++++-
 .../demcha/compose/BenchmarkDiffToolTest.java |  61 +++++++++++
 2 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
index 9b99d272f..0fb058bf8 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
@@ -93,6 +93,31 @@ private void diffCurrentSpeed(DiffInput input,
                     signedPercent(row.peakHeapMbDeltaPct()));
         }
 
+        if (!report.addedScenarios().isEmpty() || !report.removedScenarios().isEmpty()) {
+            System.out.println();
+            System.out.println("Scenario set changes");
+            System.out.println("  Added in candidate:    "
+                    + (report.addedScenarios().isEmpty() ? "(none)" : String.join(", ", report.addedScenarios())));
+            System.out.println("  Removed from baseline: "
+                    + (report.removedScenarios().isEmpty() ? "(none)" : String.join(", ", report.removedScenarios())));
+        }
+
+        if (!report.stages().isEmpty()) {
+            System.out.println();
+            System.out.println("Stage diff (pct delta per stage)");
+            System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
+                    "Scenario", "Compose pct", "Layout pct", "Render pct", "Total pct");
+            System.out.println("-".repeat(78));
+            for (StageDiff row : report.stages()) {
+                System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
+                        row.scenario(),
+                        signedPercent(row.composeDeltaPct()),
+                        signedPercent(row.layoutDeltaPct()),
+                        signedPercent(row.renderDeltaPct()),
+                        signedPercent(row.totalDeltaPct()));
+            }
+        }
+
         System.out.println();
         System.out.println("Throughput diff");
         System.out.printf("%-18s | %8s | %12s | %14s%n",
@@ -143,10 +168,29 @@ private void diffCurrentSpeed(DiffInput input,
                                 format(row.candidateAvgMillisPerDoc()),
                                 format(row.avgMillisPerDocDeltaPct())))
                         .toList());
+        Path stagesCsv = artifacts.writeCsv(
+                "stages-diff",
+                List.of("scenario", "baseline_compose_ms", "candidate_compose_ms", "compose_delta_pct", "baseline_layout_ms", "candidate_layout_ms", "layout_delta_pct", "baseline_render_ms", "candidate_render_ms", "render_delta_pct", "baseline_total_ms", "candidate_total_ms", "total_delta_pct"),
+                report.stages().stream()
+                        .map(row -> List.of(
+                                row.scenario(),
+                                format(row.baselineComposeMillis()),
+                                format(row.candidateComposeMillis()),
+                                format(row.composeDeltaPct()),
+                                format(row.baselineLayoutMillis()),
+                                format(row.candidateLayoutMillis()),
+                                format(row.layoutDeltaPct()),
+                                format(row.baselineRenderMillis()),
+                                format(row.candidateRenderMillis()),
+                                format(row.renderDeltaPct()),
+                                format(row.baselineTotalMillis()),
+                                format(row.candidateTotalMillis()),
+                                format(row.totalDeltaPct())))
+                        .toList());
 
         System.out.println();
         System.out.println("Saved JSON diff report to " + jsonPath);
-        System.out.println("Saved CSV diff reports to " + latencyCsv + " and " + throughputCsv);
+        System.out.println("Saved CSV diff reports to " + latencyCsv + ", " + throughputCsv + ", and " + stagesCsv);
     }
 
     private void diffComparative(DiffInput input,
@@ -214,6 +258,29 @@ private CurrentSpeedDiffReport buildCurrentSpeedDiff(DiffInput input, JsonNode b
                 })
                 .toList();
 
+        Map<String, JsonNode> baselineStages = indexBy(baseline.path("stages"), "scenario");
+        Map<String, JsonNode> candidateStages = indexBy(candidate.path("stages"), "scenario");
+        List<StageDiff> stageDiffs = intersectKeys(baselineStages, candidateStages).stream()
+                .map(key -> {
+                    JsonNode before = baselineStages.get(key);
+                    JsonNode after = candidateStages.get(key);
+                    return new StageDiff(
+                            key,
+                            before.path("composeMillis").asDouble(),
+                            after.path("composeMillis").asDouble(),
+                            percentDelta(before.path("composeMillis").asDouble(), after.path("composeMillis").asDouble()),
+                            before.path("layoutMillis").asDouble(),
+                            after.path("layoutMillis").asDouble(),
+                            percentDelta(before.path("layoutMillis").asDouble(), after.path("layoutMillis").asDouble()),
+                            before.path("renderMillis").asDouble(),
+                            after.path("renderMillis").asDouble(),
+                            percentDelta(before.path("renderMillis").asDouble(), after.path("renderMillis").asDouble()),
+                            before.path("totalMillis").asDouble(),
+                            after.path("totalMillis").asDouble(),
+                            percentDelta(before.path("totalMillis").asDouble(), after.path("totalMillis").asDouble()));
+                })
+                .toList();
+
         Map<String, JsonNode> baselineThroughput = indexThroughput(baseline.path("throughput"));
         Map<String, JsonNode> candidateThroughput = indexThroughput(candidate.path("throughput"));
         List<CurrentSpeedThroughputDiff> throughputDiffs = intersectKeys(baselineThroughput, candidateThroughput).stream()
@@ -237,7 +304,10 @@ private CurrentSpeedDiffReport buildCurrentSpeedDiff(DiffInput input, JsonNode b
                 input.candidatePath().toString(),
                 baseline.path("timestamp").asText(),
                 candidate.path("timestamp").asText(),
+                addedKeys(baselineLatency, candidateLatency),
+                removedKeys(baselineLatency, candidateLatency),
                 latencyDiffs,
+                stageDiffs,
                 throughputDiffs
         );
     }
@@ -294,6 +364,16 @@ private static List<String> intersectKeys(Map<String, JsonNode> left, Map<String
                 .toList();
     }
 
+    /** Keys present in {@code candidate} but not {@code baseline} (new scenarios). */
+    private static List<String> addedKeys(Map<String, JsonNode> baseline, Map<String, JsonNode> candidate) {
+        return candidate.keySet().stream().filter(key -> !baseline.containsKey(key)).sorted().toList();
+    }
+
+    /** Keys present in {@code baseline} but not {@code candidate} (dropped scenarios). */
+    private static List<String> removedKeys(Map<String, JsonNode> baseline, Map<String, JsonNode> candidate) {
+        return baseline.keySet().stream().filter(key -> !candidate.containsKey(key)).sorted().toList();
+    }
+
     private static Iterable<JsonNode> iterable(JsonNode array) {
         return () -> new Iterator<>() {
             private final Iterator<JsonNode> delegate = array.iterator();
@@ -477,11 +557,29 @@ private record CurrentSpeedThroughputDiff(String scenario,
                                               double avgMillisPerDocDeltaPct) {
     }
 
+    private record StageDiff(String scenario,
+                             double baselineComposeMillis,
+                             double candidateComposeMillis,
+                             double composeDeltaPct,
+                             double baselineLayoutMillis,
+                             double candidateLayoutMillis,
+                             double layoutDeltaPct,
+                             double baselineRenderMillis,
+                             double candidateRenderMillis,
+                             double renderDeltaPct,
+                             double baselineTotalMillis,
+                             double candidateTotalMillis,
+                             double totalDeltaPct) {
+    }
+
     private record CurrentSpeedDiffReport(String baselinePath,
                                           String candidatePath,
                                           String baselineTimestamp,
                                           String candidateTimestamp,
+                                          List<String> addedScenarios,
+                                          List<String> removedScenarios,
                                           List<CurrentSpeedLatencyDiff> latency,
+                                          List<StageDiff> stages,
                                           List<CurrentSpeedThroughputDiff> throughput) {
     }
 
diff --git a/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java b/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
index 783ad2479..d3319131c 100644
--- a/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
@@ -93,6 +93,35 @@ void currentSpeedDiffKeepsOnlyScenariosPresentInBothRuns() throws Exception {
         assertThat(diff.path("throughput").get(0).path("scenario").asText()).isEqualTo("shared");
     }
 
+    @Test
+    void currentSpeedDiffSurfacesAddedRemovedScenariosAndStageDeltas() throws Exception {
+        System.setProperty("graphcompose.benchmark.root", tempDir.toString());
+        Path baseline = write("baseline.json", currentSpeedWithStages("full",
+                latency("shared", 10.0, 10.0, 100.0, 1.0, 100.0) + ","
+                        + latency("only-in-baseline", 10.0, 10.0, 100.0, 1.0, 100.0),
+                stage("shared", 1.0, 2.0, 4.0, 7.0),
+                throughput("shared", 1, 50.0, 20.0)));
+        Path candidate = write("candidate.json", currentSpeedWithStages("full",
+                latency("shared", 10.0, 10.0, 100.0, 1.0, 100.0) + ","
+                        + latency("only-in-candidate", 5.0, 5.0, 200.0, 0.5, 90.0),
+                stage("shared", 1.0, 2.0, 8.0, 11.0),
+                throughput("shared", 1, 50.0, 20.0)));
+
+        BenchmarkDiffTool.main(new String[]{baseline.toString(), candidate.toString()});
+
+        JsonNode diff = readDiff("current-speed");
+        // Loud set-changes: one-sided scenarios are surfaced, not silently dropped.
+        assertThat(toStrings(diff.path("addedScenarios"))).containsExactly("only-in-candidate");
+        assertThat(toStrings(diff.path("removedScenarios"))).containsExactly("only-in-baseline");
+        // The shared scenario is still the only intersected latency delta row.
+        assertThat(diff.path("latency").size()).isEqualTo(1);
+        // Stage diff: render 4 -> 8 = +100%, compose unchanged.
+        JsonNode stageDiff = diff.path("stages").get(0);
+        assertThat(stageDiff.path("scenario").asText()).isEqualTo("shared");
+        assertThat(stageDiff.path("renderDeltaPct").asDouble()).isCloseTo(100.0, within(EPS));
+        assertThat(stageDiff.path("composeDeltaPct").asDouble()).isCloseTo(0.0, within(EPS));
+    }
+
     @Test
     void currentSpeedDiffTreatsZeroBaselineAsHundredPercentAndZeroToZeroAsZero() throws Exception {
         System.setProperty("graphcompose.benchmark.root", tempDir.toString());
@@ -228,6 +257,38 @@ private static String latency(String scenario,
                 """.formatted(scenario, scenario, avgMillis, p95Millis, docsPerSecond, avgKilobytes, peakHeapMb);
     }
 
+    private static String currentSpeedWithStages(String profile, String latencyItems,
+                                                 String stageItems, String throughputItems) {
+        return """
+                {
+                  "timestamp": "2026-04-14 21:00:00",
+                  "profile": "%s",
+                  "latency": [%s],
+                  "stages": [%s],
+                  "throughput": [%s]
+                }
+                """.formatted(profile, latencyItems, stageItems, throughputItems);
+    }
+
+    private static String stage(String scenario, double composeMs, double layoutMs,
+                                double renderMs, double totalMs) {
+        return """
+                {
+                  "scenario": "%s",
+                  "composeMillis": %s,
+                  "layoutMillis": %s,
+                  "renderMillis": %s,
+                  "totalMillis": %s
+                }
+                """.formatted(scenario, composeMs, layoutMs, renderMs, totalMs);
+    }
+
+    private static java.util.List<String> toStrings(JsonNode array) {
+        java.util.List<String> values = new java.util.ArrayList<>();
+        array.forEach(node -> values.add(node.asText()));
+        return values;
+    }
+
     private static String throughput(String scenario, int threads, double docsPerSecond, double avgMillisPerDoc) {
         return """
                 {

From faec9e3f23c02eb54e2fa5fa5d6ab9fc94d1ae9c Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 19:55:24 +0100
Subject: [PATCH 04/36] perf(benchmarks): add SVG-import feature benches (parse
 / read / node)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First feature-object benchmarks for the v1.8 vector surface (the rest of the
suite is text/table only):
- SvgJmhBenchmark (forked JMH): SvgPath.parse of a real Material heart d,
  SvgIcon.parse of a multi-layer icon, SvgIcon.node on a pre-parsed icon.
- SvgParseAllocProbe (deterministic ThreadMXBean alloc, median of 11): KB/op
  for the same three operations.
- SvgBenchmarkFixtures: the heart d (vendored — the benchmark module can't
  reach the test/example copies) and a synthetic multi-layer icon (gradient
  bg + transformed groups + stroked curves) within the reader's supported
  subset, so it always parses.

Run on demand, not per-PR: java -jar benchmarks/target/benchmarks.jar Svg.
Verified: compiles; both benches run — path parse ~3.6 us/op, icon read
~308 us/op (DOM-parse dominated, 114 KB/op), node build ~0.4 us/op / 2 KB/op.
---
 .../demcha/compose/SvgBenchmarkFixtures.java  | 55 +++++++++++
 .../demcha/compose/SvgParseAllocProbe.java    | 93 ++++++++++++++++++
 .../demcha/compose/jmh/SvgJmhBenchmark.java   | 97 +++++++++++++++++++
 3 files changed, 245 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
new file mode 100644
index 000000000..120741433
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
@@ -0,0 +1,55 @@
+package com.demcha.compose;
+
+/**
+ * Shared SVG fixtures for the v1.8 vector-import benchmarks (path parse, whole
+ * icon read, icon → node build).
+ *
+ * <p>Self-contained on purpose: the benchmarks module cannot reach the
+ * main-module test constants or the examples module, so the heart path is
+ * vendored here (it also lives in {@code SvgPathTest} / {@code VectorPathExample}
+ * in their own modules). The icon is a synthetic but realistic multi-layer
+ * document — a gradient-filled background, a {@code translate}+{@code scale}
+ * group of filled paths and a stroked circle, and a {@code rotate} group with a
+ * polygon and a quadratic-curve stroke — so it exercises XML parse, {@code <g>}
+ * transform accumulation, gradient resolution and per-layer path lowering the
+ * way a real exporter file would, while staying entirely within the reader's
+ * supported subset (so it never throws).</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class SvgBenchmarkFixtures {
+
+    /** Material "favorite" heart — the same {@code d} used in the SVG tests/examples. */
+    public static final String MATERIAL_HEART_D =
+            "M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 2 5.42 4.42 3 7.5 3"
+            + "c1.74 0 3.41.81 4.5 2.09C13.09 3.81 14.76 3 16.5 3 19.58 3 22 5.42 22 8.5"
+            + "c0 3.78-3.4 6.86-8.55 11.54L12 21.35z";
+
+    /** Heart viewBox edge (square 24×24), passed to {@code SvgPath.parse}. */
+    public static final double HEART_VIEWBOX = 24.0;
+
+    /** A realistic multi-layer icon: gradient bg + transformed groups + stroked curves. */
+    public static final String MULTI_LAYER_ICON_SVG = """
+            <svg viewBox="0 0 48 48" xmlns="http://www.w3.org/2000/svg">
+              <defs>
+                <linearGradient id="sky" x1="0" y1="0" x2="0" y2="48" gradientUnits="userSpaceOnUse">
+                  <stop offset="0" stop-color="#3b82f6"/>
+                  <stop offset="1" stop-color="#1e3a8a"/>
+                </linearGradient>
+              </defs>
+              <rect x="0" y="0" width="48" height="48" rx="6" fill="url(#sky)"/>
+              <g transform="translate(6 6) scale(1.1)">
+                <path d="M0 24 L12 4 L24 24 Z" fill="#fbbf24"/>
+                <path d="M6 24 L16 10 L26 24 Z" fill="#f59e0b"/>
+                <circle cx="20" cy="8" r="4" fill="#fde68a" stroke="#92400e" stroke-width="1.5"/>
+              </g>
+              <g transform="rotate(8 24 40)">
+                <polygon points="4,40 44,40 40,46 8,46" fill="#10b981"/>
+                <path d="M10 42 Q24 38 38 42" fill="none" stroke="#065f46" stroke-width="2"/>
+              </g>
+            </svg>
+            """;
+
+    private SvgBenchmarkFixtures() {
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java b/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
new file mode 100644
index 000000000..b8df62a2b
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
@@ -0,0 +1,93 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.svg.SvgIcon;
+import com.demcha.compose.document.svg.SvgPath;
+
+import java.lang.management.ManagementFactory;
+import java.util.Arrays;
+import java.util.function.Supplier;
+
+/**
+ * Deterministic allocation probe for the v1.8 SVG-import path: warm
+ * (JIT-steady) bytes allocated per {@link SvgPath#parse}, per
+ * {@link SvgIcon#parse}, and per {@link SvgIcon#node} — the three operations
+ * with no analogue in the rest of the suite (which is text / table only).
+ *
+ * <p>Allocation counts are noise-free (unlike wall-clock or {@code peakHeapMb}),
+ * so this is the signal the "optimize the engine, not benchmarks" rule wants:
+ * a develop-vs-branch A/B shows a parse/read/node allocation change directly.
+ * No {@code src/main} changes.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class SvgParseAllocProbe {
+
+    private static final com.sun.management.ThreadMXBean THREAD_MX =
+            (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+
+    private static final int WARMUP = 60;
+    private static final int MEASURE = 11;
+
+    /** Escape sink so the JIT cannot elide the measured allocations. */
+    private static long sink;
+
+    public static void main(String[] args) {
+        BenchmarkSupport.configureQuietLogging();
+        enableAllocationMeasurement();
+
+        SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+        double parseKb = measureAllocKb(() -> SvgPath.parse(
+                SvgBenchmarkFixtures.MATERIAL_HEART_D,
+                0, 0, SvgBenchmarkFixtures.HEART_VIEWBOX, SvgBenchmarkFixtures.HEART_VIEWBOX));
+        double readKb = measureAllocKb(() -> SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG));
+        double nodeKb = measureAllocKb(() -> icon.node(48.0));
+
+        System.out.println("GraphCompose SVG-import allocation probe (median of " + MEASURE + ")");
+        System.out.printf("  SvgPath.parse (heart d)     : %s%n", kb(parseKb));
+        System.out.printf("  SvgIcon.parse (multi-layer) : %s%n", kb(readKb));
+        System.out.printf("  SvgIcon.node(48)            : %s%n", kb(nodeKb));
+        System.out.println("alloc sink: " + sink);
+    }
+
+    private static double measureAllocKb(Supplier<Object> op) {
+        for (int i = 0; i < WARMUP; i++) {
+            sink += System.identityHashCode(op.get());
+        }
+        long[] alloc = new long[MEASURE];
+        for (int m = 0; m < MEASURE; m++) {
+            long before = currentThreadAllocatedBytes();
+            Object result = op.get();
+            long after = currentThreadAllocatedBytes();
+            sink += System.identityHashCode(result);
+            alloc[m] = before < 0 ? -1 : after - before;
+        }
+        Arrays.sort(alloc);
+        return alloc[MEASURE / 2] / 1024.0;
+    }
+
+    private static String kb(double value) {
+        return value < 0 ? "n/a (allocation measurement unsupported)" : "%.1f KB/op".formatted(value);
+    }
+
+    private static void enableAllocationMeasurement() {
+        try {
+            if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                THREAD_MX.setThreadAllocatedMemoryEnabled(true);
+            }
+        } catch (UnsupportedOperationException ignored) {
+            // Allocation measurement unsupported on this JVM; the probe reports n/a.
+        }
+    }
+
+    private static long currentThreadAllocatedBytes() {
+        try {
+            if (!THREAD_MX.isThreadAllocatedMemorySupported() || !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                return -1;
+            }
+        } catch (UnsupportedOperationException ex) {
+            return -1;
+        }
+        return THREAD_MX.getCurrentThreadAllocatedBytes();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
new file mode 100644
index 000000000..f7a63b30c
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
@@ -0,0 +1,97 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.SvgBenchmarkFixtures;
+import com.demcha.compose.document.svg.SvgIcon;
+import com.demcha.compose.document.svg.SvgPath;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark for the v1.8 SVG-import surface — the first
+ * feature-object benchmark (the rest of the suite renders text / tables only).
+ *
+ * <p>Three measured operations, all pure CPU + allocation (no
+ * {@code DocumentSession}, no PDF render):</p>
+ * <ul>
+ *   <li>{@code parseSvgPath} — {@link SvgPath#parse} of a real Material icon
+ *       {@code d} string (line + cubic commands, normalization; no arcs).</li>
+ *   <li>{@code readSvgIcon} — {@link SvgIcon#parse} of a multi-layer icon (XML
+ *       parse, {@code <g>} transform accumulation, gradient resolution, one
+ *       {@link SvgPath} per layer).</li>
+ *   <li>{@code svgIconToNode} — {@link SvgIcon#node} on a pre-parsed icon (the
+ *       {@code PathNode} / layer-stack allocation done once per placement).</li>
+ * </ul>
+ *
+ * <p>Microsecond-scale work, so it needs the forked, JIT-stable JMH harness
+ * (an {@code exec:java} run cannot fork). Build the runner jar and run:</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar Svg
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class SvgJmhBenchmark {
+
+    /** Parsed once so {@code svgIconToNode} measures only the node-build cost. */
+    private final SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+    /**
+     * Parses a real icon path-data string into normalized segments.
+     *
+     * @param blackhole JMH sink
+     */
+    @Benchmark
+    public void parseSvgPath(Blackhole blackhole) {
+        blackhole.consume(SvgPath.parse(
+                SvgBenchmarkFixtures.MATERIAL_HEART_D,
+                0, 0, SvgBenchmarkFixtures.HEART_VIEWBOX, SvgBenchmarkFixtures.HEART_VIEWBOX));
+    }
+
+    /**
+     * Reads a whole multi-layer SVG icon (XML parse → layers).
+     *
+     * @param blackhole JMH sink
+     */
+    @Benchmark
+    public void readSvgIcon(Blackhole blackhole) {
+        blackhole.consume(SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG));
+    }
+
+    /**
+     * Builds a placeable node (path nodes + layer stack) from a parsed icon.
+     *
+     * @param blackhole JMH sink
+     */
+    @Benchmark
+    public void svgIconToNode(Blackhole blackhole) {
+        blackhole.consume(icon.node(48.0));
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}

From ae025075d8e660b04e65e9438237873fd9aee026 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 20:00:55 +0100
Subject: [PATCH 05/36] perf(benchmarks): add chart feature benches (render +
 compile alloc)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

S4 of the modernization — the first chart benchmarks (the suite otherwise
renders text/tables only):
- ChartJmhBenchmark (forked JMH): end-to-end render of a chart-heavy doc —
  grouped bar + multi-series line (12 categories x 3 series) + 6-slice pie.
- ChartAllocProbe (deterministic ThreadMXBean, median of 11): warm
  layout-compile allocation, isolating chart-resolve + geometry emission.
- ChartBenchmarkFixtures: the shared bar/line/pie specs + data.

Run on demand, not per-PR: java -jar benchmarks/target/benchmarks.jar Chart.
Verified: compiles; render ~2.8 ms/op; compile alloc 446.8 KB (deterministic,
min=max=median, 1 page).
---
 .../com/demcha/compose/ChartAllocProbe.java   | 114 ++++++++++++++++++
 .../compose/ChartBenchmarkFixtures.java       |  91 ++++++++++++++
 .../demcha/compose/jmh/ChartJmhBenchmark.java |  79 ++++++++++++
 3 files changed, 284 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/ChartAllocProbe.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/ChartJmhBenchmark.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/ChartAllocProbe.java b/benchmarks/src/main/java/com/demcha/compose/ChartAllocProbe.java
new file mode 100644
index 000000000..2921bde80
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ChartAllocProbe.java
@@ -0,0 +1,114 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.backend.fixed.pdf.PdfMeasurementResources;
+import com.demcha.compose.document.layout.DocumentGraph;
+import com.demcha.compose.document.layout.DocumentLayoutPassContext;
+import com.demcha.compose.document.layout.LayoutCanvas;
+import com.demcha.compose.document.layout.LayoutCompiler;
+import com.demcha.compose.document.layout.LayoutGraph;
+import com.demcha.compose.document.layout.NodeRegistry;
+import com.demcha.compose.document.node.DocumentNode;
+
+import java.lang.management.ManagementFactory;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Deterministic allocation probe for the v1.8 chart subsystem: warm
+ * (JIT-steady) bytes allocated by the layout-compile pass of a chart-heavy
+ * document (a grouped bar, a multi-series line, and a pie). Charts are resolved
+ * into engine primitives during compile, so this isolates the chart-resolve +
+ * geometry-emission allocation — the noise-free signal a develop-vs-branch A/B
+ * needs. No {@code src/main} changes.
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ChartAllocProbe {
+
+    private static final com.sun.management.ThreadMXBean THREAD_MX =
+            (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+
+    private static final int WARMUP = 60;
+    private static final int MEASURE = 11;
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+        enableAllocationMeasurement();
+
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(24, 24, 24, 24)
+                .create()) {
+            session.pageFlow(flow -> flow
+                    .chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle())
+                    .chart(ChartBenchmarkFixtures.lineSpec(), ChartBenchmarkFixtures.lineStyle())
+                    .chart(ChartBenchmarkFixtures.pieSpec()));
+
+            List<DocumentNode> roots = session.roots();
+            LayoutCanvas canvas = session.canvas();
+            NodeRegistry registry = session.registry();
+
+            try (PdfMeasurementResources resources = PdfMeasurementResources.open(List.of())) {
+                LayoutCompiler compiler = new LayoutCompiler(registry);
+                DocumentGraph graph = new DocumentGraph(roots);
+
+                int pages = 0;
+                // Warm up so the measured allocation is JIT steady state, not
+                // class-load / first-call cold start.
+                for (int i = 0; i < WARMUP; i++) {
+                    pages = compile(compiler, graph, registry, canvas, resources).totalPages();
+                }
+
+                long[] alloc = new long[MEASURE];
+                for (int m = 0; m < MEASURE; m++) {
+                    long before = currentThreadAllocatedBytes();
+                    LayoutGraph layout = compile(compiler, graph, registry, canvas, resources);
+                    alloc[m] = before < 0 ? -1 : currentThreadAllocatedBytes() - before;
+                    pages = layout.totalPages();
+                }
+                Arrays.sort(alloc);
+
+                System.out.println("GraphCompose chart layout-compile allocation probe");
+                System.out.printf("document: grouped bar + line (12 cats x 3 series) + 6-slice pie, pages: %d%n", pages);
+                System.out.printf("warm compile allocation (median of %d): %s%n",
+                        MEASURE, kb(alloc[MEASURE / 2]));
+                System.out.printf("  min %s / max %s%n", kb(alloc[0]), kb(alloc[MEASURE - 1]));
+            }
+        }
+    }
+
+    private static LayoutGraph compile(LayoutCompiler compiler, DocumentGraph graph,
+                                       NodeRegistry registry, LayoutCanvas canvas,
+                                       PdfMeasurementResources resources) {
+        DocumentLayoutPassContext context = new DocumentLayoutPassContext(
+                registry, canvas, resources.fontLibrary(), resources.textMeasurementSystem(), false);
+        return compiler.compile(graph, context, context);
+    }
+
+    private static String kb(long bytes) { // a negative count is rendered as the "n/a" text
+        return bytes >= 0 ? String.format("%.1f KB", bytes / 1024.0) : "n/a (allocation measurement unsupported)";
+    }
+
+    private static void enableAllocationMeasurement() {
+        try {
+            if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                THREAD_MX.setThreadAllocatedMemoryEnabled(true);
+            }
+        } catch (UnsupportedOperationException ignored) {
+            // Allocation measurement unsupported on this JVM; the probe reports n/a.
+        }
+    }
+
+    private static long currentThreadAllocatedBytes() {
+        boolean measurable;
+        try {
+            measurable = THREAD_MX.isThreadAllocatedMemorySupported() && THREAD_MX.isThreadAllocatedMemoryEnabled();
+        } catch (UnsupportedOperationException unsupported) {
+            measurable = false;
+        }
+        // -1 is the "measurement unavailable" sentinel; the counter is only read when it is supported and enabled.
+        return measurable ? THREAD_MX.getCurrentThreadAllocatedBytes() : -1;
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
new file mode 100644
index 000000000..59aa1578b
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
@@ -0,0 +1,91 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.chart.AxisSpec;
+import com.demcha.compose.document.chart.ChartData;
+import com.demcha.compose.document.chart.ChartSize;
+import com.demcha.compose.document.chart.ChartSpec;
+import com.demcha.compose.document.chart.ChartStyle;
+import com.demcha.compose.document.chart.LegendPosition;
+import com.demcha.compose.document.chart.PointMarker;
+import com.demcha.compose.document.chart.SliceLabelMode;
+import com.demcha.compose.document.chart.ValueLabelMode;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.style.DocumentStroke;
+
+/**
+ * Shared fixtures for the v1.8 chart benchmarks: a non-trivial grouped bar and
+ * multi-series line (12 categories × 3 series) plus a 6-slice pie. Charts
+ * compile at layout time into ordinary shapes / lines / polygons / labels, so
+ * these stress {@code ChartLayoutResolver} + per-primitive geometry + label
+ * text-metrics — the cost no text/table bench exercises.
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ChartBenchmarkFixtures {
+
+    private ChartBenchmarkFixtures() {
+    }
+
+    /** 12 categories × 3 series — a representative grouped-bar / line workload. */
+    public static ChartData monthlySeries() {
+        return ChartData.builder()
+                .categories("Jan", "Feb", "Mar", "Apr", "May", "Jun",
+                        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
+                .series("2023", 12.4, 15.1, 9.8, 14.2, 16.0, 13.3, 17.1, 18.4, 15.9, 14.0, 19.2, 21.1)
+                .series("2024", 14.0, 18.2, 11.3, 16.9, 17.5, 15.0, 19.0, 20.2, 17.1, 16.4, 21.0, 23.5)
+                .series("2025", 15.5, 19.0, 12.0, 18.0, 19.1, 16.2, 20.5, 22.0, 18.9, 17.7, 22.8, 25.0)
+                .build();
+    }
+
+    /** 6-slice single-series data for the pie. */
+    public static ChartData regionShare() {
+        return ChartData.builder()
+                .categories("EMEA", "Americas", "APAC", "LATAM", "MEA", "Other")
+                .series("Share", 31.0, 27.0, 19.0, 10.0, 8.0, 5.0)
+                .build();
+    }
+
+    public static ChartSpec barSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .valueLabels(ValueLabelMode.OUTSIDE)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    public static ChartStyle barStyle() {
+        return ChartStyle.builder()
+                .seriesPaint(0, DocumentPaint.solid(DocumentColor.rgb(20, 80, 95)))
+                .seriesPaint(1, DocumentPaint.solid(DocumentColor.rgb(196, 153, 76)))
+                .seriesPaint(2, DocumentPaint.solid(DocumentColor.rgb(120, 60, 140)))
+                .build();
+    }
+
+    public static ChartSpec lineSpec() {
+        return ChartSpec.line()
+                .data(monthlySeries())
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    public static ChartStyle lineStyle() {
+        return ChartStyle.builder()
+                .lineWidth(1.8)
+                .pointMarker(PointMarker.circle(5.0)
+                        .withStroke(DocumentStroke.of(DocumentColor.WHITE, 1.0)))
+                .build();
+    }
+
+    public static ChartSpec pieSpec() {
+        return ChartSpec.pie()
+                .data(regionShare())
+                .sliceLabels(SliceLabelMode.CATEGORY_PERCENT)
+                .size(ChartSize.fixedHeight(190))
+                .build();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ChartJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartJmhBenchmark.java
new file mode 100644
index 000000000..760592853
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartJmhBenchmark.java
@@ -0,0 +1,79 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.ChartBenchmarkFixtures;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of a chart-heavy document — a
+ * grouped bar, a multi-series line (both 12 categories × 3 series) and a 6-slice
+ * pie — to PDF bytes. Charts compile into engine primitives at layout time, so
+ * this exercises {@code ChartLayoutResolver} + per-primitive geometry + label
+ * text-metrics on top of the normal compose / layout / render pipeline.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar Chart
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class ChartJmhBenchmark {
+
+    /**
+     * Builds the three-chart document and renders it to PDF bytes.
+     *
+     * @param blackhole JMH sink that consumes the rendered bytes
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderChartDocument(Blackhole blackhole) throws Exception {
+        blackhole.consume(renderDocument());
+    }
+
+    private static byte[] renderDocument() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            document.pageFlow()
+                    .name("ChartBenchmark")
+                    .spacing(12)
+                    .chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle())
+                    .chart(ChartBenchmarkFixtures.lineSpec(), ChartBenchmarkFixtures.lineStyle())
+                    .chart(ChartBenchmarkFixtures.pieSpec())
+                    .build();
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}

From 1747446cce56d31a307de251ba5e55719342a4fd Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 20:16:51 +0100
Subject: [PATCH 06/36] perf(benchmarks): add vector-paint render-operator
 probe (S5/S6)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VectorRenderOperatorProbe renders the same 40 curved blob paths three ways —
flat solid fill, linear gradient, and translucent (alpha) — and counts the PDF
content-stream operators, so the deltas isolate what each paint mode costs at
render time. Flat takes the fast fill path (sh=0, gs=0, W=0); a gradient fill
adds one shading + one clip per shape (sh, W); a translucent fill adds one
ExtGState (gs). Byte-deterministic, no A/B build needed; catches a regression
where a flat path wrongly takes the gradient branch (sh would jump from 0).

Verified: flat 0/0/0, gradient sh=40/W=40, alpha gs=40 over 40 paths.
---
 .../compose/VectorRenderOperatorProbe.java    | 102 ++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
new file mode 100644
index 000000000..8ea5652c2
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
@@ -0,0 +1,102 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.pdfparser.PDFStreamParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Deterministic content-stream operator probe for the v1.8 vector-paint render
+ * paths (S5/S6): the same {@code N} curved blob paths rendered three ways —
+ * flat solid fill, linear gradient, and translucent (alpha) fill — so the
+ * operator deltas isolate exactly what each paint mode costs at the PDF level.
+ *
+ * <p>A flat path takes the fast {@code fillAndStrokePath} route (just curve +
+ * fill operators). A gradient fill clips to the path and paints a shading
+ * ({@code q} / {@code W n} clip / {@code sh} / {@code Q} per shape); a
+ * translucent fill sets an ExtGState alpha ({@code gs}). Counting {@code sh} /
+ * {@code gs} / {@code W} against the flat baseline proves the per-shape cost
+ * structure and catches a regression where a flat path accidentally takes the
+ * heavier gradient branch. Byte-deterministic — no A/B build needed.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class VectorRenderOperatorProbe {
+
+    private static final int PATHS = 40;
+
+    private enum PaintMode { FLAT, GRADIENT, ALPHA }
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+
+        System.out.println("GraphCompose vector-paint render-operator probe (" + PATHS + " blob paths each)");
+        System.out.printf("%-10s | %6s | %6s | %6s | %6s%n", "Mode", "c", "sh", "gs", "W");
+        System.out.println("-".repeat(46));
+        for (PaintMode mode : PaintMode.values()) {
+            report(mode);
+        }
+        System.out.println();
+        System.out.println("c=cubic curve, sh=shading fill, gs=ExtGState (alpha), W=clip. "
+                + "Flat takes the fast path (no sh/gs/W); gradient adds sh+W per shape; alpha adds gs.");
+    }
+
+    // Renders PATHS blobs in the given paint mode to PDF bytes, then prints one table row of operator counts.
+    private static void report(PaintMode mode) throws Exception {
+        byte[] pdf;
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
+            session.pageFlow(flow -> authorBlobs(flow, mode));
+            pdf = session.toPdfBytes();
+        }
+        try (PDDocument document = Loader.loadPDF(pdf)) {
+            // Locale.ROOT keeps the label stable: a Turkish default locale lowercases 'I' to a dotless i.
+            String label = mode.name().toLowerCase(java.util.Locale.ROOT);
+            System.out.printf("%-10s | %6d | %6d | %6d | %6d%n", label,
+                    count(document, "c"), count(document, "sh"),
+                    count(document, "gs"), count(document, "W"));
+        }
+    }
+
+    private static void authorBlobs(PageFlowBuilder flow, PaintMode mode) {
+        DocumentPaint gradient = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        DocumentColor flat = DocumentColor.rgb(40, 90, 160);
+        DocumentColor translucent = DocumentColor.rgb(40, 90, 160).withOpacity(0.5);
+        for (int i = 0; i < PATHS; i++) {
+            flow.addPath(p -> {
+                p.size(60, 36)
+                        .moveTo(0.0, 0.5)
+                        .curveTo(0.25, 1.0, 0.75, 1.0, 1.0, 0.5)
+                        .curveTo(0.75, 0.0, 0.25, 0.0, 0.0, 0.5)
+                        .closePath();
+                switch (mode) {
+                    case FLAT -> p.fillColor(flat);
+                    case GRADIENT -> p.fill(gradient);
+                    case ALPHA -> p.fillColor(translucent);
+                }
+            });
+        }
+    }
+
+    private static int count(PDDocument document, String op) throws IOException {
+        int matches = 0;
+        for (var page : document.getPages()) {
+            // Walk every parsed content-stream token; only Operator tokens carry a name to compare.
+            List<Object> parsed = new PDFStreamParser(page).parse();
+            for (Object candidate : parsed) {
+                boolean hit = candidate instanceof Operator operator && op.equals(operator.getName());
+                matches += hit ? 1 : 0;
+            }
+        }
+        return matches;
+    }
+}

From c249e537f7bf676366c4efd11dfe5057198cfdd5 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 20:23:31 +0100
Subject: [PATCH 07/36] bench(jmh): add icon-ramp and mixed v1.8 showcase
 render benches

IconRampJmhBenchmark places N copies of a multi-layer SVG icon
(@Param 8/32/128) and renders to PDF, so the per-icon node-build +
layout + render scaling is visible; the icon is parsed once in setup
so the ramp measures placement, not re-parsing.

MixedShowcaseJmhBenchmark renders one realistic document mixing every
v1.8 vector feature -- prose with two inline sparklines, a grouped bar
chart and a pie chart, a row of SVG icons, and a gradient accent path
-- as a single integration canary for "did a v1.8 feature regress a
realistic doc?".

Both reuse the existing SvgBenchmarkFixtures / ChartBenchmarkFixtures;
no src/main change.
---
 .../compose/jmh/IconRampJmhBenchmark.java     | 82 ++++++++++++++++
 .../jmh/MixedShowcaseJmhBenchmark.java        | 95 +++++++++++++++++++
 2 files changed, 177 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/IconRampJmhBenchmark.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/MixedShowcaseJmhBenchmark.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/IconRampJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/IconRampJmhBenchmark.java
new file mode 100644
index 000000000..ec655616d
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/IconRampJmhBenchmark.java
@@ -0,0 +1,82 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.SvgBenchmarkFixtures;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.svg.SvgIcon;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: an "icon ramp" — place {@code N} copies of a
+ * multi-layer SVG icon (the realistic icon-grid / skills-ribbon workload) and
+ * render to PDF. Parameterized over N so the trend (node-build + layout +
+ * render per icon) is visible; the icon is parsed once in setup so the ramp
+ * measures placement scaling, not re-parsing.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar IconRamp
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class IconRampJmhBenchmark {
+
+    @Param({"8", "32", "128"})
+    public int iconCount;
+
+    /** Parsed once: the ramp measures node-build + layout + render scaling, not re-parsing. */
+    private final SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+    /**
+     * Places {@code iconCount} icons in a flow and renders the document.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderIconRamp(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(24))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("IconRamp").spacing(4);
+            for (int i = 0; i < iconCount; i++) {
+                flow.addSvgIcon(icon, 32);
+            }
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/MixedShowcaseJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/MixedShowcaseJmhBenchmark.java
new file mode 100644
index 000000000..ae139a705
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/MixedShowcaseJmhBenchmark.java
@@ -0,0 +1,95 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.ChartBenchmarkFixtures;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.SvgBenchmarkFixtures;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.svg.SvgIcon;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: a representative "v1.8 showcase" document that
+ * mixes every new vector feature in one render — running prose with two inline
+ * sparklines, a grouped bar chart and a pie chart, a row of SVG icons, and a
+ * gradient accent path. This is the integration canary: it answers "did adding
+ * any v1.8 feature blow up a realistic document?" in one number.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar MixedShowcase
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class MixedShowcaseJmhBenchmark {
+
+    private static final int ICONS = 8;
+
+    /** Parsed once; the bench measures the mixed render, not icon parsing. */
+    private final SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+    /**
+     * Renders the mixed v1.8 showcase document to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderMixedShowcase(Blackhole blackhole) throws Exception {
+        DocumentPaint accent = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(32))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("MixedShowcase").spacing(12);
+            flow.addParagraph("v1.8 feature showcase");
+            flow.addRich(r -> r
+                    .plain("Revenue ")
+                    .sparkline(42, 9, DocumentColor.rgb(20, 80, 95), 65.2, 69.8, 74.1, 81.3, 88.2)
+                    .plain("   profit ")
+                    .sparklineLine(42, 9, 1.6, DocumentColor.rgb(196, 153, 76), 28.1, 30.7, 32.9, 36.4, 39.5));
+            flow.chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle());
+            flow.chart(ChartBenchmarkFixtures.pieSpec());
+            for (int i = 0; i < ICONS; i++) {
+                flow.addSvgIcon(icon, 32);
+            }
+            flow.addPath(p -> p.size(220, 28)
+                    .moveTo(0.0, 0.5).curveTo(0.25, 1.0, 0.75, 0.0, 1.0, 0.5).fill(accent));
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}

From 2bdb59b36decd2eefec3e9f1b96810e107ffe701 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 23:02:12 +0100
Subject: [PATCH 08/36] bench(gate): gate the long-token scenario and guard
 scenario/threshold coverage

The smoke perf gate ignores any scenario without a configured threshold,
so long-token (the 6th latency scenario) was silently ungated -- a real
regression there would never fail the gate. Add its SMOKE threshold
(10.0 ms / 256.0 MB, ~3x the observed ~3.2 ms / ~94 MB, matching the
existing per-scenario calibration headroom).

Hoist the scenario list to a static SCENARIO_DEFS so the names are
readable without re-measuring, and add CurrentSpeedScenarioGateTest,
which fails the build if any scenario lacks a SMOKE threshold. No
behaviour change to the run itself -- same six scenarios, same order.
---
 .../demcha/compose/CurrentSpeedBenchmark.java | 57 +++++++++++++++----
 .../compose/CurrentSpeedScenarioGateTest.java | 35 ++++++++++++
 2 files changed, 82 insertions(+), 10 deletions(-)
 create mode 100644 benchmarks/src/test/java/com/demcha/compose/CurrentSpeedScenarioGateTest.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index e3d877943..64e113d20 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -32,6 +32,7 @@
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+import java.util.function.Function;
 
 /**
  * Focused local benchmark harness for current GraphCompose performance.
@@ -87,6 +88,36 @@ public final class CurrentSpeedBenchmark {
     private final ProposalDocumentSpec proposal = CanonicalBenchmarkSupport.canonicalProposal();
     private final CvSpec cv = CanonicalBenchmarkSupport.canonicalCv();
 
+    // Canonical scenario list, in table order. Declared statically (the
+    // renderer is bound to an instance at run time) so the gate-coverage guard
+    // test can read the scenario names without re-measuring: a scenario added
+    // here without a matching SMOKE threshold below would silently escape the
+    // perf gate, and CurrentSpeedScenarioGateTest fails loudly if that happens.
+    private static final List<ScenarioDef> SCENARIO_DEFS = List.of(
+            new ScenarioDef("engine-simple", "One-page engine composition",
+                    b -> b::renderEngineSimpleDocument),
+            new ScenarioDef("invoice-template", "Compose-first invoice template",
+                    b -> b::renderInvoiceTemplateDocument),
+            new ScenarioDef("cv-template", "Compose-first CV template",
+                    b -> b::renderCvTemplateDocument),
+            new ScenarioDef("proposal-template", "Long multi-page proposal template",
+                    b -> b::renderProposalTemplateDocument),
+            new ScenarioDef("feature-rich", "QR, barcode, watermark, header/footer, page break",
+                    b -> b::renderFeatureRichDocument),
+            new ScenarioDef("long-token", "Long unbreakable tokens (URLs/IDs) forcing character-level wrap",
+                    b -> b::renderLongTokenDocument)
+    );
+
+    /**
+     * Ordered scenario names. Read by {@code CurrentSpeedScenarioGateTest} to
+     * assert every scenario is covered by a SMOKE gate threshold.
+     *
+     * @return the canonical scenario names in table order
+     */
+    static List<String> scenarioNames() {
+        return SCENARIO_DEFS.stream().map(ScenarioDef::name).toList();
+    }
+
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
         new CurrentSpeedBenchmark().run();
@@ -109,14 +140,9 @@ private void run() throws Exception {
         System.out.println("Perf gate: " + (enforceGate ? "enabled" : "disabled"));
         System.out.println();
 
-        List<Scenario> scenarios = List.of(
-                new Scenario("engine-simple", "One-page engine composition", this::renderEngineSimpleDocument),
-                new Scenario("invoice-template", "Compose-first invoice template", this::renderInvoiceTemplateDocument),
-                new Scenario("cv-template", "Compose-first CV template", this::renderCvTemplateDocument),
-                new Scenario("proposal-template", "Long multi-page proposal template", this::renderProposalTemplateDocument),
-                new Scenario("feature-rich", "QR, barcode, watermark, header/footer, page break", this::renderFeatureRichDocument),
-                new Scenario("long-token", "Long unbreakable tokens (URLs/IDs) forcing character-level wrap", this::renderLongTokenDocument)
-        );
+        List<Scenario> scenarios = SCENARIO_DEFS.stream()
+                .map(def -> new Scenario(def.name(), def.description(), def.renderer().apply(this)))
+                .toList();
 
         System.out.println("Latency benchmark");
         System.out.printf("%-18s | %10s | %10s | %10s | %10s | %11s | %10s | %10s%n",
@@ -820,6 +846,13 @@ private static String format(double value) {
     private record Scenario(String name, String description, Renderer renderer) {
     }
 
+    // Static scenario template: name + description + a factory that, given a
+    // benchmark instance, returns the Renderer bound to it (run() applies it to
+    // `this`). Lets SCENARIO_DEFS be a static constant the gate-coverage test can
+    // read, while each render still runs against per-run instance state.
+    private record ScenarioDef(String name, String description, Function<CurrentSpeedBenchmark, Renderer> renderer) {
+    }
+
     @FunctionalInterface
     private interface Renderer {
         byte[] render() throws Exception;
@@ -909,12 +942,16 @@ enum BenchmarkProfile {
                 // (typically 1.5-2x slower) does not produce false positives
                 // while real regressions of 50% or more still trigger. The
                 // previous values (800-2600 ms) were 50-100x looser and would
-                // not have flagged even a 10x slowdown.
+                // not have flagged even a 10x slowdown. long-token (observed
+                // ~3.2 ms / ~94 MB) is gated too so every scenario in the
+                // latency table is covered — CurrentSpeedScenarioGateTest pins
+                // that invariant.
                 "engine-simple", new SmokeThreshold(8.0, 96.0),
                 "invoice-template", new SmokeThreshold(35.0, 384.0),
                 "cv-template", new SmokeThreshold(25.0, 192.0),
                 "proposal-template", new SmokeThreshold(45.0, 384.0),
-                "feature-rich", new SmokeThreshold(100.0, 256.0)
+                "feature-rich", new SmokeThreshold(100.0, 256.0),
+                "long-token", new SmokeThreshold(10.0, 256.0)
         ));
 
         private final String id;
diff --git a/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedScenarioGateTest.java b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedScenarioGateTest.java
new file mode 100644
index 000000000..da7296d45
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedScenarioGateTest.java
@@ -0,0 +1,35 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Build-time guard: each latency scenario declared by {@link CurrentSpeedBenchmark}
+ * must have a matching {@code SMOKE} perf-gate threshold.
+ *
+ * <p>By design the smoke perf gate skips any scenario that has no configured
+ * threshold (see
+ * {@link CurrentSpeedBenchmarkPerfGateTest#ignoresScenariosWithoutAConfiguredThreshold()}),
+ * so a scenario added to {@code SCENARIO_DEFS} without a threshold would go
+ * unmeasured without anyone noticing. This test turns that omission into a
+ * build failure by comparing the declared scenario names against the threshold
+ * keys.</p>
+ */
+class CurrentSpeedScenarioGateTest {
+
+    @Test
+    void everyScenarioHasASmokeGateThreshold() {
+        List<String> declared = CurrentSpeedBenchmark.scenarioNames();
+        var thresholdKeys = CurrentSpeedBenchmark.BenchmarkProfile.SMOKE.smokeThresholds().keySet();
+
+        // Keep only the scenarios the gate would silently skip.
+        List<String> missing = declared.stream().filter(name -> !thresholdKeys.contains(name)).toList();
+
+        assertThat(missing)
+                .as("CurrentSpeed scenarios missing a SMOKE gate threshold")
+                .isEmpty();
+    }
+}

From c2317f5cc14e316e5fa8f06bdab2db4f88b69b8e Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 23:15:13 +0100
Subject: [PATCH 09/36] docs(benchmarks): finish the removed-bench cleanup and
 fix two stale Javadocs

Sweep the references the three removed benchmark mains (FullCvBenchmark,
GraphComposeBenchmark, ScalabilityBenchmark) left behind, and correct two
docs that overstated what the code does:

- ab-bench.ps1 no longer parses the retired 04/05/06 logs (they are no
  longer produced); it reads the surviving stress log, and the
  thread-scaling series still comes from the current-speed JSON report.
- benchmarks/README.md "Files in this module": split a row that had been
  merged onto one line and restore the blank line before "## Running".
- docs/operations/performance.md: mark it a frozen v1.4 snapshot and note
  the retired suites/mains so it no longer contradicts benchmarks.md.
- docs/operations/benchmarks.md and the run-benchmarks.ps1 synopsis: note
  that steps 04-06 were retired, so the 03 -> 07 numbering gap is intentional.
- SvgJmhBenchmark Javadoc: describe the heart-path parse accurately
  (tokenize / cubic-line lowering / viewBox normalization); the fixture
  has no arc command, so the old "arc->cubic" wording was wrong.
- BenchmarkMedianTool Javadoc: note that stages[] is not carried into the
  median aggregate, so a median-vs-median diff shows no stage deltas.
---
 benchmarks/README.md                          |  4 +++-
 .../demcha/compose/BenchmarkMedianTool.java   |  5 +++++
 .../demcha/compose/jmh/SvgJmhBenchmark.java   |  3 ++-
 docs/operations/benchmarks.md                 |  4 ++++
 docs/operations/performance.md                | 14 ++++++++++++--
 scripts/ab-bench.ps1                          | 19 ++++---------------
 scripts/run-benchmarks.ps1                    |  4 +++-
 7 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index e232c6e21..48c953b20 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -62,11 +62,13 @@
 | File | Role |
 |---|---|
 | `CurrentSpeedBenchmark` | Default scenario runner — what CI's `perf-smoke` job exercises. Takes a `-Dgraphcompose.benchmark.profile=smoke\|full\|stress` switch. |
-| `ComparativeBenchmark` | Renders the same fixtures through GraphCompose, iText, openHTMLToPDF, JasperReports. **Rough local comparison only** — see "When not to use" above. || `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
+| `ComparativeBenchmark` | Renders the same fixtures through GraphCompose, iText, openHTMLToPDF, JasperReports. **Rough local comparison only** — see "When not to use" above. |
+| `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
 | `BenchmarkReportWriter` | Writes JSON / CSV / text reports under `benchmarks/target/benchmarks/`. |
 | `BenchmarkDiffTool` | Compares two JSON reports and prints a delta table. Useful for pre/post comparisons. |
 | `BenchmarkMedianTool` | Median + dispersion across N runs of the same scenario. |
 | `GraphComposeStressTest`, `EnduranceTest` | Long-running stress / endurance harnesses. |
+
 ## Running
 
 From the repo root:
diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
index 5eb786649..f82d0b6f8 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
@@ -24,6 +24,11 @@
  * possible, so it can be diffed by {@link BenchmarkDiffTool}. The tool is meant
  * for local benchmark sessions where a few repeated runs are needed to reduce
  * machine noise before comparing results.</p>
+ *
+ * <p>The current-speed per-stage breakdown ({@code stages[]}) is <em>not</em>
+ * carried into the median aggregate — only latency and throughput get a median.
+ * A median-vs-median diff therefore shows no compose/layout/render stage deltas;
+ * diff a single-run pair when you need stage attribution.</p>
  */
 public final class BenchmarkMedianTool {
 
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
index f7a63b30c..58ed3f99f 100644
--- a/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
@@ -24,7 +24,8 @@
  * {@code DocumentSession}, no PDF render):</p>
  * <ul>
  *   <li>{@code parseSvgPath} — {@link SvgPath#parse} of a real Material icon
- *       {@code d} string (arc→cubic conversion, normalization).</li>
+ *       {@code d} string (tokenize, relative/absolute resolution, cubic/line
+ *       lowering, viewBox normalization).</li>
  *   <li>{@code readSvgIcon} — {@link SvgIcon#parse} of a multi-layer icon (XML
  *       parse, {@code <g>} transform accumulation, gradient resolution, one
  *       {@link SvgPath} per layer).</li>
diff --git a/docs/operations/benchmarks.md b/docs/operations/benchmarks.md
index 775483384..3611d877e 100644
--- a/docs/operations/benchmarks.md
+++ b/docs/operations/benchmarks.md
@@ -40,6 +40,10 @@ The script prints numbered sections so you can map console output to the pipelin
    runs the thread-scaling throughput sweep (1 → 16 threads).
 3. `03-comparative`
    Runs the GraphCompose canonical vs iText 5 vs JasperReports comparison.
+
+   _Steps 04–06 (`core-engine`, `full-cv`, `scalability`) were retired. The
+   surviving steps keep their original `NN-` console prefixes, so the labels
+   jump from `03-` to `07-`._
 7. `07-stress`
    Runs the concurrent stability stress test.
 8. `08-endurance`
diff --git a/docs/operations/performance.md b/docs/operations/performance.md
index ecf02c5b7..7fc02d480 100644
--- a/docs/operations/performance.md
+++ b/docs/operations/performance.md
@@ -1,7 +1,13 @@
 # Performance — v1.4 numbers
 
-All numbers below come from `scripts/run-benchmarks.ps1` — the full local
-benchmark workflow that builds the test classpath once and runs
+> **Historical snapshot (v1.4).** The numbers and suite list below are frozen
+> as captured for v1.4 and are kept for reference. The pipeline has since
+> changed: the `core-engine`, `full-cv`, and `scalability` suites were retired,
+> and current numbers come from the `current-speed` / `comparative` / `stress`
+> pipeline plus the JMH suite. See [docs/operations/benchmarks.md](./benchmarks.md).
+
+All numbers below were captured from `scripts/run-benchmarks.ps1` — the full
+local benchmark workflow that built the test classpath once and ran
 `current-speed`, `comparative`, `core-engine`, `full-cv`, `scalability`,
 and `stress` suites in sequence. They were captured on a developer
 laptop; CI machines are typically 1.5–2× slower. The benchmark
@@ -93,5 +99,9 @@ snapshots.
 
 ## Engine-only timings
 
+_The `GraphComposeBenchmark` and `FullCvBenchmark` mains below were retired
+after v1.4. Equivalent timings now come from the `CurrentSpeedBenchmark`
+`engine-simple` scenario and the JMH `TemplateCvJmhBenchmark`._
+
 - `GraphComposeBenchmark` (engine-only, no PDF render): avg **1.04 ms**, p50 **0.97 ms**, p95 **1.64 ms**.
 - `FullCvBenchmark` (full CV template, including render): avg **4.14 ms**, p50 **3.80 ms**, p95 **6.37 ms**.
diff --git a/scripts/ab-bench.ps1 b/scripts/ab-bench.ps1
index 5a3e4eb42..a237ec203 100644
--- a/scripts/ab-bench.ps1
+++ b/scripts/ab-bench.ps1
@@ -110,21 +110,10 @@ function Parse-Comparative($jsonPath) {
 }
 function Parse-Logs($logsDir) {
     $o = @{}
-    $scal = Join-Path $logsDir "06-scalability.log"
-    if (Test-Path $scal) {
-        foreach ($line in (Get-Content $scal)) {
-            if ($line -match '^\s*(\d+)\s*\|\s*\d+\s*\|\s*([\d.]+)\s*$') {
-                $o["scalability | $($matches[1])t | docs/s"] = [double]$matches[2]
-            }
-        }
-    }
-    foreach ($pair in @(@("04-core-engine.log", "core-engine"), @("05-full-cv.log", "full-cv"))) {
-        $p = Join-Path $logsDir $pair[0]
-        if (Test-Path $p) {
-            $txt = Get-Content $p -Raw
-            if ($txt -match 'Median[^\r\n]*?:\s*([\d.]+)\s*ms') { $o["$($pair[1]) | median ms"] = [double]$matches[1] }
-        }
-    }
+    # Steps 04-06 (core-engine, full-cv, scalability) were retired, so their logs
+    # are no longer produced. Current-speed throughput — including the
+    # thread-scaling series — is read from the JSON report by Parse-CurrentSpeed;
+    # only the surviving stress log is parsed here.
     $stress = Join-Path $logsDir "07-stress.log"
     if (Test-Path $stress) {
         $txt = Get-Content $stress -Raw
diff --git a/scripts/run-benchmarks.ps1 b/scripts/run-benchmarks.ps1
index e3d3947b6..a0dd2c777 100644
--- a/scripts/run-benchmarks.ps1
+++ b/scripts/run-benchmarks.ps1
@@ -6,7 +6,9 @@ Runs the local GraphCompose benchmark pipeline and stores timestamped logs and r
 .DESCRIPTION
 The wrapper performs a staged local run:
 01 build classpath, 02 current-speed, 03 comparative, 07 stress,
-optional 08 endurance, then 09/10 diff steps.
+optional 08 endurance, then 09/10 diff and 11 verdict steps. Steps 04-06
+(core-engine, full-cv, scalability) were retired; the surviving steps keep
+their original numeric prefixes, so the numbering jumps from 03 to 07.
 
 Current-speed diffs are profile-aware. The wrapper only compares reports
 from the same current-speed profile (`smoke` or `full`) and skips the

From b93c44ec62ce1a386889302cec4383f3b3f31405 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 23:15:51 +0100
Subject: [PATCH 10/36] docs(changelog): note the v1.8 feature-object benches,
 stage output, and gate coverage

---
 CHANGELOG.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e9f7124c2..6cb0e7074 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -344,6 +344,28 @@ Entries land here as they merge.
   `ScalabilityBenchmark` (its thread-scaling sweep folded into
   `CurrentSpeedBenchmark`'s full-profile throughput run, now `1,2,4,8,16`).
   Dropped the matching `run-benchmarks.ps1` steps and doc entries.
+- **Feature-object benchmarks for the v1.8 vector surface (not shipped).**
+  The suite previously exercised only text/table primitives. Added JMH render
+  benches and deterministic probes over the new vector features:
+  `SvgJmhBenchmark` (path parse / whole-file icon read / icon→node) plus a
+  `SvgParseAllocProbe`; `ChartJmhBenchmark` (bar + line + pie render) plus a
+  `ChartAllocProbe` (layout-compile allocation); `VectorRenderOperatorProbe`
+  (the same paths drawn flat vs. gradient vs. translucent, counted as PDF
+  content-stream operators); `IconRampJmhBenchmark` (icon-placement scaling,
+  `@Param` 8/32/128); and `MixedShowcaseJmhBenchmark` (one document combining
+  prose, inline sparklines, bar + pie charts, SVG icons and a gradient path).
+  Shared `SvgBenchmarkFixtures` / `ChartBenchmarkFixtures` hold the inputs so
+  each bench and its probe measure identical data.
+- **Current-speed report carries a stage breakdown and a run summary (not
+  shipped).** `CurrentSpeedBenchmark` persists a per-scenario compose / layout /
+  render split (`stages[]`, median ms) to the JSON and a `stages` CSV, and
+  writes a readable `summary.md`. `BenchmarkDiffTool` consumes `stages[]`,
+  prints a per-stage delta table, and reports the scenarios added/removed
+  between two runs.
+- **Every current-speed scenario is now covered by the smoke perf gate (not
+  shipped).** The `long-token` scenario previously had no SMOKE threshold and
+  silently escaped the gate; it now has one, and `CurrentSpeedScenarioGateTest`
+  fails the build if any scenario lacks a threshold.
 - **Removed the `java.awt.*` / `java.util.*` co-wildcard in four files.**
   `InvoiceTemplateComposer`, `ProposalTemplateComposer`,
   `WeeklyScheduleTemplateComposer`, and the engine `PdfRenderingSystemECS`

From 7e74b555ff0015ac9a5fe750efc1d020c9ba7ac2 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Sun, 14 Jun 2026 23:35:01 +0100
Subject: [PATCH 11/36] fix(benchmarks): widen comparative-diff Library column
 so GraphCompose Canonical fits

The comparative-diff table printed the Library column as %-20s, but "GraphCompose Canonical" is 22 chars, so it overflowed the field and pushed the | separator right, misaligning that row. Widen to %-24s (matching the comparative run table in ComparativeBenchmark) and extend the rule to 56 so the column fits the longest library label.
---
 .../src/main/java/com/demcha/compose/BenchmarkDiffTool.java | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
index 0fb058bf8..ce99ce16e 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
@@ -200,11 +200,11 @@ private void diffComparative(DiffInput input,
         ComparativeDiffReport report = buildComparativeDiff(input, baseline, candidate);
 
         System.out.println("Comparative diff");
-        System.out.printf("%-20s | %12s | %12s%n",
+        System.out.printf("%-24s | %12s | %12s%n",
                 "Library", "Time pct", "Heap pct");
-        System.out.println("-".repeat(52));
+        System.out.println("-".repeat(56));
         for (ComparativeLibraryDiff row : report.libraries()) {
-            System.out.printf("%-20s | %12s | %12s%n",
+            System.out.printf("%-24s | %12s | %12s%n",
                     row.library(),
                     signedPercent(row.avgTimeDeltaPct()),
                     signedPercent(row.avgHeapDeltaPct()));

From 87ebe8400c2eada70f21b2223b5761f55089de7d Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 09:57:47 +0100
Subject: [PATCH 12/36] perf(benchmarks): add image embed/scale coverage + a
 PdfImageCache reuse gate

The suite had no image coverage at all: no bench or probe placed a raster image, so the embed/scale hot path and PdfImageCache dedup could regress unmeasured.

ImageBenchmarkFixtures builds deterministic in-code synthetic PNGs (a shared demoImage plus distinctImage(i)), so no binary asset is committed. ImageCacheOperatorProbe places one image N times vs N distinct images and counts embedded image XObjects + Do draws (same image x30 -> 1 embed/30 draws; 30 distinct -> 30/30). ImageCacheGateTest turns that reuse invariant into a build-failing assertion (1 embed for the same image regardless of placements; N for N distinct), so a dedup regression cannot pass silently. ImageJmhBenchmark renders a 12-image thumbnail document, driving the ImageIO decode + bicubic rescale + embed path that nothing else exercised.
---
 .../compose/ImageBenchmarkFixtures.java       |  90 +++++++++++++
 .../compose/ImageCacheOperatorProbe.java      | 119 ++++++++++++++++++
 .../demcha/compose/jmh/ImageJmhBenchmark.java |  92 ++++++++++++++
 .../demcha/compose/ImageCacheGateTest.java    |  49 ++++++++
 4 files changed, 350 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
 create mode 100644 benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
new file mode 100644
index 000000000..c9f95b739
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
@@ -0,0 +1,90 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.image.DocumentImageData;
+
+import javax.imageio.ImageIO;
+import java.awt.BasicStroke;
+import java.awt.Color;
+import java.awt.GradientPaint;
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+
+/**
+ * Deterministic synthetic raster fixtures for the image embed/scale benches and
+ * the {@code PdfImageCache} reuse gate.
+ *
+ * <p>The images are generated in code (a fixed gradient placeholder, a few KB
+ * each) so the suite needs no committed binary asset and the bytes — hence the
+ * cache fingerprint — are stable. {@link #demoImage()} returns the same logical
+ * image every call; {@link #distinctImage(int)} returns visually distinct images
+ * for indexes {@code 0..598}, to exercise the distinct-embed path.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ImageBenchmarkFixtures {
+
+    /** Native pixel width of every generated fixture. */
+    public static final int NATIVE_WIDTH_PX = 360;
+    /** Native pixel height of every generated fixture. */
+    public static final int NATIVE_HEIGHT_PX = 200;
+
+    /**
+     * Draw size (points) that keeps the original-embed path: at 144 DPI this is
+     * a {@code 360x200 px} target, i.e. &gt; 50% of native, so {@code PdfImageCache}
+     * does not build a downscaled variant and the embed count stays at one.
+     */
+    public static final double DRAW_WIDTH_PT = 180.0;
+    /** Companion draw height (points) for {@link #DRAW_WIDTH_PT}. */
+    public static final double DRAW_HEIGHT_PT = 100.0;
+
+    private ImageBenchmarkFixtures() {
+    }
+
+    /**
+     * One fixed gradient placeholder. Returns equal bytes every call, so all
+     * placements share a fingerprint and the cache treats them as one image.
+     *
+     * @return the shared demo image descriptor
+     */
+    public static DocumentImageData demoImage() {
+        return DocumentImageData.fromBytes(pngBytes(0));
+    }
+
+    /**
+     * The {@code index}-th of a family of visually distinct images; the pattern has
+     * period 600, so indexes {@code 0..598} differ from each other and the demo image.
+     *
+     * @param index variant index, non-negative (distinct fingerprints for {@code 0..598})
+     * @return a distinct image descriptor
+     */
+    public static DocumentImageData distinctImage(int index) {
+        return DocumentImageData.fromBytes(pngBytes(index + 1));
+    }
+
+    /** Paints the seed-dependent gradient and accent line, then encodes the image as PNG bytes. */
+    private static byte[] pngBytes(int seed) {
+        BufferedImage image = new BufferedImage(NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX, BufferedImage.TYPE_INT_RGB);
+        Graphics2D g = image.createGraphics();
+        try {
+            int r = 20 + (int) Math.floorMod(seed * 23L, 200L); // 20..219; long math cannot overflow
+            int b = 95 + (int) Math.floorMod(seed * 17L, 150L); // 95..244, valid for any seed
+            g.setPaint(new GradientPaint(0, 0, new Color(r, 45, 80),
+                    NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX, new Color(20, 80, b)));
+            g.fillRect(0, 0, NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX);
+            g.setPaint(new Color(196, 153, 76));
+            g.setStroke(new BasicStroke(6f));
+            g.drawLine(0, 170, NATIVE_WIDTH_PX, 110 - Math.floorMod(seed, 40));
+        } finally {
+            g.dispose();
+        }
+        try (ByteArrayOutputStream png = new ByteArrayOutputStream()) {
+            ImageIO.write(image, "png", png);
+            return png.toByteArray();
+        } catch (IOException e) {
+            throw new UncheckedIOException("Failed to encode synthetic benchmark image", e);
+        }
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
new file mode 100644
index 000000000..6e8d84847
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
@@ -0,0 +1,124 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.image.DocumentImageData;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.pdfparser.PDFStreamParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Consumer;
+
+/**
+ * Deterministic content-stream probe for the {@code PdfImageCache} dedup path:
+ * the same raster image is placed {@code N} times and counted against {@code N}
+ * distinct images, so the embed structure isolates exactly what the cache saves.
+ *
+ * <p>Placing one logical image {@code N} times must embed a single image XObject
+ * (referenced by {@code N} {@code Do} draws), while {@code N} distinct images must
+ * embed {@code N} XObjects. Counting the distinct image XObjects in the output PDF
+ * proves the cache reuses by fingerprint and catches a regression where embeds
+ * scale with placements (PDF bloat). Byte-deterministic — no A/B build needed.
+ * The image render/scale hot path is also entirely uncovered without this and the
+ * companion {@code ImageJmhBenchmark}.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ImageCacheOperatorProbe {
+
+    private static final int PLACEMENTS = 30;
+
+    /** Distinct image XObjects embedded in a PDF, and the number of {@code Do} draws. */
+    record EmbedCounts(int embeds, int draws) {
+    }
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+
+        System.out.println("GraphCompose image-cache embed probe (" + PLACEMENTS + " placements each)");
+        System.out.printf("%-22s | %8s | %8s%n", "Mode", "Embeds", "Draws");
+        System.out.println("-".repeat(44));
+        report("same image x N", countPdf(renderSameImage(PLACEMENTS)));
+        report("N distinct images", countPdf(renderDistinctImages(PLACEMENTS)));
+        System.out.println();
+        System.out.println("Embeds = distinct image XObjects in the PDF, Draws = Do operators. "
+                + "PdfImageCache must hold embeds at 1 for the same image regardless of placements; "
+                + "distinct images embed once each.");
+    }
+
+    private static void report(String mode, EmbedCounts counts) {
+        System.out.printf("%-22s | %8d | %8d%n", mode, counts.embeds(), counts.draws());
+    }
+
+    /** Renders {@code count} placements of one shared image (cache should embed it once). */
+    static byte[] renderSameImage(int count) throws Exception {
+        DocumentImageData image = ImageBenchmarkFixtures.demoImage();
+        return render(flow -> {
+            for (int i = 0; i < count; i++) {
+                flow.addImage(spec -> spec.source(image)
+                        .size(ImageBenchmarkFixtures.DRAW_WIDTH_PT, ImageBenchmarkFixtures.DRAW_HEIGHT_PT));
+            }
+        });
+    }
+
+    /** Renders {@code count} distinct images (cache embeds each once). */
+    static byte[] renderDistinctImages(int count) throws Exception {
+        return render(flow -> {
+            for (int i = 0; i < count; i++) {
+                DocumentImageData image = ImageBenchmarkFixtures.distinctImage(i);
+                flow.addImage(spec -> spec.source(image)
+                        .size(ImageBenchmarkFixtures.DRAW_WIDTH_PT, ImageBenchmarkFixtures.DRAW_HEIGHT_PT));
+            }
+        });
+    }
+
+    private static byte[] render(Consumer<PageFlowBuilder> author) throws Exception {
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
+            session.pageFlow(flow -> {
+                flow.name("ImageCacheProbe").spacing(8);
+                author.accept(flow);
+            });
+            return session.toPdfBytes();
+        }
+    }
+
+    /** Counts distinct embedded image XObjects (by COS identity) and {@code Do} draws. */
+    static EmbedCounts countPdf(byte[] pdf) throws IOException {
+        try (PDDocument document = Loader.loadPDF(pdf)) {
+            Set<COSBase> embeds = Collections.newSetFromMap(new IdentityHashMap<>());
+            int draws = 0;
+            for (PDPage page : document.getPages()) {
+                // PDFBox may return null when the page has no resource dictionary;
+                // treat that as "no XObjects" rather than dereferencing it.
+                var resources = page.getResources();
+                if (resources != null) {
+                    for (var name : resources.getXObjectNames()) {
+                        PDXObject xobject = resources.getXObject(name);
+                        if (xobject instanceof PDImageXObject image) {
+                            embeds.add(image.getCOSObject());
+                        }
+                    }
+                }
+                List<Object> tokens = new PDFStreamParser(page).parse();
+                for (Object token : tokens) {
+                    if (token instanceof Operator operator && "Do".equals(operator.getName())) {
+                        draws++;
+                    }
+                }
+            }
+            return new EmbedCounts(embeds.size(), draws);
+        }
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
new file mode 100644
index 000000000..2b05b1d09
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
@@ -0,0 +1,92 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.ImageBenchmarkFixtures;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.image.DocumentImageData;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.IntStream;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of an image-heavy document — a
+ * dozen distinct raster images placed at thumbnail size — to PDF bytes. Drawing
+ * below 50% of native resolution drives {@code PdfImageCache}'s downscale path
+ * ({@code ImageIO} decode + bicubic rescale + re-encode + embed), so this covers
+ * the raster embed/scale hot path that no other bench touches.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar Image
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class ImageJmhBenchmark {
+
+    private static final int IMAGES = 12;
+
+    /** Distinct images built once in setup; the bench measures render, not image synthesis. */
+    private List<DocumentImageData> images;
+
+    @Setup
+    public void setUp() {
+        images = IntStream.range(0, IMAGES)
+                .mapToObj(ImageBenchmarkFixtures::distinctImage)
+                .toList();
+    }
+
+    /**
+     * Renders the image-heavy document to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderImageDocument(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("ImageBenchmark").spacing(8);
+            for (DocumentImageData image : images) {
+                // 60x33 pt -> ~120x66 px target at 144 DPI, i.e. <50% of the
+                // 360x200 native, so the cache builds a downscaled variant.
+                flow.addImage(spec -> spec.source(image).size(60, 33));
+            }
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java b/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java
new file mode 100644
index 000000000..e28a2d9c9
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java
@@ -0,0 +1,49 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Deterministic regression gate for {@code PdfImageCache} dedup, driving
+ * {@link ImageCacheOperatorProbe}'s render + count helpers.
+ *
+ * <p>The cache keys embedded image XObjects by content fingerprint, so the same
+ * image placed many times must embed once (referenced by many draws) while
+ * distinct images embed once each. Counting the embedded XObjects in the output
+ * PDF makes that structural invariant a build-failing assertion — a regression
+ * that re-embeds the same image per placement (PDF bloat) breaks this test
+ * rather than silently passing CI.</p>
+ */
+class ImageCacheGateTest {
+
+    @Test
+    void sameImageEmbedsOnceRegardlessOfPlacements() throws Exception {
+        int placements = 30;
+
+        ImageCacheOperatorProbe.EmbedCounts counts =
+                ImageCacheOperatorProbe.countPdf(ImageCacheOperatorProbe.renderSameImage(placements));
+
+        assertThat(counts.embeds())
+                .as("the same image placed %d times must embed exactly one XObject", placements)
+                .isEqualTo(1);
+        assertThat(counts.draws())
+                .as("each placement must still draw the cached image")
+                .isGreaterThanOrEqualTo(placements);
+    }
+
+    @Test
+    void distinctImagesEachEmbedOnce() throws Exception {
+        int distinct = 8;
+
+        ImageCacheOperatorProbe.EmbedCounts counts =
+                ImageCacheOperatorProbe.countPdf(ImageCacheOperatorProbe.renderDistinctImages(distinct));
+
+        assertThat(counts.embeds())
+                .as("%d distinct images must embed %d XObjects (no over-dedup)", distinct, distinct)
+                .isEqualTo(distinct);
+        assertThat(counts.draws())
+                .as("each distinct image must be drawn")
+                .isGreaterThanOrEqualTo(distinct);
+    }
+}

From 14390d5372eaeafbbc74df578ac5699882e19844 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 10:01:54 +0100
Subject: [PATCH 13/36] perf(benchmarks): run the deterministic benchmark gates
 in CI + add a render-operator gate

The deterministic probes produce machine-independent counts, but nothing asserted on them and the benchmarks module's tests never ran in CI (perf-smoke used -DskipTests; the root verify skips the standalone module), so an operator-count or cache regression passed CI silently.

Add a 'Run deterministic benchmark gates' step to the PR-triggered perf-smoke job (./mvnw -f benchmarks/pom.xml test) so the image-cache reuse gate, the scenario/threshold coverage gate, and the diff-tooling tests now fail the build on a structural regression. Refactor RenderOperatorProbe to expose countOperators(...) and add RenderOperatorGateTest, which pins the F5 coalescing invariant: a long single-style paragraph keeps Tf/colour ops below the per-line text-draw count, so a regression back to per-span font ops breaks the test. Probe console output is unchanged.
---
 .github/workflows/ci.yml                      | 14 +++++++
 .../demcha/compose/RenderOperatorProbe.java   | 28 +++++++++++--
 .../compose/RenderOperatorGateTest.java       | 41 +++++++++++++++++++
 3 files changed, 79 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 31ce987b2..c2cf8a7d2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -208,6 +208,12 @@ jobs:
       - name: Compile benchmarks module
         run: ./mvnw -B -ntp -f benchmarks/pom.xml clean compile
 
+      - name: Run deterministic benchmark gates
+        # Fast, machine-independent unit/gate tests (image-cache reuse,
+        # render-operator coalescing, scenario/threshold coverage, diff tooling).
+        # Catches structural regressions the timing smoke run cannot.
+        run: ./mvnw -B -ntp -f benchmarks/pom.xml test
+
       - name: Run coarse performance smoke benchmark
         run: |
           ./mvnw -B -ntp -f benchmarks/pom.xml -DskipTests \
@@ -223,6 +229,14 @@ jobs:
           path: benchmarks/target/benchmarks/current-speed/**
           if-no-files-found: ignore
 
+      - name: Upload benchmark gate reports
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: benchmark-gate-reports-${{ github.run_id }}
+          path: benchmarks/target/surefire-reports/**
+          if-no-files-found: ignore
+
   benchmark-diff:
     name: Weekly Benchmark Diff
     if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
diff --git a/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
index 94cafb25e..016f4ea9e 100644
--- a/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
+++ b/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
@@ -70,6 +70,29 @@ public static void main(String[] args) throws Exception {
     }
 
     private static void report(String scenario, Consumer<com.demcha.compose.document.dsl.PageFlowBuilder> author) throws Exception {
+        OpCounts counts = countOperators(author);
+        int saved = Math.max(0, counts.draws() - counts.tf()) + Math.max(0, counts.draws() - counts.rg());
+        double reduction = counts.draws() == 0 ? 0
+                : 100.0 * (2.0 * counts.draws() - counts.tf() - counts.rg()) / (2.0 * counts.draws());
+        System.out.printf("%-22s | %8d | %8d | %8d | %12d | %8.1f%%%n",
+                scenario, counts.draws(), counts.tf(), counts.rg(), saved, reduction);
+    }
+
+    /** Text-show ({@code Tj}/{@code TJ}), {@code setFont} ({@code Tf}) and non-stroking-colour op counts. */
+    record OpCounts(int draws, int tf, int rg) {
+    }
+
+    /**
+     * Renders {@code author} and counts the text-show, font and colour operators.
+     * Exposed (package-visible) so {@code RenderOperatorGateTest} can pin the F5
+     * coalescing invariant: post-F5 the font/colour ops no longer scale 1:1 with
+     * text draws, so {@code tf} and {@code rg} stay below {@code draws}.
+     *
+     * @param author flow author
+     * @return the operator counts of the rendered document
+     * @throws Exception if rendering fails
+     */
+    static OpCounts countOperators(Consumer<com.demcha.compose.document.dsl.PageFlowBuilder> author) throws Exception {
         byte[] pdf;
         try (DocumentSession session = GraphCompose.document()
                 .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
@@ -80,10 +103,7 @@ private static void report(String scenario, Consumer<com.demcha.compose.document
             int draws = count(document, "Tj") + count(document, "TJ");
             int tf = count(document, "Tf");
             int rg = count(document, "rg") + count(document, "g") + count(document, "sc") + count(document, "scn");
-            int saved = Math.max(0, draws - tf) + Math.max(0, draws - rg);
-            double reduction = draws == 0 ? 0 : 100.0 * (2.0 * draws - tf - rg) / (2.0 * draws);
-            System.out.printf("%-22s | %8d | %8d | %8d | %12d | %8.1f%%%n",
-                    scenario, draws, tf, rg, saved, reduction);
+            return new OpCounts(draws, tf, rg);
         }
     }
 
diff --git a/benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java b/benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java
new file mode 100644
index 000000000..01807d4bf
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java
@@ -0,0 +1,41 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Deterministic regression gate for the F5 render-operator coalescing, driving
+ * {@link RenderOperatorProbe#countOperators}.
+ *
+ * <p>Before F5 the paragraph handler emitted one {@code setFont} (Tf) and one
+ * non-stroking-colour op per text-show, so font/colour ops scaled 1:1 with the
+ * per-line {@code Tj}/{@code TJ} draws. After F5 they are coalesced, so a single
+ * styled paragraph that wraps to many lines emits far fewer Tf/colour ops than
+ * draws. Asserting {@code tf < draws} and {@code rg < draws} pins that
+ * structural win as a build-failing check — a regression back to per-span font
+ * ops (bloated content streams) breaks this test instead of passing CI. The
+ * assertion is content-independent: it does not hardcode brittle exact counts.</p>
+ */
+class RenderOperatorGateTest {
+
+    private static final String LONG_PARAGRAPH =
+            ("GraphCompose lays out structured business documents across many pages "
+                    + "while keeping header and footer placement stable. ").repeat(30);
+
+    @Test
+    void fontAndColourOpsStayCoalescedBelowTextDraws() throws Exception {
+        RenderOperatorProbe.OpCounts counts =
+                RenderOperatorProbe.countOperators(flow -> flow.addParagraph(LONG_PARAGRAPH));
+
+        assertThat(counts.draws())
+                .as("a long paragraph must wrap to many text-show ops")
+                .isGreaterThanOrEqualTo(10);
+        assertThat(counts.tf())
+                .as("setFont ops must be coalesced below the per-line draw count (F5), not 1:1")
+                .isLessThan(counts.draws());
+        assertThat(counts.rg())
+                .as("non-stroking colour ops must be coalesced below the per-line draw count (F5)")
+                .isLessThan(counts.draws());
+    }
+}

From 8f8b47702005b04e5d157b505d72a34b78eee7f2 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 10:21:06 +0100
Subject: [PATCH 14/36] perf(benchmarks): add a single-shot cold-start render
 bench

Every JMH bench reported steady-state (warm) timings, which is what a long-lived server pays; nothing measured the JIT-cold first render a short-lived CLI invocation or a serverless cold-start actually pays.

ColdStartJmhBenchmark uses Mode.SingleShotTime with @Warmup(0)/@Measurement(1)/@Fork(10) to sample the cold first render across ten fresh JVMs, over the same workloads as the warm benches (an inline engine doc, InvoiceTemplateV1, the ModernProfessional CV preset). Specs and templates are built in @Setup so the measured shot is the cold render path, not fixture assembly. Observed cold first render ~370-510 ms/op locally, vs the warm ms-scale numbers -- the headline metric for CLI/Lambda consumers.
---
 .../compose/jmh/ColdStartJmhBenchmark.java    | 141 ++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java
new file mode 100644
index 000000000..a21e3ddbc
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java
@@ -0,0 +1,141 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.CanonicalBenchmarkSupport;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.templates.builtins.InvoiceTemplateV1;
+import com.demcha.compose.document.templates.cv.presets.ModernProfessional;
+import com.demcha.compose.document.templates.cv.spec.CvSpec;
+import com.demcha.compose.document.templates.data.invoice.InvoiceDocumentSpec;
+import com.demcha.compose.document.theme.BusinessTheme;
+import com.demcha.compose.document.templates.api.DocumentTemplate;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH <em>single-shot</em> benchmark: the JIT-cold cost of the first PDF
+ * render in a fresh JVM. Every other JMH bench in this module reports
+ * steady-state timings (measured after warmup), which is what a long-lived
+ * server pays — but a short-lived CLI invocation or a serverless (Lambda)
+ * cold-start pays the <em>first</em> render, with the layout and PDFBox classes
+ * unloaded and uncompiled. This bench measures exactly that.
+ *
+ * <p>{@code Mode.SingleShotTime} with {@code @Warmup(0)} and {@code @Measurement(1)}
+ * times a single invocation; {@code @Fork(10)} repeats it in ten fresh JVMs so the
+ * reported number is a distribution of cold first-renders, not one lucky start.
+ * The spec/template objects are built in {@link #setUp()} so the measured shot is
+ * the cold render path, not fixture assembly. Same workloads as the warm benches
+ * ({@code engine-simple} inline, {@code InvoiceTemplateV1}, {@code ModernProfessional})
+ * so cold and warm numbers are directly comparable.</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar ColdStart
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode(Mode.SingleShotTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 0)
+@Measurement(iterations = 1)
+@Fork(10)
+public class ColdStartJmhBenchmark {
+
+    private InvoiceDocumentSpec invoice;
+    private InvoiceTemplateV1 invoiceTemplate;
+    private CvSpec cv;
+    private DocumentTemplate<CvSpec> cvTemplate;
+
+    /** Builds the specs and templates once per fork, outside the measured cold shot. */
+    @Setup
+    public void setUp() {
+        invoice = CanonicalBenchmarkSupport.canonicalInvoice();
+        invoiceTemplate = new InvoiceTemplateV1();
+        cv = CanonicalBenchmarkSupport.canonicalCv();
+        cvTemplate = ModernProfessional.create(BusinessTheme.modern());
+    }
+
+    /**
+     * Cold first render of a small inline engine document.
+     *
+     * @return the rendered PDF bytes (consumed by JMH)
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public byte[] coldEngineSimple() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            document.pageFlow()
+                    .name("ColdEngineSimple")
+                    .spacing(10)
+                    .addParagraph("GraphCompose cold-start check")
+                    .addSection("Summary", section -> section
+                            .addParagraph("First render in a fresh JVM, layout and PDFBox classes cold."))
+                    .addSection("Body", section -> section
+                            .addParagraph("Structured business document composition.")
+                            .addParagraph("Semantic layout, pagination, deterministic output."))
+                    .build();
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Cold first render of the canonical invoice through {@code InvoiceTemplateV1}.
+     *
+     * @return the rendered PDF bytes (consumed by JMH)
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public byte[] coldInvoiceTemplate() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            invoiceTemplate.compose(document, invoice);
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Cold first render of the canonical CV through the {@code ModernProfessional} preset.
+     *
+     * @return the rendered PDF bytes (consumed by JMH)
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public byte[] coldCvTemplate() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            cvTemplate.compose(document, cv);
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}

From ebb31900fbdb2a047e53315f9e1dd02aebdf6b39 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 10:43:29 +0100
Subject: [PATCH 15/36] perf(benchmarks): add a multi-page report tier to the
 comparative benchmark

The comparative benchmark only rendered a trivial 3-line invoice -- too small to show GraphCompose's standing on real multi-page work (all three libraries finished in fixed overhead).

Add a 'business report' tier (title + 40-row line-item table + prose) rendered with equivalent content across all three: GraphCompose via the public pageFlow DSL with a repeating table header; iText via PdfPTable with setHeaderRows(1); JasperReports via a datasource-driven detail band with a repeating column header, the prose bound to a parameter and rendered through a stretch-height text field both before and after the table so all three lay out the same text. The small invoice stays as the fixed-overhead baseline; output prints two labelled scenario tables and the report carries a row per (library, scenario). README notes the Jasper fill-vs-build measurement boundary.

Local report numbers: GraphCompose 5.1ms/0.87MB, iText 2.8ms/4.97MB, JasperReports 9.3ms/2.51MB -- GraphCompose is mid on time but allocates ~5.7x less than iText.
---
 benchmarks/README.md                          |  15 +-
 .../demcha/compose/ComparativeBenchmark.java  | 220 +++++++++++++++++-
 2 files changed, 220 insertions(+), 15 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index f6041365c..8747f0d92 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -51,11 +51,16 @@
   layout-pass count) and reason about it; the harness is a sanity
   check after you've already chosen, not a decision tool before.
 - For **comparing GraphCompose to another PDF library** —
-  `ComparativeBenchmark` does render the same fixture through iText /
-  openHTMLToPDF / JasperReports for rough sizing, but the comparison
-  is a manual smoke test: each library has different defaults
-  (compression, font embedding, image resampling) and reading too much
-  into a single number is the wrong call.
+  `ComparativeBenchmark` does render equivalent content through iText /
+  JasperReports for rough sizing (two tiers: a tiny single-page invoice
+  for fixed overhead, and a multi-page report — title + 40-row table +
+  prose — for realistic work), but the comparison is a manual smoke test:
+  each library has different defaults (compression, font embedding, image
+  resampling) and reading too much into a single number is the wrong call.
+  Note one boundary asymmetry: the JasperReports figure measures fill +
+  PDF export with the design compiled once outside the loop, while the
+  GraphCompose and iText figures include per-iteration document
+  construction — so the Jasper number excludes work the other two pay.
 
 ## Files in this module
 
diff --git a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
index 76cd87c70..771505521 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
@@ -1,5 +1,6 @@
 package com.demcha.compose;
 
+import com.demcha.compose.document.api.DocumentPageSize;
 import com.demcha.compose.document.api.DocumentSession;
 import com.demcha.compose.document.node.ContainerNode;
 import com.demcha.compose.document.node.ParagraphNode;
@@ -11,15 +12,19 @@
 import com.itextpdf.text.pdf.PdfPTable;
 import com.itextpdf.text.pdf.PdfWriter;
 import net.sf.jasperreports.engine.*;
+import net.sf.jasperreports.engine.data.JRMapCollectionDataSource;
 import net.sf.jasperreports.engine.design.*;
+import net.sf.jasperreports.engine.type.TextAdjustEnum;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 
 import java.io.ByteArrayOutputStream;
 import java.time.LocalDateTime;
 import java.time.format.DateTimeFormatter;
 import java.lang.management.ManagementFactory;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 /**
  * Fair Comparative Benchmark (CPU & RAM)
@@ -32,8 +37,17 @@ public class ComparativeBenchmark {
     private static final int WARMUP_ITERATIONS = 50;
     private static final int MEASUREMENT_ITERATIONS = 100;
 
+    // Multi-page "report" scenario: a title, an N-row line-item table, and prose.
+    // Rendered with equivalent content across all three libraries so the numbers
+    // reflect real multi-page document work, not just per-render fixed overhead.
+    private static final int REPORT_ROWS = 40;
+    private static final String REPORT_PROSE =
+            ("GraphCompose lays out structured business documents across many pages "
+                    + "while keeping header and footer placement stable. ").repeat(6);
+
     // Предкомпилированный отчет для честного теста Jasper
     private static JasperReport compiledJasperReport;
+    private static JasperReport compiledJasperReportHeavy;
 
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
@@ -41,28 +55,38 @@ public static void main(String[] args) throws Exception {
         System.out.println("Timestamp: " + LocalDateTime.now().format(TIMESTAMP_FORMAT));
         System.out.println("------------------------------------------------------------");
 
-        // Подготавливаем Jasper 1 раз (как в Production)
+        // Compile both Jasper reports once (as in production)
         setupJasper();
+        setupJasperReport();
 
-        // Прогрев JVM (JIT компилятор)
+        // JVM warm-up (JIT compiler) — both scenarios
         System.out.println("Warming up JVM...");
         for (int i = 0; i < WARMUP_ITERATIONS; i++) {
             benchmarkGraphComposeCanonical();
             benchmarkIText();
             benchmarkJasper();
+            benchmarkGraphComposeReport();
+            benchmarkITextReport();
+            benchmarkJasperReport();
         }
 
-        // Замер
+        // Measurement — two scenarios: cheap (fixed overhead) and multi-page
         System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
+        List<ComparativeRow> rows = new ArrayList<>();
+
         System.out.println();
-        System.out.printf("%-24s | %14s | %14s%n", "Library", "Avg Time (ms)", "Avg Heap (MB)");
-        System.out.println("-".repeat(60));
+        System.out.println("Scenario: small invoice (single page, ~3 lines)");
+        printTableHeader();
+        rows.add(runBenchmark("GraphCompose Canonical", ComparativeBenchmark::benchmarkGraphComposeCanonical));
+        rows.add(runBenchmark("iText 5 (Old)", ComparativeBenchmark::benchmarkIText));
+        rows.add(runBenchmark("JasperReports", ComparativeBenchmark::benchmarkJasper));
 
-        List<ComparativeRow> rows = List.of(
-                runBenchmark("GraphCompose Canonical", ComparativeBenchmark::benchmarkGraphComposeCanonical),
-                runBenchmark("iText 5 (Old)", ComparativeBenchmark::benchmarkIText),
-                runBenchmark("JasperReports", ComparativeBenchmark::benchmarkJasper)
-        );
+        System.out.println();
+        System.out.println("Scenario: business report (multi-page: title + " + REPORT_ROWS + "-row table + prose)");
+        printTableHeader();
+        rows.add(runBenchmark("GraphCompose (report)", ComparativeBenchmark::benchmarkGraphComposeReport));
+        rows.add(runBenchmark("iText 5 (report)", ComparativeBenchmark::benchmarkITextReport));
+        rows.add(runBenchmark("JasperReports (report)", ComparativeBenchmark::benchmarkJasperReport));
 
         BenchmarkReportWriter.BenchmarkArtifacts artifacts = BenchmarkReportWriter.prepare("comparative");
         ComparativeReport report = new ComparativeReport(
@@ -85,6 +109,11 @@ public static void main(String[] args) throws Exception {
         System.out.println("Saved CSV benchmark report to " + csvPath);
     }
 
+    private static void printTableHeader() {
+        System.out.printf("%-24s | %14s | %14s%n", "Library", "Avg Time (ms)", "Avg Heap (MB)");
+        System.out.println("-".repeat(60));
+    }
+
     private static ComparativeRow runBenchmark(String name, BenchmarkTask task) throws Exception {
         long totalTimeNs = 0;
         long totalAllocatedBytes = 0;
@@ -145,6 +174,29 @@ private static byte[] benchmarkGraphComposeCanonical() throws Exception {
         }
     }
 
+    /**
+     * GraphCompose canonical, multi-page report: title + N-row table + prose,
+     * authored through the public page-flow DSL (the realistic consumer path).
+     */
+    private static byte[] benchmarkGraphComposeReport() throws Exception {
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4).margin(DocumentInsets.of(32)).create()) {
+            session.pageFlow(flow -> {
+                flow.name("Report").spacing(8);
+                flow.addParagraph("Quarterly Business Report");
+                flow.addParagraph(REPORT_PROSE);
+                flow.addTable(t -> {
+                    t.autoColumns(4).header("Item", "Qty", "Unit", "Total").repeatHeader();
+                    for (int r = 1; r <= REPORT_ROWS; r++) {
+                        t.row("Line item " + r, "3", "ea", "38.75");
+                    }
+                });
+                flow.addParagraph(REPORT_PROSE);
+            });
+            return session.toPdfBytes();
+        }
+    }
+
     /**
      * iText: Тестируем с таблицей, чтобы заставить его рассчитывать геометрию
      */
@@ -166,6 +218,36 @@ private static byte[] benchmarkIText() throws Exception {
         return baos.toByteArray();
     }
 
+    /**
+     * iText, multi-page report: same title + N-row table + prose. iText paginates
+     * the {@code PdfPTable} natively, so this exercises real multi-page layout.
+     */
+    private static byte[] benchmarkITextReport() throws Exception {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        Document document = new Document();
+        PdfWriter.getInstance(document, baos);
+        document.open();
+        document.add(new Paragraph("Quarterly Business Report"));
+        document.add(new Paragraph(REPORT_PROSE));
+
+        PdfPTable table = new PdfPTable(4);
+        table.setWidthPercentage(100);
+        table.setHeaderRows(1);
+        for (String header : new String[]{"Item", "Qty", "Unit", "Total"}) {
+            table.addCell(new Paragraph(header));
+        }
+        for (int r = 1; r <= REPORT_ROWS; r++) {
+            table.addCell(new Paragraph("Line item " + r));
+            table.addCell(new Paragraph("3"));
+            table.addCell(new Paragraph("ea"));
+            table.addCell(new Paragraph("38.75"));
+        }
+        document.add(table);
+        document.add(new Paragraph(REPORT_PROSE));
+        document.close();
+        return baos.toByteArray();
+    }
+
     /**
      * JasperReports: Тестируем ТОЛЬКО заполнение и экспорт (компиляция уже сделана)
      */
@@ -199,6 +281,124 @@ private static void setupJasper() throws Exception {
         compiledJasperReport = JasperCompileManager.compileReport(jd);
     }
 
+    /**
+     * JasperReports, multi-page report: a 4-field detail band filled from an
+     * {@code REPORT_ROWS}-row data source, with a title (+ prose) and column
+     * header. Compiled in {@code setupJasperReport()}, not here; the benchmark measures fill + PDF export.
+     */
+    private static byte[] benchmarkJasperReport() throws Exception {
+        List<Map<String, ?>> data = new ArrayList<>(REPORT_ROWS);
+        for (int r = 1; r <= REPORT_ROWS; r++) {
+            Map<String, Object> row = new HashMap<>();
+            row.put("item", "Line item " + r);
+            row.put("qty", "3");
+            row.put("unit", "ea");
+            row.put("total", "38.75");
+            data.add(row);
+        }
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put("prose", REPORT_PROSE);
+        JasperPrint jp = JasperFillManager.fillReport(
+                compiledJasperReportHeavy, parameters, new JRMapCollectionDataSource(data));
+        return JasperExportManager.exportReportToPdf(jp);
+    }
+
+    /** A full-width prose text field that wraps and grows, so all of {@code REPORT_PROSE} renders. */
+    private static JRDesignTextField proseField(int y) {
+        JRDesignTextField field = new JRDesignTextField();
+        field.setX(0);
+        field.setY(y);
+        field.setWidth(555);
+        field.setHeight(14);
+        field.setTextAdjust(TextAdjustEnum.STRETCH_HEIGHT);
+        JRDesignExpression expression = new JRDesignExpression();
+        expression.setText("$P{prose}");
+        field.setExpression(expression);
+        return field;
+    }
+
+    /** Compiles the multi-row Jasper report design once, before measurement. */
+    private static void setupJasperReport() throws Exception {
+        JasperDesign jd = new JasperDesign();
+        jd.setName("Report");
+        jd.setPageWidth(595);
+        jd.setPageHeight(842);
+        jd.setLeftMargin(20);
+        jd.setRightMargin(20);
+        jd.setTopMargin(20);
+        jd.setBottomMargin(20);
+        jd.setColumnWidth(555);
+
+        String[] fields = {"item", "qty", "unit", "total"};
+        for (String name : fields) {
+            JRDesignField field = new JRDesignField();
+            field.setName(name);
+            field.setValueClass(String.class);
+            jd.addField(field);
+        }
+
+        // Prose is a parameter rendered through a stretching text field, so all of
+        // REPORT_PROSE wraps and renders (a fixed static-text box would clip it),
+        // matching the full prose GraphCompose and iText lay out.
+        JRDesignParameter proseParameter = new JRDesignParameter();
+        proseParameter.setName("prose");
+        proseParameter.setValueClass(String.class);
+        jd.addParameter(proseParameter);
+
+        // Title band: heading + the first full prose block.
+        JRDesignBand title = new JRDesignBand();
+        title.setHeight(40);
+        JRDesignStaticText heading = new JRDesignStaticText();
+        heading.setX(0);
+        heading.setY(0);
+        heading.setWidth(555);
+        heading.setHeight(20);
+        heading.setText("Quarterly Business Report");
+        title.addElement(heading);
+        title.addElement(proseField(22));
+        jd.setTitle(title);
+
+        // Summary band: the second full prose block (the other two libs render
+        // prose both before and after the table).
+        JRDesignBand summary = new JRDesignBand();
+        summary.setHeight(16);
+        summary.addElement(proseField(0));
+        jd.setSummary(summary);
+
+        // Column header band.
+        String[] headers = {"Item", "Qty", "Unit", "Total"};
+        JRDesignBand columnHeader = new JRDesignBand();
+        columnHeader.setHeight(20);
+        for (int i = 0; i < headers.length; i++) {
+            JRDesignStaticText cell = new JRDesignStaticText();
+            cell.setX(i * 138);
+            cell.setY(0);
+            cell.setWidth(138);
+            cell.setHeight(18);
+            cell.setText(headers[i]);
+            columnHeader.addElement(cell);
+        }
+        jd.setColumnHeader(columnHeader);
+
+        // Detail band: one row per data-source record.
+        JRDesignBand detail = new JRDesignBand();
+        detail.setHeight(18);
+        for (int i = 0; i < fields.length; i++) {
+            JRDesignTextField cell = new JRDesignTextField();
+            cell.setX(i * 138);
+            cell.setY(0);
+            cell.setWidth(138);
+            cell.setHeight(16);
+            JRDesignExpression expression = new JRDesignExpression();
+            expression.setText("$F{" + fields[i] + "}");
+            cell.setExpression(expression);
+            detail.addElement(cell);
+        }
+        ((JRDesignSection) jd.getDetailSection()).addBand(detail);
+
+        compiledJasperReportHeavy = JasperCompileManager.compileReport(jd);
+    }
+
     @FunctionalInterface
     public interface BenchmarkTask {
         byte[] runAndGetBytes() throws Exception;

From a5c5cc213829d5d2e86ce762ea9d37df91328814 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 10:47:47 +0100
Subject: [PATCH 16/36] perf(benchmarks): add a production-scale large-table
 render bench

The suite rendered only small documents; nothing measured end-to-end render of a genuinely large multi-page table (TablePaginationAllocProbe covers layout-compile allocation only, not render). LargeTableJmhBenchmark renders a priced 5-column table parameterized over 100/500/1000 rows, with the header repeating on every page, so the large-table pagination + render scaling trend is visible. Observed ~9 / ~32 / ~77 ms/op locally. JMH full/on-demand, no CI gate.
---
 .../compose/jmh/LargeTableJmhBenchmark.java   | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java
new file mode 100644
index 000000000..1df72c467
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java
@@ -0,0 +1,86 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of a production-scale priced
+ * table that paginates across many pages, parameterized over row count so the
+ * scaling trend of large-table pagination + render is visible.
+ *
+ * <p>The rest of the suite renders small documents; nothing else measures how the
+ * engine handles a genuinely large multi-page table (the existing
+ * {@code TablePaginationAllocProbe} measures layout-compile allocation only, not
+ * end-to-end render). The header repeats on every page (the realistic report
+ * layout), so this exercises per-page header re-emission as well as row layout
+ * and slicing at scale.</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar LargeTable
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class LargeTableJmhBenchmark {
+
+    @Param({"100", "500", "1000"})
+    public int rows;
+
+    /**
+     * Renders the priced table document to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderLargeTable(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            document.pageFlow(flow -> {
+                flow.name("LargeTable").spacing(8);
+                flow.addParagraph("Priced line items");
+                flow.addTable(t -> {
+                    t.autoColumns(5).header("#", "Item", "Qty", "Unit", "Total").repeatHeader();
+                    for (int r = 1; r <= rows; r++) {
+                        t.row(String.valueOf(r), "Line item " + r, "3", "12.50", "37.50");
+                    }
+                });
+            });
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}

From 20bdf2f04f1047225063e1d138d650898106c39a Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 10:54:25 +0100
Subject: [PATCH 17/36] perf(benchmarks): add an allocation-rate / GC-pressure
 probe

The endurance and stress harnesses only check that sustained rendering stays stable and under a heap ceiling; nothing reported how much garbage a single render churns -- the driver of GC pressure for a high-throughput server.

AllocationRateProbe renders many warm documents of two realistic templates (invoice, proposal) and reports warm per-document allocation (ThreadMXBean current-thread bytes/doc, a deterministic A/B signal) plus the JVM garbage collections those renders triggered (count + time via GarbageCollectorMXBean, advisory). Observed ~3.9 MB/doc (invoice) and ~3.8 MB/doc (proposal), ~1 GC per ~18 renders. No src/main changes.
---
 .../demcha/compose/AllocationRateProbe.java   | 161 ++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/AllocationRateProbe.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/AllocationRateProbe.java b/benchmarks/src/main/java/com/demcha/compose/AllocationRateProbe.java
new file mode 100644
index 000000000..e81c2af92
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/AllocationRateProbe.java
@@ -0,0 +1,161 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.templates.builtins.InvoiceTemplateV1;
+import com.demcha.compose.document.templates.builtins.ProposalTemplateV1;
+import com.demcha.compose.document.templates.data.invoice.InvoiceDocumentSpec;
+import com.demcha.compose.document.templates.data.proposal.ProposalDocumentSpec;
+
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+
+/**
+ * Allocation-rate and GC-pressure probe over realistic templates. The endurance
+ * and stress harnesses only check that sustained rendering stays stable / under a
+ * heap ceiling; nothing reports how much garbage a single render churns, which is
+ * what drives GC pressure for a high-throughput server.
+ *
+ * <p>For each template it renders many warm documents and reports two things: the
+ * warm per-document allocation (ThreadMXBean current-thread bytes / doc — a
+ * deterministic figure ideal for an A/B), and the JVM garbage collections those
+ * renders triggered (count + time via {@code GarbageCollectorMXBean} — JVM-wide
+ * and GC-timing sensitive, so advisory). No {@code src/main} changes.</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml exec:java -Dexec.mainClass=com.demcha.compose.AllocationRateProbe
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class AllocationRateProbe {
+
+    private static final com.sun.management.ThreadMXBean THREAD_MX =
+            (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+
+    private static final int WARMUP = 60;
+    private static final int MEASURE = 300;
+
+    @FunctionalInterface
+    private interface Render {
+        byte[] run() throws Exception;
+    }
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+        enableAllocationMeasurement();
+
+        InvoiceDocumentSpec invoice = CanonicalBenchmarkSupport.canonicalInvoice();
+        InvoiceTemplateV1 invoiceTemplate = new InvoiceTemplateV1();
+        ProposalDocumentSpec proposal = CanonicalBenchmarkSupport.canonicalProposal();
+        ProposalTemplateV1 proposalTemplate = new ProposalTemplateV1();
+
+        System.out.println("GraphCompose allocation-rate / GC-pressure probe (" + MEASURE + " warm renders each)");
+        System.out.printf("%-12s | %14s | %10s | %12s | %12s%n",
+                "Template", "Alloc / doc", "GC count", "GC time ms", "Total alloc");
+        System.out.println("-".repeat(70));
+        report("invoice", () -> renderTemplate(s -> invoiceTemplate.compose(s, invoice)));
+        report("proposal", () -> renderTemplate(s -> proposalTemplate.compose(s, proposal)));
+        System.out.println();
+        System.out.println("Alloc/doc = warm ThreadMXBean bytes per render (deterministic A/B signal). "
+                + "GC count/time = JVM collections those renders triggered (advisory, GC-timing sensitive).");
+    }
+
+    private interface Compose {
+        void into(DocumentSession session);
+    }
+
+    private static byte[] renderTemplate(Compose compose) throws Exception {
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(22, 22, 22, 22)
+                .create()) {
+            compose.into(session);
+            return session.toPdfBytes();
+        }
+    }
+
+    private static void report(String name, Render render) throws Exception {
+        long dummy = 0;
+        for (int i = 0; i < WARMUP; i++) {
+            dummy += render.run().length;
+        }
+
+        System.gc();
+        Thread.sleep(50);
+
+        long gcCountStart = totalGcCount();
+        long gcTimeStart = totalGcTime();
+        long allocStart = currentThreadAllocatedBytes();
+
+        for (int i = 0; i < MEASURE; i++) {
+            dummy += render.run().length;
+        }
+
+        long alloc = allocStart < 0 ? -1 : currentThreadAllocatedBytes() - allocStart;
+        long gcCount = totalGcCount() - gcCountStart;
+        long gcTime = totalGcTime() - gcTimeStart;
+
+        System.out.printf("%-12s | %14s | %10d | %12d | %12s%n",
+                name,
+                alloc < 0 ? "n/a" : kb(alloc / (double) MEASURE),
+                gcCount,
+                gcTime,
+                alloc < 0 ? "n/a" : mb(alloc));
+
+        if (dummy == 0) {
+            System.out.println("Error: no bytes generated");
+        }
+    }
+
+    private static long totalGcCount() {
+        long total = 0;
+        for (GarbageCollectorMXBean bean : ManagementFactory.getGarbageCollectorMXBeans()) {
+            long count = bean.getCollectionCount();
+            if (count > 0) {
+                total += count;
+            }
+        }
+        return total;
+    }
+
+    private static long totalGcTime() {
+        long total = 0;
+        for (GarbageCollectorMXBean bean : ManagementFactory.getGarbageCollectorMXBeans()) {
+            long time = bean.getCollectionTime();
+            if (time > 0) {
+                total += time;
+            }
+        }
+        return total;
+    }
+
+    private static String kb(double bytes) {
+        return "%.1f KB".formatted(bytes / 1024.0);
+    }
+
+    private static String mb(long bytes) {
+        return "%.1f MB".formatted(bytes / (1024.0 * 1024.0));
+    }
+
+    private static void enableAllocationMeasurement() {
+        try {
+            if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                THREAD_MX.setThreadAllocatedMemoryEnabled(true);
+            }
+        } catch (UnsupportedOperationException ignored) {
+            // Allocation measurement unsupported on this JVM; the probe reports n/a.
+        }
+    }
+
+    private static long currentThreadAllocatedBytes() {
+        try {
+            if (!THREAD_MX.isThreadAllocatedMemorySupported() || !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                return -1;
+            }
+        } catch (UnsupportedOperationException ex) {
+            return -1;
+        }
+        return THREAD_MX.getCurrentThreadAllocatedBytes();
+    }
+}

From 3f79c273aaf1fe776ee7b07cf5dcf35189f1ae11 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 10:56:36 +0100
Subject: [PATCH 18/36] perf(benchmarks): dump a sample PDF per
 library/scenario after the comparative run

After all measurement, ComparativeBenchmark writes one rendered PDF per library and scenario (graphcompose/itext/jasper x small/report) under target/benchmarks/comparative/samples/, so the exact documents the benchmark measured can be opened and inspected visually -- you can see what each library actually rendered.

The dump runs outside the measured region (after the report is written), so it cannot affect the timing or allocation numbers.
---
 .../demcha/compose/ComparativeBenchmark.java  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
index 771505521..dd71714a5 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
@@ -18,6 +18,8 @@
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 
 import java.io.ByteArrayOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.time.LocalDateTime;
 import java.time.format.DateTimeFormatter;
 import java.lang.management.ManagementFactory;
@@ -107,6 +109,27 @@ public static void main(String[] args) throws Exception {
         System.out.println("-".repeat(60));
         System.out.println("Saved JSON benchmark report to " + jsonPath);
         System.out.println("Saved CSV benchmark report to " + csvPath);
+
+        // After all measurement, dump one rendered PDF per library/scenario so the
+        // exact documents that were benchmarked can be inspected visually. This runs
+        // outside the measured region, so it cannot affect the timing/allocation numbers.
+        Path samples = writeSampleRenders(artifacts.directory().resolve("samples"));
+        System.out.println("Saved sample renders (one PDF per library/scenario) to " + samples);
+    }
+
+    /**
+     * Renders each library/scenario once more and writes the bytes to PDF files,
+     * so a reader can open the actual documents the benchmark measured.
+     */
+    private static Path writeSampleRenders(Path directory) throws Exception {
+        Files.createDirectories(directory);
+        Files.write(directory.resolve("graphcompose-small.pdf"), benchmarkGraphComposeCanonical());
+        Files.write(directory.resolve("itext-small.pdf"), benchmarkIText());
+        Files.write(directory.resolve("jasper-small.pdf"), benchmarkJasper());
+        Files.write(directory.resolve("graphcompose-report.pdf"), benchmarkGraphComposeReport());
+        Files.write(directory.resolve("itext-report.pdf"), benchmarkITextReport());
+        Files.write(directory.resolve("jasper-report.pdf"), benchmarkJasperReport());
+        return directory;
     }
 
     private static void printTableHeader() {

From 5b394a6110d8d2297aae96d66ebf28de4e5cae1d Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 11:04:15 +0100
Subject: [PATCH 19/36] fix(benchmarks): render benchmark tables full-width to
 match real reports and rival libraries

Both the comparative report table and the large-table bench used autoColumns (content-width), so the GraphCompose table hugged its text while iText (setWidthPercentage 100) and JasperReports (full-column-width cells) filled the page. The comparative documents were therefore not layout-equivalent, and a content-width production-scale table is unrealistic.

Use equal fixed columns summing to the usable page width (page width minus the L/R margins), matching the rival libraries and real report layout. Comparative timing is unchanged (~4.3 ms / 0.87 MB for GraphCompose); the sample-dump PDFs now show equivalent full-width tables across all three libraries.
---
 .../com/demcha/compose/ComparativeBenchmark.java     | 12 +++++++++++-
 .../demcha/compose/jmh/LargeTableJmhBenchmark.java   | 12 +++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
index dd71714a5..d204da846 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
@@ -7,6 +7,7 @@
 import com.demcha.compose.document.node.TextAlign;
 import com.demcha.compose.document.style.DocumentInsets;
 import com.demcha.compose.document.style.DocumentTextStyle;
+import com.demcha.compose.document.table.DocumentTableColumn;
 import com.itextpdf.text.Document;
 import com.itextpdf.text.Paragraph;
 import com.itextpdf.text.pdf.PdfPTable;
@@ -202,6 +203,10 @@ private static byte[] benchmarkGraphComposeCanonical() throws Exception {
      * authored through the public page-flow DSL (the realistic consumer path).
      */
     private static byte[] benchmarkGraphComposeReport() throws Exception {
+        // Equal full-width columns (page width minus the 32pt L/R margins, split
+        // four ways), so the table fills the page like iText (setWidthPercentage
+        // 100) and Jasper (full-column-width cells) rather than hugging its text.
+        final double columnWidth = (DocumentPageSize.A4.width() - 2 * 32) / 4.0;
         try (DocumentSession session = GraphCompose.document()
                 .pageSize(DocumentPageSize.A4).margin(DocumentInsets.of(32)).create()) {
             session.pageFlow(flow -> {
@@ -209,7 +214,12 @@ private static byte[] benchmarkGraphComposeReport() throws Exception {
                 flow.addParagraph("Quarterly Business Report");
                 flow.addParagraph(REPORT_PROSE);
                 flow.addTable(t -> {
-                    t.autoColumns(4).header("Item", "Qty", "Unit", "Total").repeatHeader();
+                    t.columns(
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth))
+                            .header("Item", "Qty", "Unit", "Total").repeatHeader();
                     for (int r = 1; r <= REPORT_ROWS; r++) {
                         t.row("Line item " + r, "3", "ea", "38.75");
                     }
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java
index 1df72c467..82f45a20a 100644
--- a/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java
@@ -4,6 +4,7 @@
 import com.demcha.compose.document.api.DocumentPageSize;
 import com.demcha.compose.document.api.DocumentSession;
 import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.table.DocumentTableColumn;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -56,6 +57,9 @@ public class LargeTableJmhBenchmark {
      */
     @Benchmark
     public void renderLargeTable(Blackhole blackhole) throws Exception {
+        // Equal full-width columns (page width minus the 28pt L/R margins, split
+        // five ways) so the table fills the page like a real report, not its text.
+        final double columnWidth = (DocumentPageSize.A4.width() - 2 * 28) / 5.0;
         try (DocumentSession document = GraphCompose.document()
                 .pageSize(DocumentPageSize.A4)
                 .margin(DocumentInsets.of(28))
@@ -64,7 +68,13 @@ public void renderLargeTable(Blackhole blackhole) throws Exception {
                 flow.name("LargeTable").spacing(8);
                 flow.addParagraph("Priced line items");
                 flow.addTable(t -> {
-                    t.autoColumns(5).header("#", "Item", "Qty", "Unit", "Total").repeatHeader();
+                    t.columns(
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth))
+                            .header("#", "Item", "Qty", "Unit", "Total").repeatHeader();
                     for (int r = 1; r <= rows; r++) {
                         t.row(String.valueOf(r), "Line item " + r, "3", "12.50", "37.50");
                     }

From 9491994098415fbce9e71d5a559eb63f4a37e94f Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 11:37:34 +0100
Subject: [PATCH 20/36] fix(benchmarks): fill the Jasper comparative columns,
 ignore benchmarks/logs, fix a stale README row

The Jasper report cells spanned 552 of the 555pt column; the last cell now absorbs the remainder so the table fills the full column width like GraphCompose and iText.

Add benchmarks/logs/ to .gitignore: the benchmarks logback config writes a relative logs/ directory (now produced by the new benchmark-gate test step), and the root-anchored /logs/ rule did not cover it. README 'Files in this module': ComparativeBenchmark no longer renders through openHTMLToPDF -- describe the two tiers plus the sample dump.
---
 .gitignore                                                    | 1 +
 benchmarks/README.md                                          | 2 +-
 .../main/java/com/demcha/compose/ComparativeBenchmark.java    | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9951201a3..c9f7fc1ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@ build/
 ### Mac OS ###
 .DS_Store
 /logs/
+benchmarks/logs/
 /CV_Generated.pdf
 *.pdf
 # Allow PDF previews that are committed README assets.
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 8747f0d92..0c01c095c 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -67,7 +67,7 @@
 | File | Role |
 |---|---|
 | `CurrentSpeedBenchmark` | Default scenario runner — what CI's `perf-smoke` job exercises. Takes a `-Dgraphcompose.benchmark.profile=smoke\|full\|stress` switch. |
-| `ComparativeBenchmark` | Renders the same fixtures through GraphCompose, iText, openHTMLToPDF, JasperReports. **Rough local comparison only** — see "When not to use" above. |
+| `ComparativeBenchmark` | Renders equivalent content through GraphCompose, iText, JasperReports — two tiers (small invoice + multi-page report), and dumps a sample PDF per library/scenario. **Rough local comparison only** — see "When not to use" above. |
 | `FullCvBenchmark`, `ScalabilityBenchmark` | Fixture-specific runners for CV and table-heavy scenarios. |
 | `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
 | `BenchmarkReportWriter` | Writes JSON / CSV / text reports under `benchmarks/target/benchmarks/`. |
diff --git a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
index d204da846..9ea39235b 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
@@ -406,7 +406,7 @@ private static void setupJasperReport() throws Exception {
             JRDesignStaticText cell = new JRDesignStaticText();
             cell.setX(i * 138);
             cell.setY(0);
-            cell.setWidth(138);
+            cell.setWidth(i == headers.length - 1 ? 555 - i * 138 : 138);
             cell.setHeight(18);
             cell.setText(headers[i]);
             columnHeader.addElement(cell);
@@ -420,7 +420,7 @@ private static void setupJasperReport() throws Exception {
             JRDesignTextField cell = new JRDesignTextField();
             cell.setX(i * 138);
             cell.setY(0);
-            cell.setWidth(138);
+            cell.setWidth(i == fields.length - 1 ? 555 - i * 138 : 138);
             cell.setHeight(16);
             JRDesignExpression expression = new JRDesignExpression();
             expression.setText("$F{" + fields[i] + "}");

From 8e200de80033e057b3c7746eb1063a2e48063f42 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 11:44:23 +0100
Subject: [PATCH 21/36] docs(benchmarks): note why openHTMLtoPDF is excluded
 from the comparative

openHTMLtoPDF 1.0.10 (the declared version) targets PDFBox 2.x and fails at runtime against the PDFBox 3.x GraphCompose uses (PDType1Font.COURIER_BOLD_OBLIQUE and the other Standard-14 static fields were removed in PDFBox 3.x), so it cannot share GraphCompose's classpath and no PDFBox-3-compatible openhtmltopdf release exists yet. Document this in the comparative note so the exclusion is a known, reasoned decision rather than an oversight.
---
 benchmarks/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 0c01c095c..908cc61aa 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -61,6 +61,10 @@
   PDF export with the design compiled once outside the loop, while the
   GraphCompose and iText figures include per-iteration document
   construction — so the Jasper number excludes work the other two pay.
+  `openHTMLtoPDF` is intentionally absent: its current release (1.0.10)
+  targets PDFBox 2.x and fails at runtime against the PDFBox 3.x this
+  project uses (no PDFBox-3-compatible openhtmltopdf release exists yet),
+  so it cannot share GraphCompose's classpath.
 
 ## Files in this module
 

From a0c3e34d3109e0d19f8763f9846ed062f3209db1 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 11:53:12 +0100
Subject: [PATCH 22/36] perf(benchmarks): add an accented-Latin measurement
 scenario
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The measurement-count probe's text fixtures were all ASCII-Latin, so the distinct-width-request / repeat-rate counters never reflected a high-glyph-diversity non-ASCII workload.

Add an accented-Latin (Latin-1) scenario -- varied diacritic words (café/Genève/Größe/coração/fjörð...) covered by Standard-14 Helvetica -- alongside long-text/long-token/large-table. Observed 37 distinct width requests vs 32 for the repeated-ASCII long-text, and a lower repeat-rate. True CJK/Cyrillic would need an embedded font (noted in the fixture comment).
---
 .../demcha/compose/MeasurementCountBenchmark.java  | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java
index b4b585d53..926921ad3 100644
--- a/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java
@@ -70,6 +70,16 @@ public final class MeasurementCountBenchmark {
             "Prefix text before an unbreakable token " + "x".repeat(600)
                     + " and several trailing words that must still wrap onto the following lines here.";
 
+    // High-glyph-diversity accented-Latin (Latin-1) passage: many distinct
+    // diacritic glyphs and varied words, unlike the single repeated ASCII
+    // sentence above, so distinctWidthRequests / repeat-rate reflect a non-ASCII,
+    // high-diversity workload. Standard-14 Helvetica covers Latin-1; true
+    // CJK / Cyrillic would need an embedded font and is out of scope here.
+    private static final String ACCENTED_LATIN_PARAGRAPH =
+            ("Le café à Genève - résumé naïve, façon piñata. Über die Größe schön: "
+                    + "coração São, mañana señor. Déjà brûlée crème, fjörð Århus Tromsø "
+                    + "Köln Zürich Besançon, garçon élève hôtel. ").repeat(40);
+
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
         new MeasurementCountBenchmark().run();
@@ -87,6 +97,8 @@ private void run() throws Exception {
                 flow.addParagraph(p -> p.text(LONG_PARAGRAPH).textStyle(BODY_STYLE));
         Consumer<PageFlowBuilder> longToken = flow ->
                 flow.addParagraph(p -> p.text(LONG_TOKEN_PARAGRAPH).textStyle(BODY_STYLE));
+        Consumer<PageFlowBuilder> accentedText = flow ->
+                flow.addParagraph(p -> p.text(ACCENTED_LATIN_PARAGRAPH).textStyle(BODY_STYLE));
         Consumer<PageFlowBuilder> largeTable = MeasurementCountBenchmark::authorLargeTable;
 
         // Warm up the JVM (class loading + JIT) BEFORE the allocation window so the
@@ -98,12 +110,14 @@ private void run() throws Exception {
         for (int warmup = 0; warmup < 5; warmup++) {
             measureScenario("warmup", longText);
             measureScenario("warmup", longToken);
+            measureScenario("warmup", accentedText);
             measureScenario("warmup", largeTable);
         }
 
         List<Result> results = new ArrayList<>();
         results.add(measureScenario("long-text", longText));
         results.add(measureScenario("long-token", longToken));
+        results.add(measureScenario("accented-latin", accentedText));
         results.add(measureScenario("large-table", largeTable));
 
         System.out.printf("%-14s | %11s | %9s | %9s | %11s | %8s | %11s | %10s | %6s%n",

From 22008b88207df8b8f9bbe6440d27d5bce1864892 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 12:00:15 +0100
Subject: [PATCH 23/36] docs(benchmarks): document the per-PR gate scope vs
 on-demand benches

Record what gates every PR (the perf-smoke smoke run with absolute thresholds + the deterministic benchmark gate tests) and what is intentionally on-demand/local only: the JMH benches (a per-PR forked run of the whole suite is too slow for the signal) and the relative BenchmarkVerdictTool gate (no static smoke baseline is committed because absolute timings are machine-specific and would false-positive across machines; use a local same-machine A/B median instead). Makes the gate scope a stated design decision.
---
 benchmarks/README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 908cc61aa..7c39df47a 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -66,6 +66,27 @@
   project uses (no PDFBox-3-compatible openhtmltopdf release exists yet),
   so it cannot share GraphCompose's classpath.
 
+## What runs on a PR — and what is on-demand (by design)
+
+The per-PR CI gate is deliberately light and deterministic:
+
+- **`perf-smoke` job** — `CurrentSpeedBenchmark` in the `smoke` profile with
+  absolute latency / heap thresholds (a gross-regression tripwire), plus the
+  module's deterministic gate tests (`mvnw -f benchmarks/pom.xml test`:
+  image-cache reuse, render-operator coalescing, scenario/threshold coverage).
+
+These are intentionally **not** on the per-PR path:
+
+- **The JMH benches** (`*JmhBenchmark`) are full / on-demand only. A forked,
+  warmed JMH run of the whole suite takes minutes; running it per PR is too
+  expensive for the signal. Run them by hand (or on a schedule) before a release
+  and quote those numbers for rigorous claims.
+- **The relative `BenchmarkVerdictTool` gate** (±% vs a committed baseline) runs
+  locally only, and no static `smoke` baseline is committed: absolute timings are
+  machine-specific, so a baseline captured on one machine would false-positive on
+  another. Use a local same-machine A/B (a `-Repeat` median before/after) for
+  relative comparison; the absolute smoke thresholds are the CI safety net.
+
 ## Files in this module
 
 | File | Role |

From 1bf018f4a4fb105693608f7de2a53bbd10e3a9e8 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 12:47:18 +0100
Subject: [PATCH 24/36] perf(benchmarks): gate the v1.8 vector-paint operator
 structure

The deterministic v1.8 vector-paint probe (VectorRenderOperatorProbe) printed its operator counts but nothing asserted on them, so a regression in the gradient/alpha render branches would pass CI silently.

Refactor it to expose countOperators(PaintMode) and add VectorRenderOperatorGateTest pinning the per-mode cost structure: a flat fill emits no shading/alpha/clip (the fast path), a linear gradient emits one shading + one clip per shape, and a translucent fill sets one ExtGState per shape. The perf-smoke CI gate step (mvnw -f benchmarks/pom.xml test) now picks it up, extending the deterministic gate to the v1.8 vector render path. Probe console output is unchanged.
---
 .../compose/VectorRenderOperatorProbe.java    | 33 +++++++++--
 .../compose/VectorRenderOperatorGateTest.java | 56 +++++++++++++++++++
 2 files changed, 85 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
index 8ea5652c2..43988a666 100644
--- a/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
+++ b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
@@ -31,9 +31,13 @@
  */
 public final class VectorRenderOperatorProbe {
 
-    private static final int PATHS = 40;
+    static final int PATHS = 40;
 
-    private enum PaintMode { FLAT, GRADIENT, ALPHA }
+    enum PaintMode { FLAT, GRADIENT, ALPHA }
+
+    /** PDF operator counts for one paint mode: cubic curves, shadings, ExtGState sets, clips. */
+    record OperatorCounts(int curves, int shadings, int extGStates, int clips) {
+    }
 
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
@@ -50,6 +54,28 @@ public static void main(String[] args) throws Exception {
     }
 
     private static void report(PaintMode mode) throws Exception {
+        OperatorCounts counts = countOperators(mode);
+        System.out.printf("%-10s | %6d | %6d | %6d | %6d%n",
+                mode.name().toLowerCase(),
+                counts.curves(),
+                counts.shadings(),
+                counts.extGStates(),
+                counts.clips());
+    }
+
+    /**
+     * Renders {@link #PATHS} blob paths in the given paint mode and counts the PDF
+     * operators. Exposed (package-visible) so {@code VectorRenderOperatorGateTest}
+     * can pin the per-mode cost structure: flat takes the fast fill path (no
+     * shading / alpha / clip), gradient adds a shading + clip per shape, alpha
+     * adds an ExtGState per shape — and a flat path must never take the heavier
+     * gradient branch.
+     *
+     * @param mode the paint mode to exercise
+     * @return the operator counts of the rendered document
+     * @throws Exception if rendering fails
+     */
+    static OperatorCounts countOperators(PaintMode mode) throws Exception {
         byte[] pdf;
         try (DocumentSession session = GraphCompose.document()
                 .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
@@ -57,8 +83,7 @@ private static void report(PaintMode mode) throws Exception {
             pdf = session.toPdfBytes();
         }
         try (PDDocument document = Loader.loadPDF(pdf)) {
-            System.out.printf("%-10s | %6d | %6d | %6d | %6d%n",
-                    mode.name().toLowerCase(),
+            return new OperatorCounts(
                     count(document, "c"),
                     count(document, "sh"),
                     count(document, "gs"),
diff --git a/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java b/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java
new file mode 100644
index 000000000..040ce51e5
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java
@@ -0,0 +1,56 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Deterministic regression gate for the v1.8 vector-paint render branches,
+ * driving {@link VectorRenderOperatorProbe#countOperators}.
+ *
+ * <p>A flat fill takes the fast path (no shading / alpha / clip); a linear
+ * gradient clips to the shape and paints a shading (one {@code W} clip + one
+ * {@code sh} per shape); a translucent fill sets an ExtGState alpha (one
+ * {@code gs} per shape). Pinning these operator counts makes a regression — a
+ * flat path accidentally taking the heavier gradient branch, or the gradient
+ * clip/shading being dropped — a build failure rather than a silent CI pass.</p>
+ */
+class VectorRenderOperatorGateTest {
+
+    @Test
+    void flatFillTakesTheFastPathWithNoShadingAlphaOrClip() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts flat =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.FLAT);
+
+        assertThat(flat.curves()).as("flat paths still emit curve operators").isGreaterThan(0);
+        assertThat(flat.shadings()).as("flat fill must not paint a shading").isZero();
+        assertThat(flat.extGStates()).as("flat fill must not set an ExtGState alpha").isZero();
+        assertThat(flat.clips()).as("flat fill must not clip").isZero();
+    }
+
+    @Test
+    void gradientFillClipsAndShadesOncePerShape() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts gradient =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.GRADIENT);
+
+        assertThat(gradient.shadings())
+                .as("a linear gradient paints one shading per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(gradient.clips())
+                .as("a gradient clips to each shape before shading")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+    }
+
+    @Test
+    void translucentFillSetsOneExtGStatePerShape() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts alpha =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.ALPHA);
+
+        assertThat(alpha.extGStates())
+                .as("a translucent fill sets one ExtGState alpha per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(alpha.shadings())
+                .as("a translucent solid fill must not paint a shading")
+                .isZero();
+    }
+}

From cc36009be6dbad55cf05f65527a340984942a5f7 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 12:53:01 +0100
Subject: [PATCH 25/36] perf(benchmarks): gate a v1.8 vector-rich scenario in
 the current-speed smoke harness

All six current-speed scenarios were text/table, so no v1.8 vector feature was under the per-PR perf gate; a regression in the chart/SVG-icon/gradient render path would not trip it.

Add a 'vector-rich' scenario (bar + pie charts, 8 SVG icons, a gradient accent path, reusing ChartBenchmarkFixtures / SvgBenchmarkFixtures) with a SMOKE threshold (20.0 ms / 256 MB, ~3.5x the observed ~5.7 ms / ~86 MB). CurrentSpeedScenarioGateTest enforces the threshold exists, and the perf-smoke gate now catches a regression in the vector render path the way it already gates text and tables.
---
 .../demcha/compose/CurrentSpeedBenchmark.java | 31 +++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index 64e113d20..3f23c6fe2 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -8,8 +8,10 @@
 import com.demcha.compose.document.backend.fixed.pdf.options.PdfWatermarkOptions;
 import com.demcha.compose.document.backend.fixed.pdf.options.PdfWatermarkPosition;
 import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
 import com.demcha.compose.document.style.DocumentTextDecoration;
 import com.demcha.compose.document.style.DocumentTextStyle;
+import com.demcha.compose.document.svg.SvgIcon;
 import com.demcha.compose.document.templates.api.DocumentTemplate;
 import com.demcha.compose.document.templates.builtins.InvoiceTemplateV1;
 import com.demcha.compose.document.templates.builtins.ProposalTemplateV1;
@@ -105,7 +107,9 @@ public final class CurrentSpeedBenchmark {
             new ScenarioDef("feature-rich", "QR, barcode, watermark, header/footer, page break",
                     b -> b::renderFeatureRichDocument),
             new ScenarioDef("long-token", "Long unbreakable tokens (URLs/IDs) forcing character-level wrap",
-                    b -> b::renderLongTokenDocument)
+                    b -> b::renderLongTokenDocument),
+            new ScenarioDef("vector-rich", "v1.8 vector surface: bar + pie charts, SVG icons, gradient path",
+                    b -> b::renderVectorRichDocument)
     );
 
     /**
@@ -555,6 +559,28 @@ private byte[] renderEngineSimpleDocument() throws Exception {
                         + "a root flow container, heading text, paragraph layout, and final PDF serialization.");
     }
 
+    private byte[] renderVectorRichDocument() throws Exception {
+        DocumentPaint accent = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
+                .margin(28, 28, 28, 28)
+                .create()) {
+            var flow = document.pageFlow().name("BenchmarkVectorRich").spacing(12);
+            flow.addParagraph("v1.8 vector-rich benchmark");
+            flow.chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle());
+            flow.chart(ChartBenchmarkFixtures.pieSpec());
+            for (int i = 0; i < 8; i++) {
+                flow.addSvgIcon(icon, 32);
+            }
+            flow.addPath(p -> p.size(220, 28)
+                    .moveTo(0.0, 0.5).curveTo(0.25, 1.0, 0.75, 0.0, 1.0, 0.5).fill(accent));
+            flow.build();
+            return document.toPdfBytes();
+        }
+    }
+
     private byte[] renderInvoiceTemplateDocument() throws Exception {
         try (DocumentSession document = GraphCompose.document()
                 .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
@@ -951,7 +977,8 @@ enum BenchmarkProfile {
                 "cv-template", new SmokeThreshold(25.0, 192.0),
                 "proposal-template", new SmokeThreshold(45.0, 384.0),
                 "feature-rich", new SmokeThreshold(100.0, 256.0),
-                "long-token", new SmokeThreshold(10.0, 256.0)
+                "long-token", new SmokeThreshold(10.0, 256.0),
+                "vector-rich", new SmokeThreshold(20.0, 256.0)
         ));
 
         private final String id;

From 394afad6b02015c5171c89c45a2fcc0532902b57 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 14:05:26 +0100
Subject: [PATCH 26/36] perf(benchmarks): carry stages[] through the median
 aggregate

BenchmarkMedianTool medianed only latency and throughput, so a median-vs-median BenchmarkDiffTool run lost the compose/layout/render stage attribution -- the deterministic signal the stage breakdown adds, dropped on exactly the noise-reduced path used for real decisions.

Add aggregateCurrentSpeedStages (medians composeMillis/layoutMillis/renderMillis/totalMillis per scenario, paralleling the latency aggregation), carry stages[] in CurrentSpeedMedianReport, and emit a stages CSV when present. Lenient: stages[] is optional (CurrentSpeedBenchmark emits it only for runs with enough iterations), so it aggregates only when every source run carries a matching stages[] and is omitted otherwise -- no throw on the optional field. New BenchmarkMedianToolTest case asserts the medianed stages; the existing no-stages cases still pass.
---
 .../demcha/compose/BenchmarkMedianTool.java   | 71 +++++++++++++++++--
 .../compose/BenchmarkMedianToolTest.java      | 48 +++++++++++++
 2 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
index f82d0b6f8..3cadbfa43 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
@@ -25,10 +25,11 @@
  * for local benchmark sessions where a few repeated runs are needed to reduce
  * machine noise before comparing results.</p>
  *
- * <p>The current-speed per-stage breakdown ({@code stages[]}) is <em>not</em>
- * carried into the median aggregate — only latency and throughput are medianed.
- * A median-vs-median diff therefore shows no compose/layout/render stage deltas;
- * diff a single-run pair when you need stage attribution.</p>
+ * <p>The current-speed per-stage breakdown ({@code stages[]}) is medianed and
+ * carried into the aggregate when every source run has it (it is present only for
+ * runs with enough measurement iterations), so a median-vs-median
+ * {@link BenchmarkDiffTool} run still attributes a regression to
+ * compose / layout / render.</p>
  */
 public final class BenchmarkMedianTool {
 
@@ -80,6 +81,7 @@ private void aggregateCurrentSpeed(List<ReportFile> reportFiles) throws Exceptio
         List<Integer> threadCounts = requireIntegerArrayConsistency(reportFiles, "threadCounts");
 
         List<CurrentSpeedLatencyMedianRow> latencyRows = aggregateCurrentSpeedLatency(reportFiles);
+        List<CurrentSpeedStageMedianRow> stageRows = aggregateCurrentSpeedStages(reportFiles);
         List<CurrentSpeedThroughputMedianRow> throughputRows = aggregateCurrentSpeedThroughput(reportFiles);
 
         long totalBytesMedian = Math.round(median(
@@ -95,6 +97,7 @@ private void aggregateCurrentSpeed(List<ReportFile> reportFiles) throws Exceptio
                 docsPerThread,
                 threadCounts,
                 latencyRows,
+                stageRows,
                 throughputRows,
                 totalBytesMedian,
                 "median",
@@ -131,12 +134,28 @@ private void aggregateCurrentSpeed(List<ReportFile> reportFiles) throws Exceptio
                                 format(row.avgMillisPerDoc())))
                         .toList());
 
+        Path stagesCsv = null;
+        if (!stageRows.isEmpty()) {
+            stagesCsv = artifacts.writeCsv(
+                    "stages",
+                    List.of("scenario", "compose_ms", "layout_ms", "render_ms", "total_ms"),
+                    stageRows.stream()
+                            .map(row -> List.of(
+                                    row.scenario(),
+                                    format(row.composeMillis()),
+                                    format(row.layoutMillis()),
+                                    format(row.renderMillis()),
+                                    format(row.totalMillis())))
+                            .toList());
+        }
+
         System.out.println("Median benchmark report");
         System.out.println("Suite: current-speed");
         System.out.println("Profile: " + profile);
         System.out.println("Source runs: " + reportFiles.size());
         System.out.println("Saved JSON median report to " + jsonPath);
-        System.out.println("Saved CSV median reports to " + latencyCsv + " and " + throughputCsv);
+        System.out.println("Saved CSV median reports to " + latencyCsv
+                + (stagesCsv != null ? ", " + stagesCsv : "") + " and " + throughputCsv);
     }
 
     private List<CurrentSpeedLatencyMedianRow> aggregateCurrentSpeedLatency(List<ReportFile> reportFiles) {
@@ -170,6 +189,40 @@ private List<CurrentSpeedLatencyMedianRow> aggregateCurrentSpeedLatency(List<Rep
                 .toList();
     }
 
+    private List<CurrentSpeedStageMedianRow> aggregateCurrentSpeedStages(List<ReportFile> reportFiles) {
+        // stages[] is optional: CurrentSpeedBenchmark only emits it when the run
+        // has enough measurement iterations (smoke < 20 emits none). Aggregate only
+        // when EVERY source report carries a non-empty stages[] with the same
+        // scenario set; otherwise return empty so the median report simply carries
+        // no stages — mirroring the benchmark's own conditional emission rather
+        // than throwing on an absent/partial optional field.
+        List<JsonNode> firstRows = iterable(reportFiles.get(0).report().path("stages"));
+        if (firstRows.isEmpty()) {
+            return List.of();
+        }
+        Map<String, JsonNode> firstByScenario = indexBy(firstRows, "scenario");
+        for (ReportFile reportFile : reportFiles) {
+            Map<String, JsonNode> currentByScenario = indexBy(iterable(reportFile.report().path("stages")), "scenario");
+            if (!firstByScenario.keySet().equals(currentByScenario.keySet())) {
+                return List.of();
+            }
+        }
+
+        return firstByScenario.keySet().stream()
+                .map(scenario -> {
+                    List<JsonNode> rows = reportFiles.stream()
+                            .map(reportFile -> indexBy(iterable(reportFile.report().path("stages")), "scenario").get(scenario))
+                            .toList();
+                    return new CurrentSpeedStageMedianRow(
+                            scenario,
+                            median(rows, "composeMillis"),
+                            median(rows, "layoutMillis"),
+                            median(rows, "renderMillis"),
+                            median(rows, "totalMillis"));
+                })
+                .toList();
+    }
+
     private List<CurrentSpeedThroughputMedianRow> aggregateCurrentSpeedThroughput(List<ReportFile> reportFiles) {
         List<JsonNode> firstRows = iterable(reportFiles.get(0).report().path("throughput"));
         Map<String, JsonNode> firstByScenario = indexThroughput(firstRows);
@@ -398,6 +451,13 @@ private record CurrentSpeedThroughputMedianRow(String scenario,
                                                    double avgMillisPerDoc) {
     }
 
+    private record CurrentSpeedStageMedianRow(String scenario,
+                                              double composeMillis,
+                                              double layoutMillis,
+                                              double renderMillis,
+                                              double totalMillis) {
+    }
+
     private record CurrentSpeedMedianReport(String timestamp,
                                             String profile,
                                             int warmupIterations,
@@ -405,6 +465,7 @@ private record CurrentSpeedMedianReport(String timestamp,
                                             int docsPerThread,
                                             List<Integer> threadCounts,
                                             List<CurrentSpeedLatencyMedianRow> latency,
+                                            List<CurrentSpeedStageMedianRow> stages,
                                             List<CurrentSpeedThroughputMedianRow> throughput,
                                             long totalBytes,
                                             String aggregation,
diff --git a/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java b/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
index ee449b4b7..f110c9c7e 100644
--- a/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
@@ -209,4 +209,52 @@ void shouldWriteMedianComparativeAggregateForRepeatedRuns() throws Exception {
         assertThat(aggregate.path("libraries").get(1).path("avgHeapMb").asDouble()).isEqualTo(0.25);
     }
 
+    @Test
+    void shouldMedianStagesWhenSourceRunsCarryThem() throws Exception {
+        System.setProperty("graphcompose.benchmark.root", tempDir.toString());
+
+        Path suiteDir = Files.createDirectories(tempDir.resolve("current-speed"));
+        // Three runs whose render stage is 10 / 20 / 30 (median 20) and total
+        // 13 / 23 / 33 (median 23); compose/layout are constant (median 1 / 2).
+        double[] renders = {10.0, 20.0, 30.0};
+        String[] paths = new String[renders.length];
+        for (int i = 0; i < renders.length; i++) {
+            double render = renders[i];
+            double total = render + 3.0;
+            Path run = suiteDir.resolve("run-20260415-2200" + i + "0.json");
+            Files.writeString(run, """
+                    {
+                      "profile": "full",
+                      "warmupIterations": 12,
+                      "measurementIterations": 40,
+                      "docsPerThread": 12,
+                      "threadCounts": [1],
+                      "latency": [
+                        {"scenario": "invoice-template", "description": "Invoice", "avgMillis": %1$s,
+                         "p50Millis": 0.0, "p95Millis": 0.0, "maxMillis": 0.0, "docsPerSecond": 0.0,
+                         "avgKilobytes": 0.0, "peakHeapMb": 0.0}
+                      ],
+                      "stages": [
+                        {"scenario": "invoice-template", "composeMillis": 1.0, "layoutMillis": 2.0,
+                         "renderMillis": %1$s, "totalMillis": %2$s}
+                      ],
+                      "throughput": [],
+                      "totalBytes": 1000
+                    }
+                    """.formatted(render, total));
+            paths[i] = run.toString();
+        }
+
+        BenchmarkMedianTool.main(new String[]{"current-speed", paths[0], paths[1], paths[2]});
+
+        JsonNode aggregate = JSON.readTree(
+                Files.readAllBytes(tempDir.resolve("aggregates/current-speed/full/latest.json")));
+
+        JsonNode stage = aggregate.path("stages").get(0);
+        assertThat(stage.path("scenario").asText()).isEqualTo("invoice-template");
+        assertThat(stage.path("composeMillis").asDouble()).isEqualTo(1.0);
+        assertThat(stage.path("renderMillis").asDouble()).isEqualTo(20.0);
+        assertThat(stage.path("totalMillis").asDouble()).isEqualTo(23.0);
+    }
+
 }

From dfe74151b49a9e5f79911546706bc6d4b9a0b8de Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 14:10:31 +0100
Subject: [PATCH 27/36] docs(benchmarks): correct report field names and
 clarify docs_per_sec / peak heap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "How to read a report" section documented field names (avgMs / p50Ms / peakMB) that the emitted JSON never uses — the real keys are avgMillis / p50Millis / p95Millis / maxMillis / docsPerSecond / avgKilobytes / peakHeapMb. Correct them and fix two misleading descriptions: docsPerSecond is a derived 1000/avgMillis reciprocal of latency (real throughput is the separate throughput[] section), not a measured rate; peakHeapMb is a GC-noisy post-warmup heap delta (advisory), not an absolute MemoryMXBean reading. Also document the stages[] array. Doc-only; the JSON schema is unchanged.
---
 benchmarks/README.md | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 65b9e1550..bbf5e0c72 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -125,18 +125,27 @@ without reproducing locally.
 ## How to read a report
 
 The JSON shape is intentionally simple — a top-level run record with
-per-scenario sub-records. Each sub-record carries:
-
-- `avgMs`, `p50Ms`, `p95Ms`, `maxMs` — latency distribution across
-  iterations within the run.
-- `docsPerSec` — rough throughput; **not statistically rigorous**,
-  intended only as a relative number against a sibling scenario or a
-  previous run on the same machine.
-- `avgKB` — average output byte size. Stable across runs on the same
-  fixture; useful for catching content corruption (size shifts by
-  > a few hundred bytes are usually a bug, not a benchmark fluctuation).
-- `peakMB` — peak heap as observed by `MemoryMXBean`; coarse, do not
-  use for memory-budget enforcement.
+per-scenario sub-records. The latency rows carry these fields (the JSON
+keys are camelCase; the CSV columns are the snake_case equivalents):
+
+- `avgMillis`, `p50Millis`, `p95Millis`, `maxMillis` — latency distribution
+  across iterations within the run.
+- `docsPerSecond` — a **derived** figure, `1000 / avgMillis`: the reciprocal of
+  average latency, **not** a measured throughput rate. Real parallel throughput
+  lives in the separate `throughput[]` section (full profile only). Treat it as
+  a relative number against a sibling scenario or a previous run on the same
+  machine, not a publishable rate.
+- `avgKilobytes` — average output byte size. Stable across runs on the same
+  fixture; useful for catching content corruption (size shifts by more than a
+  few hundred bytes are usually a bug, not a benchmark fluctuation).
+- `peakHeapMb` — used-heap **delta** over the post-warmup baseline (closer to
+  per-iteration allocation pressure than to absolute live heap). GC-timing
+  noisy, so **advisory only** — for a deterministic memory signal use the
+  allocation bytes from `MeasurementCountBenchmark` or the alloc probes.
+
+A `stages[]` array carries the per-template-scenario compose / layout / render
+median split (`composeMillis` / `layoutMillis` / `renderMillis` / `totalMillis`),
+present when the run has enough measurement iterations.
 
 ## Strict JMH layer
 

From 0f56aedc23dc15d65a3ed0a932e2d3d3547dd760 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 14:25:30 +0100
Subject: [PATCH 28/36] perf(benchmarks): bench every chart-layout variant
 (horizontal/stacked/donut/axis-min)

Chart coverage was only vertical grouped-bar + line + full pie; the horizontal-transpose, stacked, donut, and non-zero value-axis-min resolver branches had no number, so a regression in any of them would go unmeasured.

Add horizontalBarSpec / stackedBarSpec / axisMinBarSpec / donutSpec to ChartBenchmarkFixtures and a ChartVariantJmhBenchmark that renders each one (@Param over the seven variants) so every ChartLayoutResolver branch has its own render-time row instead of being blended into the three-chart total.
---
 .../compose/ChartBenchmarkFixtures.java       | 43 +++++++++
 .../compose/jmh/ChartVariantJmhBenchmark.java | 94 +++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/ChartVariantJmhBenchmark.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
index 59aa1578b..1993acb36 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
@@ -1,6 +1,7 @@
 package com.demcha.compose;
 
 import com.demcha.compose.document.chart.AxisSpec;
+import com.demcha.compose.document.chart.BarGrouping;
 import com.demcha.compose.document.chart.ChartData;
 import com.demcha.compose.document.chart.ChartSize;
 import com.demcha.compose.document.chart.ChartSpec;
@@ -88,4 +89,46 @@ public static ChartSpec pieSpec() {
                 .size(ChartSize.fixedHeight(190))
                 .build();
     }
+
+    /** Horizontal grouped bar — exercises the transposed (category-on-Y) layout branch. */
+    public static ChartSpec horizontalBarSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .horizontal(true)
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 9))
+                .build();
+    }
+
+    /** Stacked bar — exercises the cumulative-stacking layout branch. */
+    public static ChartSpec stackedBarSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .grouping(BarGrouping.STACKED)
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    /** Bar with a non-zero value-axis minimum — exercises the lifted-baseline branch. */
+    public static ChartSpec axisMinBarSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .valueAxis(AxisSpec.builder().min(8.0).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    /** Donut — exercises the pie's donut-ratio (inner-radius) branch. */
+    public static ChartSpec donutSpec() {
+        return ChartSpec.pie()
+                .data(regionShare())
+                .donutRatio(0.55)
+                .sliceLabels(SliceLabelMode.CATEGORY_PERCENT)
+                .size(ChartSize.fixedHeight(190))
+                .build();
+    }
 }
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ChartVariantJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartVariantJmhBenchmark.java
new file mode 100644
index 000000000..efdc1ff67
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartVariantJmhBenchmark.java
@@ -0,0 +1,94 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.ChartBenchmarkFixtures;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.chart.ChartSpec;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of a single chart, parameterized
+ * over the chart-layout branches the resolver takes — grouped bar, horizontal
+ * bar, stacked bar, a non-zero value-axis minimum (lifted baseline), line, pie,
+ * and donut. {@code ChartJmhBenchmark} renders one grouped-bar + line + pie
+ * document; this isolates each distinct {@code ChartLayoutResolver} branch so a
+ * regression in, say, the stacking or horizontal-transpose geometry shows up on
+ * its own row rather than blended into a three-chart total.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar ChartVariant
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class ChartVariantJmhBenchmark {
+
+    @Param({"grouped-bar", "horizontal-bar", "stacked-bar", "axis-min-bar", "line", "pie", "donut"})
+    public String variant;
+
+    /** Resolved once per trial so the bench measures the render, not spec assembly. */
+    private ChartSpec spec;
+
+    @Setup
+    public void setUp() {
+        spec = switch (variant) {
+            case "grouped-bar" -> ChartBenchmarkFixtures.barSpec();
+            case "horizontal-bar" -> ChartBenchmarkFixtures.horizontalBarSpec();
+            case "stacked-bar" -> ChartBenchmarkFixtures.stackedBarSpec();
+            case "axis-min-bar" -> ChartBenchmarkFixtures.axisMinBarSpec();
+            case "line" -> ChartBenchmarkFixtures.lineSpec();
+            case "pie" -> ChartBenchmarkFixtures.pieSpec();
+            case "donut" -> ChartBenchmarkFixtures.donutSpec();
+            default -> throw new IllegalArgumentException("Unknown chart variant: " + variant);
+        };
+    }
+
+    /**
+     * Renders a one-chart document of the parameterized variant to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderChartVariant(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            document.pageFlow().name("ChartVariant").spacing(12).chart(spec).build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}

From ebf4294a2e54857a086ec06433b40ccd77a005c5 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 15:46:33 +0100
Subject: [PATCH 29/36] perf(benchmarks): cover stroked + dashed vector paths
 in the operator probe + gate

VectorRenderOperatorProbe covered only the three fill modes (flat/gradient/alpha); the stroke and dash render branches had no operator coverage, so a regression there would pass silently.

Add STROKED and DASHED paint modes (counting S/s stroke and d dash-array operators) and pin them in VectorRenderOperatorGateTest: a stroked path strokes once per shape and sets no dash, a dashed stroke sets a dash array once per shape and still strokes, and a flat fill strokes/dashes never. Observed flat S=0/d=0, stroked S=40/d=0, dashed S=40/d=40.
---
 .../compose/VectorRenderOperatorProbe.java    | 33 +++++++++++++------
 .../compose/VectorRenderOperatorGateTest.java | 29 ++++++++++++++++
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
index 43988a666..cc3c79dcc 100644
--- a/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
+++ b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
@@ -5,6 +5,7 @@
 import com.demcha.compose.document.dsl.PageFlowBuilder;
 import com.demcha.compose.document.style.DocumentColor;
 import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.style.DocumentStroke;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.contentstream.operator.Operator;
 import org.apache.pdfbox.pdfparser.PDFStreamParser;
@@ -33,34 +34,41 @@ public final class VectorRenderOperatorProbe {
 
     static final int PATHS = 40;
 
-    enum PaintMode { FLAT, GRADIENT, ALPHA }
+    enum PaintMode { FLAT, GRADIENT, ALPHA, STROKED, DASHED }
 
-    /** PDF operator counts for one paint mode: cubic curves, shadings, ExtGState sets, clips. */
-    record OperatorCounts(int curves, int shadings, int extGStates, int clips) {
+    /**
+     * PDF operator counts for one paint mode: cubic curves ({@code c}), shadings
+     * ({@code sh}), ExtGState sets ({@code gs}), clips ({@code W}), strokes
+     * ({@code S}/{@code s}) and dash-array sets ({@code d}).
+     */
+    record OperatorCounts(int curves, int shadings, int extGStates, int clips, int strokes, int dashes) {
     }
 
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
 
         System.out.println("GraphCompose vector-paint render-operator probe (" + PATHS + " blob paths each)");
-        System.out.printf("%-10s | %6s | %6s | %6s | %6s%n", "Mode", "c", "sh", "gs", "W");
-        System.out.println("-".repeat(46));
+        System.out.printf("%-10s | %6s | %6s | %6s | %6s | %6s | %6s%n", "Mode", "c", "sh", "gs", "W", "S", "d");
+        System.out.println("-".repeat(64));
         for (PaintMode mode : PaintMode.values()) {
             report(mode);
         }
         System.out.println();
-        System.out.println("c=cubic curve, sh=shading fill, gs=ExtGState (alpha), W=clip. "
-                + "Flat takes the fast path (no sh/gs/W); gradient adds sh+W per shape; alpha adds gs.");
+        System.out.println("c=cubic curve, sh=shading fill, gs=ExtGState (alpha), W=clip, S=stroke, d=dash set. "
+                + "Flat takes the fast fill path (no sh/gs/W/S/d); gradient adds sh+W per shape; alpha adds gs; "
+                + "stroked adds S per shape; dashed adds d+S per shape.");
     }
 
     private static void report(PaintMode mode) throws Exception {
         OperatorCounts counts = countOperators(mode);
-        System.out.printf("%-10s | %6d | %6d | %6d | %6d%n",
+        System.out.printf("%-10s | %6d | %6d | %6d | %6d | %6d | %6d%n",
                 mode.name().toLowerCase(),
                 counts.curves(),
                 counts.shadings(),
                 counts.extGStates(),
-                counts.clips());
+                counts.clips(),
+                counts.strokes(),
+                counts.dashes());
     }
 
     /**
@@ -87,7 +95,9 @@ static OperatorCounts countOperators(PaintMode mode) throws Exception {
                     count(document, "c"),
                     count(document, "sh"),
                     count(document, "gs"),
-                    count(document, "W"));
+                    count(document, "W"),
+                    count(document, "S") + count(document, "s"),
+                    count(document, "d"));
         }
     }
 
@@ -96,6 +106,7 @@ private static void authorBlobs(PageFlowBuilder flow, PaintMode mode) {
                 DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
         DocumentColor flat = DocumentColor.rgb(40, 90, 160);
         DocumentColor translucent = DocumentColor.rgb(40, 90, 160).withOpacity(0.5);
+        DocumentStroke stroke = DocumentStroke.of(DocumentColor.rgb(40, 90, 160), 2.0);
         for (int i = 0; i < PATHS; i++) {
             flow.addPath(p -> {
                 p.size(60, 36)
@@ -107,6 +118,8 @@ private static void authorBlobs(PageFlowBuilder flow, PaintMode mode) {
                     case FLAT -> p.fillColor(flat);
                     case GRADIENT -> p.fill(gradient);
                     case ALPHA -> p.fillColor(translucent);
+                    case STROKED -> p.stroke(stroke);
+                    case DASHED -> p.stroke(stroke).dashed(4.0, 2.0);
                 }
             });
         }
diff --git a/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java b/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java
index 040ce51e5..5aa3d91ef 100644
--- a/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java
@@ -26,6 +26,35 @@ void flatFillTakesTheFastPathWithNoShadingAlphaOrClip() throws Exception {
         assertThat(flat.shadings()).as("flat fill must not paint a shading").isZero();
         assertThat(flat.extGStates()).as("flat fill must not set an ExtGState alpha").isZero();
         assertThat(flat.clips()).as("flat fill must not clip").isZero();
+        assertThat(flat.strokes()).as("a flat fill must not stroke").isZero();
+        assertThat(flat.dashes()).as("a flat fill must not set a dash array").isZero();
+    }
+
+    @Test
+    void strokedPathStrokesOncePerShapeWithoutFillPaint() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts stroked =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.STROKED);
+
+        assertThat(stroked.strokes())
+                .as("a stroked path strokes once per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(stroked.dashes()).as("a solid stroke sets no dash array").isZero();
+        assertThat(stroked.shadings()).as("a stroke must not paint a shading").isZero();
+        assertThat(stroked.extGStates()).as("a stroke must not set an ExtGState alpha").isZero();
+    }
+
+    @Test
+    void dashedStrokeSetsADashArrayPerShape() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts dashed =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.DASHED);
+
+        assertThat(dashed.dashes())
+                .as("a dashed stroke sets a dash array once per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(dashed.strokes())
+                .as("a dashed path still strokes once per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(dashed.shadings()).as("a dashed stroke must not paint a shading").isZero();
     }
 
     @Test

From fe4053a5c5d87b67829129facac9928043f60063 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 16:01:15 +0100
Subject: [PATCH 30/36] perf(benchmarks): make the smoke-gate peak-heap check
 advisory, not a hard fail

The smoke perf gate hard-failed on peakHeapMb -- a GC-timing-noisy used-heap delta -- so a GC blip could redden a PR on a non-regression. BenchmarkVerdictTool already treats heap as advisory; align them. evaluatePerformanceGate now fails only on avgMillis and reports any peak-heap breach as an advisory note (passed stays true). The deterministic memory signal remains the allocation-bytes probes. The perf-gate test is updated accordingly (treatsPeakHeapAsAdvisoryNotAGateFailure).
---
 .../com/demcha/compose/CurrentSpeedBenchmark.java   | 13 ++++++++++---
 .../compose/CurrentSpeedBenchmarkPerfGateTest.java  | 10 ++++++----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index 3f23c6fe2..aca749da8 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -510,6 +510,7 @@ static PerformanceGateResult evaluatePerformanceGate(BenchmarkProfile profile, L
         }
 
         List<String> failures = new ArrayList<>();
+        List<String> advisories = new ArrayList<>();
         for (LatencyRow row : latencyRows) {
             SmokeThreshold threshold = profile.smokeThresholds().get(row.scenario());
             if (threshold == null) {
@@ -527,17 +528,23 @@ static PerformanceGateResult evaluatePerformanceGate(BenchmarkProfile profile, L
                 failures.add(row.scenario() + " avg " + format(row.avgMillis()) + " ms > " + format(maxAvgMillis) + " ms");
             }
             if (row.peakHeapMb() > maxPeakHeapMb) {
-                failures.add(row.scenario() + " peak heap " + format(row.peakHeapMb()) + " MB > " + format(maxPeakHeapMb) + " MB");
+                // peakHeapMb is a GC-timing-noisy used-heap delta, so a breach is
+                // reported as advisory rather than failing the gate — matching
+                // BenchmarkVerdictTool and avoiding flaky CI from a GC blip. The
+                // deterministic memory signal is the allocation-bytes probes.
+                advisories.add(row.scenario() + " peak heap " + format(row.peakHeapMb()) + " MB > " + format(maxPeakHeapMb) + " MB");
             }
         }
 
+        String advisoryNote = advisories.isEmpty() ? "" : " (advisory: " + String.join("; ", advisories) + ")";
+
         if (failures.isEmpty()) {
-            return new PerformanceGateResult(true, "Performance gate passed for profile " + profile.id());
+            return new PerformanceGateResult(true, "Performance gate passed for profile " + profile.id() + advisoryNote);
         }
 
         return new PerformanceGateResult(
                 false,
-                "Performance gate failed for profile " + profile.id() + ": " + String.join("; ", failures));
+                "Performance gate failed for profile " + profile.id() + ": " + String.join("; ", failures) + advisoryNote);
     }
 
     private long usedHeapBytes() {
diff --git a/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java
index cae8d91f0..6a1efc07a 100644
--- a/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java
@@ -53,14 +53,16 @@ void failsWhenAverageLatencyExceedsThreshold() {
     }
 
     @Test
-    void failsWhenPeakHeapExceedsThreshold() {
+    void treatsPeakHeapAsAdvisoryNotAGateFailure() {
         CurrentSpeedBenchmark.PerformanceGateResult result =
                 CurrentSpeedBenchmark.evaluatePerformanceGate(
                         CurrentSpeedBenchmark.BenchmarkProfile.SMOKE,
-                        List.of(latency(ENGINE_SIMPLE, 1.0, 999.0))); // 999 > 96
+                        List.of(latency(ENGINE_SIMPLE, 1.0, 999.0))); // heap 999 > 96, avg 1.0 ok
 
-        assertThat(result.passed()).isFalse();
-        assertThat(result.message()).contains("peak heap");
+        assertThat(result.passed())
+                .as("peak heap is GC-noisy and advisory — a heap-only breach must not fail the gate")
+                .isTrue();
+        assertThat(result.message()).contains("peak heap").contains("advisory");
     }
 
     @Test

From a013c755d43c2417d2b5f67c73752a067bb2e8b3 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 16:04:19 +0100
Subject: [PATCH 31/36] perf(benchmarks): add sparkline-ramp and per-paint-mode
 vector render benches

Sparklines were measured only inside MixedShowcaseJmhBenchmark, and the vector paint modes only at the operator-count level (VectorRenderOperatorProbe), never as render time.

SparklineRampJmhBenchmark renders a rich paragraph of N inline sparklines (@Param 8/32/128) so the per-sparkline inline-fragment cost scales visibly. VectorPaintJmhBenchmark renders 40 blob paths flat/gradient/alpha (@Param) for the render-time complement to the operator probe. Observed sparkline ramp ~2.6/5.4/17.9 ms; flat ~1.9 / gradient ~3.5 / alpha ~1.7 ms.
---
 .../jmh/SparklineRampJmhBenchmark.java        | 83 ++++++++++++++++
 .../compose/jmh/VectorPaintJmhBenchmark.java  | 99 +++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/SparklineRampJmhBenchmark.java
 create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java

diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/SparklineRampJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/SparklineRampJmhBenchmark.java
new file mode 100644
index 000000000..492aba9f1
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/SparklineRampJmhBenchmark.java
@@ -0,0 +1,83 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: a "sparkline ramp" — a rich paragraph carrying
+ * {@code N} inline sparklines — rendered to PDF, parameterized over N so the
+ * per-sparkline inline-fragment cost (build + layout + vector draw) is visible.
+ * Sparklines were otherwise only exercised once inside
+ * {@code MixedShowcaseJmhBenchmark}, where a regression would dilute into the
+ * surrounding charts and icons; this isolates and scales them.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar SparklineRamp
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class SparklineRampJmhBenchmark {
+
+    private static final DocumentColor ACCENT = DocumentColor.rgb(20, 80, 95);
+
+    @Param({"8", "32", "128"})
+    public int sparklineCount;
+
+    /**
+     * Renders a paragraph of {@code sparklineCount} inline sparklines to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderSparklineRamp(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("SparklineRamp").spacing(4);
+            flow.addRich(r -> {
+                for (int i = 0; i < sparklineCount; i++) {
+                    r.plain("m ").sparkline(42, 9, ACCENT, 65.2, 69.8, 74.1, 81.3, 88.2);
+                }
+            });
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java
new file mode 100644
index 000000000..4f9f5061c
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java
@@ -0,0 +1,99 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: render {@code N} identical curved blob paths in one
+ * paint mode — flat solid fill, linear gradient, or translucent (alpha) fill —
+ * parameterized over the mode, so the render-<em>time</em> cost of each vector
+ * paint branch is isolated. This is the timing complement to
+ * {@code VectorRenderOperatorProbe} (which counts the PDF operators each mode
+ * emits): gradient shading and alpha ExtGState add operators over the flat fast
+ * fill path, and this puts a millisecond number on what each of them costs.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar VectorPaint
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class VectorPaintJmhBenchmark {
+
+    private static final int PATHS = 40;
+
+    @Param({"flat", "gradient", "alpha"})
+    public String paint;
+
+    /**
+     * Renders {@code PATHS} blob paths in the parameterized paint mode to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderVectorPaint(Blackhole blackhole) throws Exception {
+        DocumentPaint gradient = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        DocumentColor flat = DocumentColor.rgb(40, 90, 160);
+        DocumentColor translucent = DocumentColor.rgb(40, 90, 160).withOpacity(0.5);
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("VectorPaint").spacing(4);
+            for (int i = 0; i < PATHS; i++) {
+                flow.addPath(p -> {
+                    p.size(60, 36)
+                            .moveTo(0.0, 0.5)
+                            .curveTo(0.25, 1.0, 0.75, 1.0, 1.0, 0.5)
+                            .curveTo(0.75, 0.0, 0.25, 0.0, 0.0, 0.5)
+                            .closePath();
+                    switch (paint) {
+                        case "flat" -> p.fillColor(flat);
+                        case "gradient" -> p.fill(gradient);
+                        case "alpha" -> p.fillColor(translucent);
+                        default -> throw new IllegalArgumentException("Unknown paint mode: " + paint);
+                    }
+                });
+            }
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}

From 2e0f9f1ad2da79f79d2fd002d71839a86707f1b8 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 16:05:39 +0100
Subject: [PATCH 32/36] docs(benchmarks): document the JMH @Fork(1) default and
 refresh the bench list

The "Strict JMH layer" section listed only 3 of the now-12 JMH benches and never stated the fork choice. Refresh the list (steady-state render / parameterised scaling ramps / SVG micro-benches / single-shot cold-start) and document that @Fork(1) is the deliberately fast on-demand default -- pass -f N for a cross-fork error estimate when quoting a number.
---
 benchmarks/README.md | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index bbf5e0c72..228d7a2ee 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -152,10 +152,16 @@ present when the run has enough measurement iterations.
 The Track C JMH layer (forked JVM, warmup + measurement, JIT-stable numbers)
 lives alongside this manual harness. JMH benchmarks are annotated classes under
 `com.demcha.compose.jmh`; the shade plugin builds a self-contained runner jar so
-forked benchmark JVMs inherit the full classpath. Present benchmarks:
-`CanonicalRender` (bare-DSL multi-section render), `TemplateCv` (the
-`ModernProfessional` layered template), and `PaginatedDocument` (a multi-page
-document parameterised by section count).
+forked benchmark JVMs inherit the full classpath. The suite spans steady-state
+render benches (`CanonicalRender`, `TemplateCv`, `Chart`, `ChartVariant`, `Image`,
+`MixedShowcase`), parameterised scaling ramps (`IconRamp`, `LargeTable`,
+`SparklineRamp`, `PaginatedDocument`, `VectorPaint`), the SVG-import micro-benches
+(`Svg`), and a single-shot cold-start bench (`ColdStart`).
+
+Every JMH bench uses `@Fork(1)` with a 3×2s warmup / 5×2s measurement window — a
+deliberately fast default for on-demand local iteration (a single fork, so the
+reported `Error` column is blank). For a number you intend to quote, pass more
+forks on the CLI (e.g. `-f 5`) to get a cross-fork error estimate.
 
 The measured region differs per benchmark: `TemplateCv` hoists fixture
 construction into `@Setup` and times the render only, while `CanonicalRender` and

From 2eab25282db255029255b02b6e28c00566b6bd81 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 16:24:51 +0100
Subject: [PATCH 33/36] docs(benchmarks): sync docs with the full suite + hoist
 vector-paint fixtures to @Setup

README: scope the @Fork(1) note to steady-state benches (ColdStart is single-shot @Fork(10)) and correct the smoke scenario count (5 -> 7). CurrentSpeedBenchmark class Javadoc now lists all seven scenarios (adds long-token and vector-rich). CHANGELOG Internal gains notes for the render-hot-path coverage (image/cold-start/comparative tier + sample dump/large-table/GC-churn/accented-Latin) and the CI-run deterministic gates (+ vector-rich scenario, median stages[], advisory peakHeapMb). VectorPaintJmhBenchmark builds its paint objects in @Setup like the sibling benches instead of inside the measured method. BenchmarkMedianToolTest asserts the no-stages lenient path omits stages without throwing.
---
 CHANGELOG.md                                   | 18 ++++++++++++++++++
 benchmarks/README.md                           | 13 ++++++++-----
 .../demcha/compose/CurrentSpeedBenchmark.java  |  2 ++
 .../compose/jmh/VectorPaintJmhBenchmark.java   | 18 ++++++++++++++----
 .../compose/BenchmarkMedianToolTest.java       |  5 +++++
 5 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6cb0e7074..fc78a4e2c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -366,6 +366,24 @@ Entries land here as they merge.
   shipped).** The `long-token` scenario previously had no SMOKE threshold and
   silently escaped the gate; it now has one, and `CurrentSpeedScenarioGateTest`
   fails the build if any scenario lacks a threshold.
+- **Benchmark coverage for the render hot paths (not shipped).** Added an image
+  embed/scale gate (`ImageCacheOperatorProbe` + `ImageBenchmarkFixtures` +
+  `ImageJmhBenchmark`, with `ImageCacheGateTest` pinning `PdfImageCache` reuse), a
+  single-shot cold-start render bench (`ColdStartJmhBenchmark`), a multi-page
+  "report" tier in `ComparativeBenchmark` (equivalent content across GraphCompose /
+  iText / JasperReports, plus a post-run sample-PDF dump per library/scenario), a
+  production-scale `LargeTableJmhBenchmark`, an allocation-rate / GC-pressure probe
+  (`AllocationRateProbe`), and an accented-Latin measurement scenario.
+- **Deterministic benchmark gates run on every PR (not shipped).** The benchmarks
+  module's tests never ran in CI; the `perf-smoke` job now runs them, so the
+  image-cache, render-operator (F5 coalescing), vector-paint (flat / gradient /
+  alpha / stroked / dashed operator structure), and scenario-coverage gates fail a
+  PR on a structural regression. A `vector-rich` scenario (charts + SVG icons +
+  gradient) joins the gated current-speed harness; `BenchmarkMedianTool` carries the
+  stage breakdown into its aggregate; and the smoke gate's GC-noisy `peakHeapMb`
+  check is now advisory (fails only on average latency). Chart-layout variants
+  (horizontal / stacked / donut / value-axis-min), a sparkline ramp, and a
+  per-paint-mode vector render bench round out the JMH suite.
 - **Removed the `java.awt.*` / `java.util.*` co-wildcard in four files.**
   `InvoiceTemplateComposer`, `ProposalTemplateComposer`,
   `WeeklyScheduleTemplateComposer`, and the engine `PdfRenderingSystemECS`
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 228d7a2ee..f7f70d056 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -23,7 +23,7 @@
 ## When to use the harness
 
 - **Smoke check before a release** — `CurrentSpeedBenchmark -Dgraphcompose.benchmark.profile=smoke`
-  takes ~15 s, exercises the canonical render path through 5 fixture
+  takes ~15 s, exercises the canonical render path through 7 fixture
   scenarios, and prints a single-page latency / throughput table.
   CI runs this on every PR (the `perf-smoke` job); the goal is "did
   this PR make a representative render visibly slower?" — *not* "is
@@ -158,10 +158,13 @@ render benches (`CanonicalRender`, `TemplateCv`, `Chart`, `ChartVariant`, `Image
 `SparklineRamp`, `PaginatedDocument`, `VectorPaint`), the SVG-import micro-benches
 (`Svg`), and a single-shot cold-start bench (`ColdStart`).
 
-Every JMH bench uses `@Fork(1)` with a 3×2s warmup / 5×2s measurement window — a
-deliberately fast default for on-demand local iteration (a single fork, so the
-reported `Error` column is blank). For a number you intend to quote, pass more
-forks on the CLI (e.g. `-f 5`) to get a cross-fork error estimate.
+Every steady-state JMH bench uses `@Fork(1)` with a 3×2s warmup / 5×2s measurement
+window — a deliberately fast default for on-demand local iteration (a single fork,
+so the reported `Error` column is blank). For a number you intend to quote, pass
+more forks on the CLI (e.g. `-f 5`) for a cross-fork error estimate. The exception
+is `ColdStart`, which is single-shot (`Mode.SingleShotTime`, `@Warmup(iterations = 0)`,
+`@Fork(10)`) — it deliberately measures the JIT-cold first render across ten fresh
+JVMs.
 
 The measured region differs per benchmark: `TemplateCv` hoists fixture
 construction into `@Setup` and times the render only, while `CanonicalRender` and
diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index aca749da8..5fdef47e2 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -50,6 +50,8 @@
  *     <li>the built-in CV template</li>
  *     <li>a longer multi-page proposal template</li>
  *     <li>a feature-rich document with QR/barcode, watermark, page break, and footer</li>
+ *     <li>long unbreakable tokens forcing character-level wrap</li>
+ *     <li>a v1.8 vector-rich document (bar/pie charts, SVG icons, gradient path)</li>
  * </ul>
  */
 public final class CurrentSpeedBenchmark {
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java
index 4f9f5061c..382ad4d57 100644
--- a/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java
@@ -15,6 +15,7 @@
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -50,6 +51,19 @@ public class VectorPaintJmhBenchmark {
     @Param({"flat", "gradient", "alpha"})
     public String paint;
 
+    private DocumentPaint gradient;
+    private DocumentColor flat;
+    private DocumentColor translucent;
+
+    /** Paint objects built once per trial, outside the measured render. */
+    @Setup
+    public void setUp() {
+        gradient = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        flat = DocumentColor.rgb(40, 90, 160);
+        translucent = DocumentColor.rgb(40, 90, 160).withOpacity(0.5);
+    }
+
     /**
      * Renders {@code PATHS} blob paths in the parameterized paint mode to PDF bytes.
      *
@@ -58,10 +72,6 @@ public class VectorPaintJmhBenchmark {
      */
     @Benchmark
     public void renderVectorPaint(Blackhole blackhole) throws Exception {
-        DocumentPaint gradient = DocumentPaint.linear(
-                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
-        DocumentColor flat = DocumentColor.rgb(40, 90, 160);
-        DocumentColor translucent = DocumentColor.rgb(40, 90, 160).withOpacity(0.5);
         try (DocumentSession document = GraphCompose.document()
                 .pageSize(DocumentPageSize.A4)
                 .margin(DocumentInsets.of(28))
diff --git a/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java b/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
index f110c9c7e..c1bc150b8 100644
--- a/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
@@ -148,6 +148,11 @@ void shouldWriteMedianCurrentSpeedAggregateForRepeatedRuns() throws Exception {
         assertThat(aggregate.path("latency").get(0).path("peakHeapMb").asDouble()).isEqualTo(120.0);
         assertThat(aggregate.path("throughput").get(0).path("docsPerSecond").asDouble()).isEqualTo(40.0);
         assertThat(aggregate.path("totalBytes").asLong()).isEqualTo(2000L);
+        // None of these runs carried a stages[] (smoke < 20 iters emits none), so the
+        // lenient aggregation must omit stages without throwing.
+        assertThat(aggregate.path("stages").isEmpty())
+                .as("median omits stages when no source run carries them")
+                .isTrue();
     }
 
     @Test

From 44a965d5d879bef53d74530b5d286648cbccad0c Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 17:19:48 +0100
Subject: [PATCH 34/36] fix(benchmarks): parse vector-rich icon once, harden
 distinct images, log dropped median stages

Review follow-ups on the suite:

- The vector-rich current-speed scenario parsed its SVG icon and built its gradient inside the per-iteration render method, so it measured a re-parse the other (pre-built-fixture) scenarios don't; hoist both to instance fields. Widen its SMOKE threshold 20 -> 25 ms (charts + SVG icons vary more than the text scenarios) and document the observed ~5-6 ms basis.

- ImageBenchmarkFixtures.distinctImage relied on modular gradient/line colours that can repeat at large indices, risking duplicate fingerprints; add a seed-positioned 1px marker so each index < native width yields byte-distinct content, keeping the distinct-embed gate robust.

- BenchmarkMedianTool now logs a note when it omits stages[] because the source runs' stage scenario sets differ, instead of dropping them silently.
---
 .../com/demcha/compose/BenchmarkMedianTool.java  |  2 ++
 .../demcha/compose/CurrentSpeedBenchmark.java    | 16 ++++++++++------
 .../demcha/compose/ImageBenchmarkFixtures.java   |  8 ++++++++
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
index 3cadbfa43..6a3abb58f 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
@@ -204,6 +204,8 @@ private List<CurrentSpeedStageMedianRow> aggregateCurrentSpeedStages(List<Report
         for (ReportFile reportFile : reportFiles) {
             Map<String, JsonNode> currentByScenario = indexBy(iterable(reportFile.report().path("stages")), "scenario");
             if (!firstByScenario.keySet().equals(currentByScenario.keySet())) {
+                System.out.println("Note: stages omitted from the median aggregate — "
+                        + "the stage-breakdown scenario set differs across the source runs.");
                 return List.of();
             }
         }
diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index 5fdef47e2..46706038a 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -91,6 +91,11 @@ public final class CurrentSpeedBenchmark {
     private final InvoiceDocumentSpec invoice = CanonicalBenchmarkSupport.canonicalInvoice();
     private final ProposalDocumentSpec proposal = CanonicalBenchmarkSupport.canonicalProposal();
     private final CvSpec cv = CanonicalBenchmarkSupport.canonicalCv();
+    // Parsed/built once (like the template fixtures above) so the vector-rich
+    // scenario measures the render, not a per-iteration SVG re-parse.
+    private final SvgIcon vectorRichIcon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+    private final DocumentPaint vectorRichAccent = DocumentPaint.linear(
+            DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
 
     // Canonical scenario list, in table order. Declared statically (the
     // renderer is bound to an instance at run time) so the gate-coverage guard
@@ -569,9 +574,6 @@ private byte[] renderEngineSimpleDocument() throws Exception {
     }
 
     private byte[] renderVectorRichDocument() throws Exception {
-        DocumentPaint accent = DocumentPaint.linear(
-                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
-        SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
         try (DocumentSession document = GraphCompose.document()
                 .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
                 .margin(28, 28, 28, 28)
@@ -581,10 +583,10 @@ private byte[] renderVectorRichDocument() throws Exception {
             flow.chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle());
             flow.chart(ChartBenchmarkFixtures.pieSpec());
             for (int i = 0; i < 8; i++) {
-                flow.addSvgIcon(icon, 32);
+                flow.addSvgIcon(vectorRichIcon, 32);
             }
             flow.addPath(p -> p.size(220, 28)
-                    .moveTo(0.0, 0.5).curveTo(0.25, 1.0, 0.75, 0.0, 1.0, 0.5).fill(accent));
+                    .moveTo(0.0, 0.5).curveTo(0.25, 1.0, 0.75, 0.0, 1.0, 0.5).fill(vectorRichAccent));
             flow.build();
             return document.toPdfBytes();
         }
@@ -987,7 +989,9 @@ enum BenchmarkProfile {
                 "proposal-template", new SmokeThreshold(45.0, 384.0),
                 "feature-rich", new SmokeThreshold(100.0, 256.0),
                 "long-token", new SmokeThreshold(10.0, 256.0),
-                "vector-rich", new SmokeThreshold(20.0, 256.0)
+                // vector-rich observed ~5-6 ms smoke avg; charts + SVG icons vary
+                // more than the text scenarios, so a wider ~4.5x band absorbs that.
+                "vector-rich", new SmokeThreshold(25.0, 256.0)
         ));
 
         private final String id;
diff --git a/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
index c9f95b739..dcf1ec97e 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
@@ -76,6 +76,14 @@ private static byte[] pngBytes(int seed) {
             g.setPaint(new Color(196, 153, 76));
             g.setStroke(new BasicStroke(6f));
             g.drawLine(0, 170, NATIVE_WIDTH_PX, 110 - (seed % 40));
+            // A seed-positioned 1px marker guarantees byte-distinct content per
+            // seed — the modular gradient/line colours above can repeat at large
+            // seeds, but a unique x keeps distinctImage(i) fingerprints distinct
+            // for i in [0, NATIVE_WIDTH_PX - 1].
+            if (seed < NATIVE_WIDTH_PX) {
+                g.setPaint(new Color(0, 0, 0));
+                g.fillRect(seed, 0, 1, 6);
+            }
         } finally {
             g.dispose();
         }

From c81a6c51dacb0e6df83d2b019be7e88c26ac5bc1 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 18:12:44 +0100
Subject: [PATCH 35/36] perf(benchmarks): compare against current iText 9
 instead of EOL iText 5

The comparative pinned iText 5.5.13.3 (EOL ~2021, the monolithic com.itextpdf.text API). Upgrade to iText Core 9.6.0 (current) and rewrite benchmarkIText / benchmarkITextReport to the kernel + layout API (PdfDocument + layout Document + Table/Cell with useAllAvailableWidth and repeating header cells); relabel the rows "iText 9".

Against the current iText engine the picture changes: on the multi-page report GraphCompose now leads on both time (~5.0 vs ~12.5 ms) and allocation (~0.88 vs ~2.95 MB) -- the old iText-5 time advantage was against a 2020 engine. No PDFBox conflict (iText is its own classpath island and is still excluded from the shade jar). Sample dump confirms a valid 2-page iText report.
---
 CHANGELOG.md                                  |  3 +-
 benchmarks/pom.xml                            |  7 +-
 .../demcha/compose/ComparativeBenchmark.java  | 73 +++++++++----------
 docs/operations/benchmarks.md                 |  2 +-
 4 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc78a4e2c..43ea26931 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -371,7 +371,8 @@ Entries land here as they merge.
   `ImageJmhBenchmark`, with `ImageCacheGateTest` pinning `PdfImageCache` reuse), a
   single-shot cold-start render bench (`ColdStartJmhBenchmark`), a multi-page
   "report" tier in `ComparativeBenchmark` (equivalent content across GraphCompose /
-  iText / JasperReports, plus a post-run sample-PDF dump per library/scenario), a
+  iText 9 / JasperReports — iText upgraded from the EOL 5.5.x to current 9.x — plus
+  a post-run sample-PDF dump per library/scenario), a
   production-scale `LargeTableJmhBenchmark`, an allocation-rate / GC-pressure probe
   (`AllocationRateProbe`), and an accented-Latin measurement scenario.
 - **Deterministic benchmark gates run on every PR (not shipped).** The benchmarks
diff --git a/benchmarks/pom.xml b/benchmarks/pom.xml
index 25ac8f25d..b48aff3a8 100644
--- a/benchmarks/pom.xml
+++ b/benchmarks/pom.xml
@@ -30,7 +30,7 @@
         <logback.version>1.5.34</logback.version>
 
         <openhtmltopdf.version>1.0.10</openhtmltopdf.version>
-        <itextpdf.version>5.5.13.3</itextpdf.version>
+        <itext.version>9.6.0</itext.version>
         <jasperreports.version>7.0.7</jasperreports.version>
     </properties>
 
@@ -100,8 +100,9 @@
         </dependency>
         <dependency>
             <groupId>com.itextpdf</groupId>
-            <artifactId>itextpdf</artifactId>
-            <version>${itextpdf.version}</version>
+            <artifactId>itext-core</artifactId>
+            <version>${itext.version}</version>
+            <type>pom</type>
         </dependency>
         <dependency>
             <groupId>net.sf.jasperreports</groupId>
diff --git a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
index 9ea39235b..1d677f942 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
@@ -8,10 +8,13 @@
 import com.demcha.compose.document.style.DocumentInsets;
 import com.demcha.compose.document.style.DocumentTextStyle;
 import com.demcha.compose.document.table.DocumentTableColumn;
-import com.itextpdf.text.Document;
-import com.itextpdf.text.Paragraph;
-import com.itextpdf.text.pdf.PdfPTable;
-import com.itextpdf.text.pdf.PdfWriter;
+import com.itextpdf.kernel.pdf.PdfDocument;
+import com.itextpdf.kernel.pdf.PdfWriter;
+import com.itextpdf.layout.Document;
+import com.itextpdf.layout.element.Cell;
+import com.itextpdf.layout.element.Paragraph;
+import com.itextpdf.layout.element.Table;
+import com.itextpdf.layout.properties.UnitValue;
 import net.sf.jasperreports.engine.*;
 import net.sf.jasperreports.engine.data.JRMapCollectionDataSource;
 import net.sf.jasperreports.engine.design.*;
@@ -81,14 +84,14 @@ public static void main(String[] args) throws Exception {
         System.out.println("Scenario: small invoice (single page, ~3 lines)");
         printTableHeader();
         rows.add(runBenchmark("GraphCompose Canonical", ComparativeBenchmark::benchmarkGraphComposeCanonical));
-        rows.add(runBenchmark("iText 5 (Old)", ComparativeBenchmark::benchmarkIText));
+        rows.add(runBenchmark("iText 9", ComparativeBenchmark::benchmarkIText));
         rows.add(runBenchmark("JasperReports", ComparativeBenchmark::benchmarkJasper));
 
         System.out.println();
         System.out.println("Scenario: business report (multi-page: title + " + REPORT_ROWS + "-row table + prose)");
         printTableHeader();
         rows.add(runBenchmark("GraphCompose (report)", ComparativeBenchmark::benchmarkGraphComposeReport));
-        rows.add(runBenchmark("iText 5 (report)", ComparativeBenchmark::benchmarkITextReport));
+        rows.add(runBenchmark("iText 9 (report)", ComparativeBenchmark::benchmarkITextReport));
         rows.add(runBenchmark("JasperReports (report)", ComparativeBenchmark::benchmarkJasperReport));
 
         BenchmarkReportWriter.BenchmarkArtifacts artifacts = BenchmarkReportWriter.prepare("comparative");
@@ -235,19 +238,15 @@ private static byte[] benchmarkGraphComposeReport() throws Exception {
      */
     private static byte[] benchmarkIText() throws Exception {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        Document document = new Document();
-        PdfWriter.getInstance(document, baos);
-        document.open();
-
-        // Используем таблицу, чтобы iText делал расчет ширины (как GraphCompose)
-        PdfPTable table = new PdfPTable(1);
-        table.setWidthPercentage(100);
-        table.addCell(new Paragraph("INVOICE #12345"));
-        table.addCell(new Paragraph("Customer: John Doe"));
-        table.addCell(new Paragraph("Amount: $1,000.00"));
-
-        document.add(table);
-        document.close();
+        // iText 9 (kernel + layout). A full-width 1-column table makes iText do
+        // the same width calculation GraphCompose does.
+        try (Document document = new Document(new PdfDocument(new PdfWriter(baos)))) {
+            Table table = new Table(UnitValue.createPercentArray(new float[]{1})).useAllAvailableWidth();
+            table.addCell(new Cell().add(new Paragraph("INVOICE #12345")));
+            table.addCell(new Cell().add(new Paragraph("Customer: John Doe")));
+            table.addCell(new Cell().add(new Paragraph("Amount: $1,000.00")));
+            document.add(table);
+        }
         return baos.toByteArray();
     }
 
@@ -257,27 +256,23 @@ private static byte[] benchmarkIText() throws Exception {
      */
     private static byte[] benchmarkITextReport() throws Exception {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        Document document = new Document();
-        PdfWriter.getInstance(document, baos);
-        document.open();
-        document.add(new Paragraph("Quarterly Business Report"));
-        document.add(new Paragraph(REPORT_PROSE));
-
-        PdfPTable table = new PdfPTable(4);
-        table.setWidthPercentage(100);
-        table.setHeaderRows(1);
-        for (String header : new String[]{"Item", "Qty", "Unit", "Total"}) {
-            table.addCell(new Paragraph(header));
-        }
-        for (int r = 1; r <= REPORT_ROWS; r++) {
-            table.addCell(new Paragraph("Line item " + r));
-            table.addCell(new Paragraph("3"));
-            table.addCell(new Paragraph("ea"));
-            table.addCell(new Paragraph("38.75"));
+        try (Document document = new Document(new PdfDocument(new PdfWriter(baos)))) {
+            document.add(new Paragraph("Quarterly Business Report"));
+            document.add(new Paragraph(REPORT_PROSE));
+
+            Table table = new Table(UnitValue.createPercentArray(new float[]{1, 1, 1, 1})).useAllAvailableWidth();
+            for (String header : new String[]{"Item", "Qty", "Unit", "Total"}) {
+                table.addHeaderCell(new Cell().add(new Paragraph(header)));
+            }
+            for (int r = 1; r <= REPORT_ROWS; r++) {
+                table.addCell(new Cell().add(new Paragraph("Line item " + r)));
+                table.addCell(new Cell().add(new Paragraph("3")));
+                table.addCell(new Cell().add(new Paragraph("ea")));
+                table.addCell(new Cell().add(new Paragraph("38.75")));
+            }
+            document.add(table);
+            document.add(new Paragraph(REPORT_PROSE));
         }
-        document.add(table);
-        document.add(new Paragraph(REPORT_PROSE));
-        document.close();
         return baos.toByteArray();
     }
 
diff --git a/docs/operations/benchmarks.md b/docs/operations/benchmarks.md
index 3611d877e..e7da4c1eb 100644
--- a/docs/operations/benchmarks.md
+++ b/docs/operations/benchmarks.md
@@ -39,7 +39,7 @@ The script prints numbered sections so you can map console output to the pipelin
    Runs `CurrentSpeedBenchmark` in the selected profile. The full profile also
    runs the thread-scaling throughput sweep (1 → 16 threads).
 3. `03-comparative`
-   Runs the GraphCompose canonical vs iText 5 vs JasperReports comparison.
+   Runs the GraphCompose canonical vs iText 9 vs JasperReports comparison.
 
    _Steps 04–06 (`core-engine`, `full-cv`, `scalability`) were retired. The
    surviving steps keep their original `NN-` console prefixes, so the labels

From 947085ef15a26bb2d299c021c66bb0a46eab7f02 Mon Sep 17 00:00:00 2001
From: DemchaAV <demchaav@gmail.com>
Date: Mon, 15 Jun 2026 18:44:59 +0100
Subject: [PATCH 36/36] perf(benchmarks): sweep the comparative report across
 40/200/1000 rows

Render the same title + prose + N-row table through GraphCompose, iText 9, and JasperReports at N = 40 / 200 / 1000 instead of a single 40-row size, and print a per-size GraphCompose-advantage ratio (time and heap vs each library) so the scaling trend is measured rather than assumed. Dump the smallest and largest report per library as sample PDFs.

The heap column and its ratios now enable per-thread allocation tracking explicitly (failing loudly if unsupported), and the advantage ratios are computed from full-precision averages rather than the rounded report rows.
---
 CHANGELOG.md                                  |   9 +-
 benchmarks/README.md                          |   9 +-
 .../demcha/compose/ComparativeBenchmark.java  | 165 +++++++++++++-----
 docs/operations/benchmarks.md                 |   5 +-
 4 files changed, 135 insertions(+), 53 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 43ea26931..8323af3e9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -369,10 +369,11 @@ Entries land here as they merge.
 - **Benchmark coverage for the render hot paths (not shipped).** Added an image
   embed/scale gate (`ImageCacheOperatorProbe` + `ImageBenchmarkFixtures` +
   `ImageJmhBenchmark`, with `ImageCacheGateTest` pinning `PdfImageCache` reuse), a
-  single-shot cold-start render bench (`ColdStartJmhBenchmark`), a multi-page
-  "report" tier in `ComparativeBenchmark` (equivalent content across GraphCompose /
-  iText 9 / JasperReports — iText upgraded from the EOL 5.5.x to current 9.x — plus
-  a post-run sample-PDF dump per library/scenario), a
+  single-shot cold-start render bench (`ColdStartJmhBenchmark`), a report-scaling
+  sweep in `ComparativeBenchmark` (equivalent content across GraphCompose /
+  iText 9 / JasperReports at 40 / 200 / 1000 table rows — iText upgraded from the
+  EOL 5.5.x to current 9.x — printing a per-size GraphCompose-advantage ratio plus
+  a post-run sample-PDF dump per library/size), a
   production-scale `LargeTableJmhBenchmark`, an allocation-rate / GC-pressure probe
   (`AllocationRateProbe`), and an accented-Latin measurement scenario.
 - **Deterministic benchmark gates run on every PR (not shipped).** The benchmarks
diff --git a/benchmarks/README.md b/benchmarks/README.md
index f7f70d056..9322e1018 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -52,9 +52,10 @@
   check after you've already chosen, not a decision tool before.
 - For **comparing GraphCompose to another PDF library** —
   `ComparativeBenchmark` does render equivalent content through iText /
-  JasperReports for rough sizing (two tiers: a tiny single-page invoice
-  for fixed overhead, and a multi-page report — title + 40-row table +
-  prose — for realistic work), but the comparison is a manual smoke test:
+  JasperReports for rough sizing (a tiny single-page invoice for fixed
+  overhead, plus a report-scaling sweep — title + prose + an N-row table
+  at N = 40 / 200 / 1000 — that shows how each engine scales and prints a
+  GraphCompose-advantage ratio per size), but the comparison is a manual smoke test:
   each library has different defaults (compression, font embedding, image
   resampling) and reading too much into a single number is the wrong call.
   Note one boundary asymmetry: the JasperReports figure measures fill +
@@ -92,7 +93,7 @@ These are intentionally **not** on the per-PR path:
 | File | Role |
 |---|---|
 | `CurrentSpeedBenchmark` | Default scenario runner — what CI's `perf-smoke` job exercises. Takes a `-Dgraphcompose.benchmark.profile=smoke\|full\|stress` switch. |
-| `ComparativeBenchmark` | Renders equivalent content through GraphCompose, iText, JasperReports — two tiers (small invoice + multi-page report), and dumps a sample PDF per library/scenario. **Rough local comparison only** — see "When not to use" above. |
+| `ComparativeBenchmark` | Renders equivalent content through GraphCompose, iText, JasperReports — a small-invoice tier plus a report-scaling sweep (40 / 200 / 1000 rows) with a per-size advantage ratio, and dumps a sample PDF per library/size. **Rough local comparison only** — see "When not to use" above. |
 | `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
 | `BenchmarkReportWriter` | Writes JSON / CSV / text reports under `benchmarks/target/benchmarks/`. |
 | `BenchmarkDiffTool` | Compares two JSON reports and prints a delta table. Useful for pre/post comparisons. |
diff --git a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
index 1d677f942..b37215fcc 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
@@ -43,10 +43,16 @@ public class ComparativeBenchmark {
     private static final int WARMUP_ITERATIONS = 50;
     private static final int MEASUREMENT_ITERATIONS = 100;
 
-    // Multi-page "report" scenario: a title, an N-row line-item table, and prose.
-    // Rendered with equivalent content across all three libraries so the numbers
-    // reflect real multi-page document work, not just per-render fixed overhead.
-    private static final int REPORT_ROWS = 40;
+    // Report-scaling sweep: the same title + prose + N-row table rendered through
+    // every library at growing row counts, so the numbers show how each engine
+    // SCALES (and whether GraphCompose's lead widens with document size) instead
+    // of at a single fixed size. The sweep runs fewer iterations than the invoice
+    // tier to keep the on-demand run reasonable; this is a directional comparison,
+    // not a strict JMH measurement (see benchmarks/README.md).
+    private static final int[] SWEEP_SIZES = {40, 200, 1000};
+    private static final int SWEEP_WARMUP_ITERATIONS = 20;
+    private static final int SWEEP_MEASUREMENT_ITERATIONS = 30;
+
     private static final String REPORT_PROSE =
             ("GraphCompose lays out structured business documents across many pages "
                     + "while keeping header and footer placement stable. ").repeat(6);
@@ -61,6 +67,17 @@ public static void main(String[] args) throws Exception {
         System.out.println("Timestamp: " + LocalDateTime.now().format(TIMESTAMP_FORMAT));
         System.out.println("------------------------------------------------------------");
 
+        // Per-thread allocation accounting backs the "Avg Heap (MB)" column and the
+        // heap-advantage ratios. Enable it explicitly (and bail loudly if the JVM
+        // does not support it) instead of trusting the platform default, matching
+        // the guard the other allocation probes in this module use.
+        com.sun.management.ThreadMXBean allocBean =
+                (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+        if (!allocBean.isThreadAllocatedMemorySupported()) {
+            throw new IllegalStateException("Thread allocated-memory measurement is not supported on this JVM");
+        }
+        allocBean.setThreadAllocatedMemoryEnabled(true);
+
         // Подготавливаем оба отчета Jasper 1 раз (как в Production)
         setupJasper();
         setupJasperReport();
@@ -71,28 +88,48 @@ public static void main(String[] args) throws Exception {
             benchmarkGraphComposeCanonical();
             benchmarkIText();
             benchmarkJasper();
-            benchmarkGraphComposeReport();
-            benchmarkITextReport();
-            benchmarkJasperReport();
+        }
+        for (int i = 0; i < SWEEP_WARMUP_ITERATIONS; i++) {
+            for (int size : SWEEP_SIZES) {
+                benchmarkGraphComposeReport(size);
+                benchmarkITextReport(size);
+                benchmarkJasperReport(size);
+            }
         }
 
-        // Замер — два сценария: дешёвый (фиксированные накладные) и многостраничный
-        System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
+        // Замер — два сценария: дешёвый (фиксированные накладные) и масштабирование отчёта
+        System.out.println("Measuring performance...");
         List<ComparativeRow> rows = new ArrayList<>();
 
         System.out.println();
-        System.out.println("Scenario: small invoice (single page, ~3 lines)");
+        System.out.println("Scenario: small invoice (single page, ~3 lines), " + MEASUREMENT_ITERATIONS + " iterations");
         printTableHeader();
-        rows.add(runBenchmark("GraphCompose Canonical", ComparativeBenchmark::benchmarkGraphComposeCanonical));
-        rows.add(runBenchmark("iText 9", ComparativeBenchmark::benchmarkIText));
-        rows.add(runBenchmark("JasperReports", ComparativeBenchmark::benchmarkJasper));
+        rows.add(runBenchmark("GraphCompose Canonical", MEASUREMENT_ITERATIONS, ComparativeBenchmark::benchmarkGraphComposeCanonical).toRow());
+        rows.add(runBenchmark("iText 9", MEASUREMENT_ITERATIONS, ComparativeBenchmark::benchmarkIText).toRow());
+        rows.add(runBenchmark("JasperReports", MEASUREMENT_ITERATIONS, ComparativeBenchmark::benchmarkJasper).toRow());
 
         System.out.println();
-        System.out.println("Scenario: business report (multi-page: title + " + REPORT_ROWS + "-row table + prose)");
-        printTableHeader();
-        rows.add(runBenchmark("GraphCompose (report)", ComparativeBenchmark::benchmarkGraphComposeReport));
-        rows.add(runBenchmark("iText 9 (report)", ComparativeBenchmark::benchmarkITextReport));
-        rows.add(runBenchmark("JasperReports (report)", ComparativeBenchmark::benchmarkJasperReport));
+        System.out.println("Scenario: report scaling sweep (title + prose + N-row table), "
+                + SWEEP_MEASUREMENT_ITERATIONS + " iterations per size");
+        List<ScalingPoint> scaling = new ArrayList<>();
+        for (int size : SWEEP_SIZES) {
+            System.out.println();
+            System.out.println("  N = " + size + " rows");
+            printTableHeader();
+            Measured gc = runBenchmark("GraphCompose (" + size + " rows)", SWEEP_MEASUREMENT_ITERATIONS,
+                    () -> benchmarkGraphComposeReport(size));
+            Measured it = runBenchmark("iText 9 (" + size + " rows)", SWEEP_MEASUREMENT_ITERATIONS,
+                    () -> benchmarkITextReport(size));
+            Measured js = runBenchmark("JasperReports (" + size + " rows)", SWEEP_MEASUREMENT_ITERATIONS,
+                    () -> benchmarkJasperReport(size));
+            rows.add(gc.toRow());
+            rows.add(it.toRow());
+            rows.add(js.toRow());
+            // Ratios are computed from the full-precision averages, not the rounded
+            // report rows, so the advantage figures don't compound rounding error.
+            scaling.add(new ScalingPoint(size, gc, it, js));
+        }
+        printScalingSummary(scaling);
 
         BenchmarkReportWriter.BenchmarkArtifacts artifacts = BenchmarkReportWriter.prepare("comparative");
         ComparativeReport report = new ComparativeReport(
@@ -130,9 +167,13 @@ private static Path writeSampleRenders(Path directory) throws Exception {
         Files.write(directory.resolve("graphcompose-small.pdf"), benchmarkGraphComposeCanonical());
         Files.write(directory.resolve("itext-small.pdf"), benchmarkIText());
         Files.write(directory.resolve("jasper-small.pdf"), benchmarkJasper());
-        Files.write(directory.resolve("graphcompose-report.pdf"), benchmarkGraphComposeReport());
-        Files.write(directory.resolve("itext-report.pdf"), benchmarkITextReport());
-        Files.write(directory.resolve("jasper-report.pdf"), benchmarkJasperReport());
+        // The smallest and largest sweep sizes, so the reader can see both a short
+        // report and the multi-page document that drives the scaling numbers.
+        for (int size : new int[]{SWEEP_SIZES[0], SWEEP_SIZES[SWEEP_SIZES.length - 1]}) {
+            Files.write(directory.resolve("graphcompose-report-" + size + ".pdf"), benchmarkGraphComposeReport(size));
+            Files.write(directory.resolve("itext-report-" + size + ".pdf"), benchmarkITextReport(size));
+            Files.write(directory.resolve("jasper-report-" + size + ".pdf"), benchmarkJasperReport(size));
+        }
         return directory;
     }
 
@@ -141,14 +182,14 @@ private static void printTableHeader() {
         System.out.println("-".repeat(60));
     }
 
-    private static ComparativeRow runBenchmark(String name, BenchmarkTask task) throws Exception {
+    private static Measured runBenchmark(String name, int iterations, BenchmarkTask task) throws Exception {
         long totalTimeNs = 0;
         long totalAllocatedBytes = 0;
         long dummyAccumulator = 0; // Защита от Dead Code Elimination
 
         com.sun.management.ThreadMXBean bean = (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
 
-        for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
+        for (int i = 0; i < iterations; i++) {
             System.gc(); // Форсируем сборку мусора перед каждым замером для чистоты аллокации
 
             long startBytes = bean.getThreadAllocatedBytes(Thread.currentThread().getId());
@@ -165,19 +206,15 @@ private static ComparativeRow runBenchmark(String name, BenchmarkTask task) thro
             dummyAccumulator += pdfBytes.length;
         }
 
-        double avgTimeMs = (totalTimeNs / (double) MEASUREMENT_ITERATIONS) / 1_000_000.0;
-        double avgMemMb = (totalAllocatedBytes / (double) MEASUREMENT_ITERATIONS) / (1024.0 * 1024.0);
+        double avgTimeMs = (totalTimeNs / (double) iterations) / 1_000_000.0;
+        double avgMemMb = (totalAllocatedBytes / (double) iterations) / (1024.0 * 1024.0);
 
         System.out.printf("%-24s | %14.2f | %14.2f%n", name, avgTimeMs, avgMemMb);
 
         // Печатаем dummy-переменную, чтобы JIT не вырезал код генерации
         if (dummyAccumulator == 0) System.out.println("Error: No bytes generated");
 
-        return new ComparativeRow(
-                name,
-                round(avgTimeMs),
-                round(avgMemMb)
-        );
+        return new Measured(name, avgTimeMs, avgMemMb);
     }
 
     /**
@@ -202,10 +239,10 @@ private static byte[] benchmarkGraphComposeCanonical() throws Exception {
     }
 
     /**
-     * GraphCompose canonical, multi-page report: title + N-row table + prose,
-     * authored through the public page-flow DSL (the realistic consumer path).
+     * GraphCompose canonical, multi-page report: title + {@code rows}-row table +
+     * prose, authored through the public page-flow DSL (the realistic consumer path).
      */
-    private static byte[] benchmarkGraphComposeReport() throws Exception {
+    private static byte[] benchmarkGraphComposeReport(int rows) throws Exception {
         // Equal full-width columns (page width minus the 32pt L/R margins, split
         // four ways), so the table fills the page like iText (setWidthPercentage
         // 100) and Jasper (full-column-width cells) rather than hugging its text.
@@ -223,7 +260,7 @@ private static byte[] benchmarkGraphComposeReport() throws Exception {
                             DocumentTableColumn.fixed(columnWidth),
                             DocumentTableColumn.fixed(columnWidth))
                             .header("Item", "Qty", "Unit", "Total").repeatHeader();
-                    for (int r = 1; r <= REPORT_ROWS; r++) {
+                    for (int r = 1; r <= rows; r++) {
                         t.row("Line item " + r, "3", "ea", "38.75");
                     }
                 });
@@ -251,10 +288,10 @@ private static byte[] benchmarkIText() throws Exception {
     }
 
     /**
-     * iText, multi-page report: same title + N-row table + prose. iText paginates
-     * the {@code PdfPTable} natively, so this exercises real multi-page layout.
+     * iText, multi-page report: same title + {@code rows}-row table + prose. iText
+     * paginates the table natively, so this exercises real multi-page layout.
      */
-    private static byte[] benchmarkITextReport() throws Exception {
+    private static byte[] benchmarkITextReport(int rows) throws Exception {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         try (Document document = new Document(new PdfDocument(new PdfWriter(baos)))) {
             document.add(new Paragraph("Quarterly Business Report"));
@@ -264,7 +301,7 @@ private static byte[] benchmarkITextReport() throws Exception {
             for (String header : new String[]{"Item", "Qty", "Unit", "Total"}) {
                 table.addHeaderCell(new Cell().add(new Paragraph(header)));
             }
-            for (int r = 1; r <= REPORT_ROWS; r++) {
+            for (int r = 1; r <= rows; r++) {
                 table.addCell(new Cell().add(new Paragraph("Line item " + r)));
                 table.addCell(new Cell().add(new Paragraph("3")));
                 table.addCell(new Cell().add(new Paragraph("ea")));
@@ -310,13 +347,13 @@ private static void setupJasper() throws Exception {
     }
 
     /**
-     * JasperReports, multi-page report: a 4-field detail band filled from an
-     * {@code REPORT_ROWS}-row data source, with a title (+ prose) and column
-     * header. Compiled once here; the benchmark measures fill + PDF export.
+     * JasperReports, multi-page report: a 4-field detail band filled from a
+     * {@code rows}-row data source, with a title (+ prose) and column header.
+     * Compiled once here; the benchmark measures fill + PDF export.
      */
-    private static byte[] benchmarkJasperReport() throws Exception {
-        List<Map<String, ?>> data = new ArrayList<>(REPORT_ROWS);
-        for (int r = 1; r <= REPORT_ROWS; r++) {
+    private static byte[] benchmarkJasperReport(int rows) throws Exception {
+        List<Map<String, ?>> data = new ArrayList<>(rows);
+        for (int r = 1; r <= rows; r++) {
             Map<String, Object> row = new HashMap<>();
             row.put("item", "Line item " + r);
             row.put("qty", "3");
@@ -436,9 +473,49 @@ private static double round(double value) {
         return Math.round(value * 100.0) / 100.0;
     }
 
+    /**
+     * Prints, per sweep size, GraphCompose's time/memory advantage over iText and
+     * Jasper as {@code other / graphCompose} ratios of the full-precision averages,
+     * so "does the lead widen with document size?" is answered by the numbers. A
+     * ratio above 1.0 means GraphCompose is that many times faster / lighter.
+     */
+    private static void printScalingSummary(List<ScalingPoint> scaling) {
+        System.out.println();
+        System.out.println("Scaling summary (GraphCompose advantage; >1.0 = GraphCompose faster / lighter)");
+        System.out.printf("%-8s | %16s | %16s | %16s | %16s%n",
+                "Rows", "Time vs iText", "Time vs Jasper", "Heap vs iText", "Heap vs Jasper");
+        System.out.println("-".repeat(86));
+        for (ScalingPoint p : scaling) {
+            System.out.printf("%-8d | %16s | %16s | %16s | %16s%n",
+                    p.rows(),
+                    ratio(p.iText().timeMs(), p.graphCompose().timeMs()),
+                    ratio(p.jasper().timeMs(), p.graphCompose().timeMs()),
+                    ratio(p.iText().heapMb(), p.graphCompose().heapMb()),
+                    ratio(p.jasper().heapMb(), p.graphCompose().heapMb()));
+        }
+    }
+
+    /** Formats {@code other / graphCompose} as an "N.NNx" string, or "n/a" when {@code graphCompose <= 0}. */
+    private static String ratio(double other, double graphCompose) {
+        if (graphCompose <= 0.0) {
+            return "n/a";
+        }
+        return "%.2fx".formatted(other / graphCompose);
+    }
+
     private record ComparativeRow(String library, double avgTimeMs, double avgHeapMb) {
     }
 
+    /** Full-precision averages (ms, MB) for one library/scenario; {@link #toRow()} rounds them for the report. */
+    private record Measured(String name, double timeMs, double heapMb) {
+        ComparativeRow toRow() {
+            return new ComparativeRow(name, round(timeMs), round(heapMb));
+        }
+    }
+
+    private record ScalingPoint(int rows, Measured graphCompose, Measured iText, Measured jasper) {
+    } // One sweep size: the table row count plus each library's unrounded result at that size.
+
     private record ComparativeReport(String timestamp,
                                      int warmupIterations,
                                      int measurementIterations,
diff --git a/docs/operations/benchmarks.md b/docs/operations/benchmarks.md
index e7da4c1eb..1c6bd6d75 100644
--- a/docs/operations/benchmarks.md
+++ b/docs/operations/benchmarks.md
@@ -39,7 +39,10 @@ The script prints numbered sections so you can map console output to the pipelin
    Runs `CurrentSpeedBenchmark` in the selected profile. The full profile also
    runs the thread-scaling throughput sweep (1 → 16 threads).
 3. `03-comparative`
-   Runs the GraphCompose canonical vs iText 9 vs JasperReports comparison.
+   Runs the GraphCompose canonical vs iText 9 vs JasperReports comparison: a
+   small-invoice tier plus a report-scaling sweep (40 / 200 / 1000 rows) that
+   prints a per-size GraphCompose-advantage ratio and dumps sample PDFs per
+   library for the small invoice and the smallest and largest sweep sizes.
 
    _Steps 04–06 (`core-engine`, `full-cv`, `scalability`) were retired. The
    surviving steps keep their original `NN-` console prefixes, so the labels