diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 31ce987b2..c2cf8a7d2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -208,6 +208,12 @@ jobs:
       - name: Compile benchmarks module
         run: ./mvnw -B -ntp -f benchmarks/pom.xml clean compile
 
+      - name: Run deterministic benchmark gates
+        # Fast, machine-independent unit/gate tests (image-cache reuse,
+        # render-operator coalescing, scenario/threshold coverage, diff tooling).
+        # Catches structural regressions the timing smoke run cannot.
+        run: ./mvnw -B -ntp -f benchmarks/pom.xml test
+
       - name: Run coarse performance smoke benchmark
         run: |
           ./mvnw -B -ntp -f benchmarks/pom.xml -DskipTests \
@@ -223,6 +229,14 @@ jobs:
           path: benchmarks/target/benchmarks/current-speed/**
           if-no-files-found: ignore
 
+      - name: Upload benchmark gate reports
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: benchmark-gate-reports-${{ github.run_id }}
+          path: benchmarks/target/surefire-reports/**
+          if-no-files-found: ignore
+
   benchmark-diff:
     name: Weekly Benchmark Diff
     if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
diff --git a/.gitignore b/.gitignore
index 9951201a3..c9f7fc1ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@ build/
 ### Mac OS ###
 .DS_Store
 /logs/
+benchmarks/logs/
 /CV_Generated.pdf
 *.pdf
 # Allow PDF previews that are committed README assets.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19c44ff5f..8323af3e9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -337,6 +337,55 @@ Entries land here as they merge.
 
 ### Internal
 
+- **Benchmark suite cleanup (not shipped).** Removed three redundant
+  benchmark mains: `FullCvBenchmark` (superseded by the JMH
+  `TemplateCvJmhBenchmark`), `GraphComposeBenchmark` (early-engine relic
+  duplicating `CurrentSpeedBenchmark`'s `engine-simple` scenario), and
+  `ScalabilityBenchmark` (its thread-scaling sweep folded into
+  `CurrentSpeedBenchmark`'s full-profile throughput run, now `1,2,4,8,16`).
+  Dropped the matching `run-benchmarks.ps1` steps and doc entries.
+- **Feature-object benchmarks for the v1.8 vector surface (not shipped).**
+  The suite previously exercised only text/table primitives. Added JMH render
+  benches and deterministic probes over the new vector features:
+  `SvgJmhBenchmark` (path parse / whole-file icon read / icon→node) plus a
+  `SvgParseAllocProbe`; `ChartJmhBenchmark` (bar + line + pie render) plus a
+  `ChartAllocProbe` (layout-compile allocation); `VectorRenderOperatorProbe`
+  (the same paths drawn flat vs. gradient vs. translucent, counted as PDF
+  content-stream operators); `IconRampJmhBenchmark` (icon-placement scaling,
+  `@Param` 8/32/128); and `MixedShowcaseJmhBenchmark` (one document combining
+  prose, inline sparklines, bar + pie charts, SVG icons and a gradient path).
+  Shared `SvgBenchmarkFixtures` / `ChartBenchmarkFixtures` hold the inputs so
+  each bench and its probe measure identical data.
+- **Current-speed report carries a stage breakdown and a run summary (not
+  shipped).** `CurrentSpeedBenchmark` persists a per-scenario compose / layout /
+  render split (`stages[]`, median ms) to the JSON and a `stages` CSV, and
+  writes a readable `summary.md`. `BenchmarkDiffTool` consumes `stages[]`,
+  prints a per-stage delta table, and reports the scenarios added/removed
+  between two runs.
+- **Every current-speed scenario is now covered by the smoke perf gate (not
+  shipped).** The `long-token` scenario previously had no SMOKE threshold and
+  silently escaped the gate; it now has one, and `CurrentSpeedScenarioGateTest`
+  fails the build if any scenario lacks a threshold.
+- **Benchmark coverage for the render hot paths (not shipped).** Added an image
+  embed/scale gate (`ImageCacheOperatorProbe` + `ImageBenchmarkFixtures` +
+  `ImageJmhBenchmark`, with `ImageCacheGateTest` pinning `PdfImageCache` reuse), a
+  single-shot cold-start render bench (`ColdStartJmhBenchmark`), a report-scaling
+  sweep in `ComparativeBenchmark` (equivalent content across GraphCompose /
+  iText 9 / JasperReports at 40 / 200 / 1000 table rows — iText upgraded from the
+  EOL 5.5.x to current 9.x — printing a per-size GraphCompose-advantage ratio plus
+  a post-run sample-PDF dump per library/size), a
+  production-scale `LargeTableJmhBenchmark`, an allocation-rate / GC-pressure probe
+  (`AllocationRateProbe`), and an accented-Latin measurement scenario.
+- **Deterministic benchmark gates run on every PR (not shipped).** The benchmarks
+  module's tests never ran in CI; the `perf-smoke` job now runs them, so the
+  image-cache, render-operator (F5 coalescing), vector-paint (flat / gradient /
+  alpha / stroked / dashed operator structure), and scenario-coverage gates fail a
+  PR on a structural regression. A `vector-rich` scenario (charts + SVG icons +
+  gradient) joins the gated current-speed harness; `BenchmarkMedianTool` carries the
+  stage breakdown into its aggregate; and the smoke gate's GC-noisy `peakHeapMb`
+  check is now advisory (fails only on average latency). Chart-layout variants
+  (horizontal / stacked / donut / value-axis-min), a sparkline ramp, and a
+  per-paint-mode vector render bench round out the JMH suite.
 - **Removed the `java.awt.*` / `java.util.*` co-wildcard in four files.**
   `InvoiceTemplateComposer`, `ProposalTemplateComposer`,
   `WeeklyScheduleTemplateComposer`, and the engine `PdfRenderingSystemECS`
diff --git a/benchmarks/README.md b/benchmarks/README.md
index f6041365c..9322e1018 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -23,7 +23,7 @@
 ## When to use the harness
 
 - **Smoke check before a release** — `CurrentSpeedBenchmark -Dgraphcompose.benchmark.profile=smoke`
-  takes ~15 s, exercises the canonical render path through 5 fixture
+  takes ~15 s, exercises the canonical render path through 7 fixture
   scenarios, and prints a single-page latency / throughput table.
   CI runs this on every PR (the `perf-smoke` job); the goal is "did
   this PR make a representative render visibly slower?" — *not* "is
@@ -51,25 +51,54 @@
   layout-pass count) and reason about it; the harness is a sanity
   check after you've already chosen, not a decision tool before.
 - For **comparing GraphCompose to another PDF library** —
-  `ComparativeBenchmark` does render the same fixture through iText /
-  openHTMLToPDF / JasperReports for rough sizing, but the comparison
-  is a manual smoke test: each library has different defaults
-  (compression, font embedding, image resampling) and reading too much
-  into a single number is the wrong call.
+  `ComparativeBenchmark` does render equivalent content through iText /
+  JasperReports for rough sizing (a tiny single-page invoice for fixed
+  overhead, plus a report-scaling sweep — title + prose + an N-row table
+  at N = 40 / 200 / 1000 — that shows how each engine scales and prints a
+  GraphCompose-advantage ratio per size), but the comparison is a manual smoke test:
+  each library has different defaults (compression, font embedding, image
+  resampling) and reading too much into a single number is the wrong call.
+  Note one boundary asymmetry: the JasperReports figure measures fill +
+  PDF export with the design compiled once outside the loop, while the
+  GraphCompose and iText figures include per-iteration document
+  construction — so the Jasper number excludes work the other two pay.
+  `openHTMLtoPDF` is intentionally absent: its current release (1.0.10)
+  targets PDFBox 2.x and fails at runtime against the PDFBox 3.x this
+  project uses (no PDFBox-3-compatible openhtmltopdf release exists yet),
+  so it cannot share GraphCompose's classpath.
+
+## What runs on a PR — and what is on-demand (by design)
+
+The per-PR CI gate is deliberately light and deterministic:
+
+- **`perf-smoke` job** — `CurrentSpeedBenchmark` in the `smoke` profile with
+  absolute latency / heap thresholds (a gross-regression tripwire), plus the
+  module's deterministic gate tests (`mvnw -f benchmarks/pom.xml test`:
+  image-cache reuse, render-operator coalescing, scenario/threshold coverage).
+
+These are intentionally **not** on the per-PR path:
+
+- **The JMH benches** (`*JmhBenchmark`) are full / on-demand only. A forked,
+  warmed JMH run of the whole suite takes minutes; running it per PR is too
+  expensive for the signal. Run them by hand (or on a schedule) before a release
+  and quote those numbers for rigorous claims.
+- **The relative `BenchmarkVerdictTool` gate** (±% vs a committed baseline) runs
+  locally only, and no static `smoke` baseline is committed: absolute timings are
+  machine-specific, so a baseline captured on one machine would false-positive on
+  another. Use a local same-machine A/B (a `-Repeat` median before/after) for
+  relative comparison; the absolute smoke thresholds are the CI safety net.
 
 ## Files in this module
 
 | File | Role |
 |---|---|
 | `CurrentSpeedBenchmark` | Default scenario runner — what CI's `perf-smoke` job exercises. Takes a `-Dgraphcompose.benchmark.profile=smoke\|full\|stress` switch. |
-| `ComparativeBenchmark` | Renders the same fixtures through GraphCompose, iText, openHTMLToPDF, JasperReports. **Rough local comparison only** — see "When not to use" above. |
-| `FullCvBenchmark`, `ScalabilityBenchmark` | Fixture-specific runners for CV and table-heavy scenarios. |
+| `ComparativeBenchmark` | Renders equivalent content through GraphCompose, iText, JasperReports — a small-invoice tier plus a report-scaling sweep (40 / 200 / 1000 rows) with a per-size advantage ratio, and dumps a sample PDF per library/size. **Rough local comparison only** — see "When not to use" above. |
 | `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
 | `BenchmarkReportWriter` | Writes JSON / CSV / text reports under `benchmarks/target/benchmarks/`. |
 | `BenchmarkDiffTool` | Compares two JSON reports and prints a delta table. Useful for pre/post comparisons. |
 | `BenchmarkMedianTool` | Median + dispersion across N runs of the same scenario. |
 | `GraphComposeStressTest`, `EnduranceTest` | Long-running stress / endurance harnesses. |
-| `GraphComposeBenchmark` | Legacy entry point preserved for one downstream caller. New work should target `CurrentSpeedBenchmark`. |
 
 ## Running
 
@@ -97,28 +126,46 @@ without reproducing locally.
 ## How to read a report
 
 The JSON shape is intentionally simple — a top-level run record with
-per-scenario sub-records. Each sub-record carries:
-
-- `avgMs`, `p50Ms`, `p95Ms`, `maxMs` — latency distribution across
-  iterations within the run.
-- `docsPerSec` — rough throughput; **not statistically rigorous**,
-  intended only as a relative number against a sibling scenario or a
-  previous run on the same machine.
-- `avgKB` — average output byte size. Stable across runs on the same
-  fixture; useful for catching content corruption (size shifts by
-  > a few hundred bytes are usually a bug, not a benchmark fluctuation).
-- `peakMB` — peak heap as observed by `MemoryMXBean`; coarse, do not
-  use for memory-budget enforcement.
+per-scenario sub-records. The latency rows carry these fields (the JSON
+keys are camelCase; the CSV columns are the snake_case equivalents):
+
+- `avgMillis`, `p50Millis`, `p95Millis`, `maxMillis` — latency distribution
+  across iterations within the run.
+- `docsPerSecond` — a **derived** figure, `1000 / avgMillis`: the reciprocal of
+  average latency, **not** a measured throughput rate. Real parallel throughput
+  lives in the separate `throughput[]` section (full profile only). Treat it as
+  a relative number against a sibling scenario or a previous run on the same
+  machine, not a publishable rate.
+- `avgKilobytes` — average output byte size. Stable across runs on the same
+  fixture; useful for catching content corruption (size shifts by more than a
+  few hundred bytes are usually a bug, not a benchmark fluctuation).
+- `peakHeapMb` — used-heap **delta** over the post-warmup baseline (closer to
+  per-iteration allocation pressure than to absolute live heap). GC-timing
+  noisy, so **advisory only** — for a deterministic memory signal use the
+  allocation bytes from `MeasurementCountBenchmark` or the alloc probes.
+
+A `stages[]` array carries the per-template-scenario compose / layout / render
+median split (`composeMillis` / `layoutMillis` / `renderMillis` / `totalMillis`),
+present when the run has enough measurement iterations.
 
 ## Strict JMH layer
 
 The Track C JMH layer (forked JVM, warmup + measurement, JIT-stable numbers)
 lives alongside this manual harness. JMH benchmarks are annotated classes under
 `com.demcha.compose.jmh`; the shade plugin builds a self-contained runner jar so
-forked benchmark JVMs inherit the full classpath. Present benchmarks:
-`CanonicalRender` (bare-DSL multi-section render), `TemplateCv` (the
-`ModernProfessional` layered template), and `PaginatedDocument` (a multi-page
-document parameterised by section count).
+forked benchmark JVMs inherit the full classpath. The suite spans steady-state
+render benches (`CanonicalRender`, `TemplateCv`, `Chart`, `ChartVariant`, `Image`,
+`MixedShowcase`), parameterised scaling ramps (`IconRamp`, `LargeTable`,
+`SparklineRamp`, `PaginatedDocument`, `VectorPaint`), the SVG-import micro-benches
+(`Svg`), and a single-shot cold-start bench (`ColdStart`).
+
+Every steady-state JMH bench uses `@Fork(1)` with a 3×2s warmup / 5×2s measurement
+window — a deliberately fast default for on-demand local iteration (a single fork,
+so the reported `Error` column is blank). For a number you intend to quote, pass
+more forks on the CLI (e.g. `-f 5`) for a cross-fork error estimate. The exception
+is `ColdStart`, which is single-shot (`Mode.SingleShotTime`, `@Warmup(0)`,
+`@Fork(10)`) — it deliberately measures the JIT-cold first render across ten fresh
+JVMs.
 
 The measured region differs per benchmark: `TemplateCv` hoists fixture
 construction into `@Setup` and times the render only, while `CanonicalRender` and
diff --git a/benchmarks/pom.xml b/benchmarks/pom.xml
index 25ac8f25d..b48aff3a8 100644
--- a/benchmarks/pom.xml
+++ b/benchmarks/pom.xml
@@ -30,7 +30,7 @@
         <logback.version>1.5.34</logback.version>
 
         <openhtmltopdf.version>1.0.10</openhtmltopdf.version>
-        <itextpdf.version>5.5.13.3</itextpdf.version>
+        <itext.version>9.6.0</itext.version>
         <jasperreports.version>7.0.7</jasperreports.version>
     </properties>
 
@@ -100,8 +100,9 @@
         </dependency>
         <dependency>
             <groupId>com.itextpdf</groupId>
-            <artifactId>itextpdf</artifactId>
-            <version>${itextpdf.version}</version>
+            <artifactId>itext-core</artifactId>
+            <version>${itext.version}</version>
+            <type>pom</type>
         </dependency>
         <dependency>
             <groupId>net.sf.jasperreports</groupId>
diff --git a/benchmarks/src/main/java/com/demcha/compose/AllocationRateProbe.java b/benchmarks/src/main/java/com/demcha/compose/AllocationRateProbe.java
new file mode 100644
index 000000000..e81c2af92
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/AllocationRateProbe.java
@@ -0,0 +1,161 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.templates.builtins.InvoiceTemplateV1;
+import com.demcha.compose.document.templates.builtins.ProposalTemplateV1;
+import com.demcha.compose.document.templates.data.invoice.InvoiceDocumentSpec;
+import com.demcha.compose.document.templates.data.proposal.ProposalDocumentSpec;
+
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+
+/**
+ * Allocation-rate and GC-pressure probe over realistic templates. The endurance
+ * and stress harnesses only check that sustained rendering stays stable / under a
+ * heap ceiling; nothing reports how much garbage a single render churns, which is
+ * what drives GC pressure for a high-throughput server.
+ *
+ * <p>For each template it renders many warm documents and reports two things: the
+ * warm per-document allocation (ThreadMXBean current-thread bytes / doc — a
+ * deterministic figure ideal for an A/B), and the JVM garbage collections those
+ * renders triggered (count + time via {@code GarbageCollectorMXBean} — JVM-wide
+ * and GC-timing sensitive, so advisory). No {@code src/main} changes.</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml exec:java -Dexec.mainClass=com.demcha.compose.AllocationRateProbe
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class AllocationRateProbe {
+
+    private static final com.sun.management.ThreadMXBean THREAD_MX =
+            (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+
+    private static final int WARMUP = 60;
+    private static final int MEASURE = 300;
+
+    @FunctionalInterface
+    private interface Render {
+        byte[] run() throws Exception;
+    }
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+        enableAllocationMeasurement();
+
+        InvoiceDocumentSpec invoice = CanonicalBenchmarkSupport.canonicalInvoice();
+        InvoiceTemplateV1 invoiceTemplate = new InvoiceTemplateV1();
+        ProposalDocumentSpec proposal = CanonicalBenchmarkSupport.canonicalProposal();
+        ProposalTemplateV1 proposalTemplate = new ProposalTemplateV1();
+
+        System.out.println("GraphCompose allocation-rate / GC-pressure probe (" + MEASURE + " warm renders each)");
+        System.out.printf("%-12s | %14s | %10s | %12s | %12s%n",
+                "Template", "Alloc / doc", "GC count", "GC time ms", "Total alloc");
+        System.out.println("-".repeat(70));
+        report("invoice", () -> renderTemplate(s -> invoiceTemplate.compose(s, invoice)));
+        report("proposal", () -> renderTemplate(s -> proposalTemplate.compose(s, proposal)));
+        System.out.println();
+        System.out.println("Alloc/doc = warm ThreadMXBean bytes per render (deterministic A/B signal). "
+                + "GC count/time = JVM collections those renders triggered (advisory, GC-timing sensitive).");
+    }
+
+    private interface Compose {
+        void into(DocumentSession session);
+    }
+
+    private static byte[] renderTemplate(Compose compose) throws Exception {
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(22, 22, 22, 22)
+                .create()) {
+            compose.into(session);
+            return session.toPdfBytes();
+        }
+    }
+
+    private static void report(String name, Render render) throws Exception {
+        long dummy = 0;
+        for (int i = 0; i < WARMUP; i++) {
+            dummy += render.run().length;
+        }
+
+        System.gc();
+        Thread.sleep(50);
+
+        long gcCountStart = totalGcCount();
+        long gcTimeStart = totalGcTime();
+        long allocStart = currentThreadAllocatedBytes();
+
+        for (int i = 0; i < MEASURE; i++) {
+            dummy += render.run().length;
+        }
+
+        long alloc = allocStart < 0 ? -1 : currentThreadAllocatedBytes() - allocStart;
+        long gcCount = totalGcCount() - gcCountStart;
+        long gcTime = totalGcTime() - gcTimeStart;
+
+        System.out.printf("%-12s | %14s | %10d | %12d | %12s%n",
+                name,
+                alloc < 0 ? "n/a" : kb(alloc / (double) MEASURE),
+                gcCount,
+                gcTime,
+                alloc < 0 ? "n/a" : mb(alloc));
+
+        if (dummy == 0) {
+            System.out.println("Error: no bytes generated");
+        }
+    }
+
+    private static long totalGcCount() {
+        long total = 0;
+        for (GarbageCollectorMXBean bean : ManagementFactory.getGarbageCollectorMXBeans()) {
+            long count = bean.getCollectionCount();
+            if (count > 0) {
+                total += count;
+            }
+        }
+        return total;
+    }
+
+    private static long totalGcTime() {
+        long total = 0;
+        for (GarbageCollectorMXBean bean : ManagementFactory.getGarbageCollectorMXBeans()) {
+            long time = bean.getCollectionTime();
+            if (time > 0) {
+                total += time;
+            }
+        }
+        return total;
+    }
+
+    private static String kb(double bytes) {
+        return "%.1f KB".formatted(bytes / 1024.0);
+    }
+
+    private static String mb(long bytes) {
+        return "%.1f MB".formatted(bytes / (1024.0 * 1024.0));
+    }
+
+    private static void enableAllocationMeasurement() {
+        try {
+            if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                THREAD_MX.setThreadAllocatedMemoryEnabled(true);
+            }
+        } catch (UnsupportedOperationException ignored) {
+            // Allocation measurement unsupported on this JVM; the probe reports n/a.
+        }
+    }
+
+    private static long currentThreadAllocatedBytes() {
+        try {
+            if (!THREAD_MX.isThreadAllocatedMemorySupported() || !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                return -1;
+            }
+        } catch (UnsupportedOperationException ex) {
+            return -1;
+        }
+        return THREAD_MX.getCurrentThreadAllocatedBytes();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
index 9b99d272f..ce99ce16e 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
@@ -93,6 +93,31 @@ private void diffCurrentSpeed(DiffInput input,
                     signedPercent(row.peakHeapMbDeltaPct()));
         }
 
+        if (!report.addedScenarios().isEmpty() || !report.removedScenarios().isEmpty()) {
+            System.out.println();
+            System.out.println("Scenario set changes");
+            System.out.println("  Added in candidate:    "
+                    + (report.addedScenarios().isEmpty() ? "(none)" : String.join(", ", report.addedScenarios())));
+            System.out.println("  Removed from baseline: "
+                    + (report.removedScenarios().isEmpty() ? "(none)" : String.join(", ", report.removedScenarios())));
+        }
+
+        if (!report.stages().isEmpty()) {
+            System.out.println();
+            System.out.println("Stage diff (pct delta per stage)");
+            System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
+                    "Scenario", "Compose pct", "Layout pct", "Render pct", "Total pct");
+            System.out.println("-".repeat(78));
+            for (StageDiff row : report.stages()) {
+                System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
+                        row.scenario(),
+                        signedPercent(row.composeDeltaPct()),
+                        signedPercent(row.layoutDeltaPct()),
+                        signedPercent(row.renderDeltaPct()),
+                        signedPercent(row.totalDeltaPct()));
+            }
+        }
+
         System.out.println();
         System.out.println("Throughput diff");
         System.out.printf("%-18s | %8s | %12s | %14s%n",
@@ -143,10 +168,29 @@ private void diffCurrentSpeed(DiffInput input,
                                 format(row.candidateAvgMillisPerDoc()),
                                 format(row.avgMillisPerDocDeltaPct())))
                         .toList());
+        Path stagesCsv = artifacts.writeCsv(
+                "stages-diff",
+                List.of("scenario", "baseline_compose_ms", "candidate_compose_ms", "compose_delta_pct", "baseline_layout_ms", "candidate_layout_ms", "layout_delta_pct", "baseline_render_ms", "candidate_render_ms", "render_delta_pct", "baseline_total_ms", "candidate_total_ms", "total_delta_pct"),
+                report.stages().stream()
+                        .map(row -> List.of(
+                                row.scenario(),
+                                format(row.baselineComposeMillis()),
+                                format(row.candidateComposeMillis()),
+                                format(row.composeDeltaPct()),
+                                format(row.baselineLayoutMillis()),
+                                format(row.candidateLayoutMillis()),
+                                format(row.layoutDeltaPct()),
+                                format(row.baselineRenderMillis()),
+                                format(row.candidateRenderMillis()),
+                                format(row.renderDeltaPct()),
+                                format(row.baselineTotalMillis()),
+                                format(row.candidateTotalMillis()),
+                                format(row.totalDeltaPct())))
+                        .toList());
 
         System.out.println();
         System.out.println("Saved JSON diff report to " + jsonPath);
-        System.out.println("Saved CSV diff reports to " + latencyCsv + " and " + throughputCsv);
+        System.out.println("Saved CSV diff reports to " + latencyCsv + ", " + throughputCsv + ", and " + stagesCsv);
     }
 
     private void diffComparative(DiffInput input,
@@ -156,11 +200,11 @@ private void diffComparative(DiffInput input,
         ComparativeDiffReport report = buildComparativeDiff(input, baseline, candidate);
 
         System.out.println("Comparative diff");
-        System.out.printf("%-20s | %12s | %12s%n",
+        System.out.printf("%-24s | %12s | %12s%n",
                 "Library", "Time pct", "Heap pct");
-        System.out.println("-".repeat(52));
+        System.out.println("-".repeat(56));
         for (ComparativeLibraryDiff row : report.libraries()) {
-            System.out.printf("%-20s | %12s | %12s%n",
+            System.out.printf("%-24s | %12s | %12s%n",
                     row.library(),
                     signedPercent(row.avgTimeDeltaPct()),
                     signedPercent(row.avgHeapDeltaPct()));
@@ -214,6 +258,29 @@ private CurrentSpeedDiffReport buildCurrentSpeedDiff(DiffInput input, JsonNode b
                 })
                 .toList();
 
+        Map<String, JsonNode> baselineStages = indexBy(baseline.path("stages"), "scenario");
+        Map<String, JsonNode> candidateStages = indexBy(candidate.path("stages"), "scenario");
+        List<StageDiff> stageDiffs = intersectKeys(baselineStages, candidateStages).stream()
+                .map(key -> {
+                    JsonNode before = baselineStages.get(key);
+                    JsonNode after = candidateStages.get(key);
+                    return new StageDiff(
+                            key,
+                            before.path("composeMillis").asDouble(),
+                            after.path("composeMillis").asDouble(),
+                            percentDelta(before.path("composeMillis").asDouble(), after.path("composeMillis").asDouble()),
+                            before.path("layoutMillis").asDouble(),
+                            after.path("layoutMillis").asDouble(),
+                            percentDelta(before.path("layoutMillis").asDouble(), after.path("layoutMillis").asDouble()),
+                            before.path("renderMillis").asDouble(),
+                            after.path("renderMillis").asDouble(),
+                            percentDelta(before.path("renderMillis").asDouble(), after.path("renderMillis").asDouble()),
+                            before.path("totalMillis").asDouble(),
+                            after.path("totalMillis").asDouble(),
+                            percentDelta(before.path("totalMillis").asDouble(), after.path("totalMillis").asDouble()));
+                })
+                .toList();
+
         Map<String, JsonNode> baselineThroughput = indexThroughput(baseline.path("throughput"));
         Map<String, JsonNode> candidateThroughput = indexThroughput(candidate.path("throughput"));
         List<CurrentSpeedThroughputDiff> throughputDiffs = intersectKeys(baselineThroughput, candidateThroughput).stream()
@@ -237,7 +304,10 @@ private CurrentSpeedDiffReport buildCurrentSpeedDiff(DiffInput input, JsonNode b
                 input.candidatePath().toString(),
                 baseline.path("timestamp").asText(),
                 candidate.path("timestamp").asText(),
+                addedKeys(baselineLatency, candidateLatency),
+                removedKeys(baselineLatency, candidateLatency),
                 latencyDiffs,
+                stageDiffs,
                 throughputDiffs
         );
     }
@@ -294,6 +364,16 @@ private static List<String> intersectKeys(Map<String, JsonNode> left, Map<String
                 .toList();
     }
 
+    /** Keys present in {@code candidate} but not {@code baseline} (new scenarios). */
+    private static List<String> addedKeys(Map<String, JsonNode> baseline, Map<String, JsonNode> candidate) {
+        return candidate.keySet().stream().filter(key -> !baseline.containsKey(key)).sorted().toList();
+    }
+
+    /** Keys present in {@code baseline} but not {@code candidate} (dropped scenarios). */
+    private static List<String> removedKeys(Map<String, JsonNode> baseline, Map<String, JsonNode> candidate) {
+        return baseline.keySet().stream().filter(key -> !candidate.containsKey(key)).sorted().toList();
+    }
+
     private static Iterable<JsonNode> iterable(JsonNode array) {
         return () -> new Iterator<>() {
             private final Iterator<JsonNode> delegate = array.iterator();
@@ -477,11 +557,29 @@ private record CurrentSpeedThroughputDiff(String scenario,
                                               double avgMillisPerDocDeltaPct) {
     }
 
+    private record StageDiff(String scenario,
+                             double baselineComposeMillis,
+                             double candidateComposeMillis,
+                             double composeDeltaPct,
+                             double baselineLayoutMillis,
+                             double candidateLayoutMillis,
+                             double layoutDeltaPct,
+                             double baselineRenderMillis,
+                             double candidateRenderMillis,
+                             double renderDeltaPct,
+                             double baselineTotalMillis,
+                             double candidateTotalMillis,
+                             double totalDeltaPct) {
+    }
+
     private record CurrentSpeedDiffReport(String baselinePath,
                                           String candidatePath,
                                           String baselineTimestamp,
                                           String candidateTimestamp,
+                                          List<String> addedScenarios,
+                                          List<String> removedScenarios,
                                           List<CurrentSpeedLatencyDiff> latency,
+                                          List<StageDiff> stages,
                                           List<CurrentSpeedThroughputDiff> throughput) {
     }
 
diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
index 5eb786649..6a3abb58f 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkMedianTool.java
@@ -24,6 +24,12 @@
  * possible, so it can be diffed by {@link BenchmarkDiffTool}. The tool is meant
  * for local benchmark sessions where a few repeated runs are needed to reduce
  * machine noise before comparing results.</p>
+ *
+ * <p>The current-speed per-stage breakdown ({@code stages[]}) is medianed and
+ * carried into the aggregate when every source run has it (it is present only for
+ * runs with enough measurement iterations), so a median-vs-median
+ * {@link BenchmarkDiffTool} run still attributes a regression to
+ * compose / layout / render.</p>
  */
 public final class BenchmarkMedianTool {
 
@@ -75,6 +81,7 @@ private void aggregateCurrentSpeed(List<ReportFile> reportFiles) throws Exceptio
         List<Integer> threadCounts = requireIntegerArrayConsistency(reportFiles, "threadCounts");
 
         List<CurrentSpeedLatencyMedianRow> latencyRows = aggregateCurrentSpeedLatency(reportFiles);
+        List<CurrentSpeedStageMedianRow> stageRows = aggregateCurrentSpeedStages(reportFiles);
         List<CurrentSpeedThroughputMedianRow> throughputRows = aggregateCurrentSpeedThroughput(reportFiles);
 
         long totalBytesMedian = Math.round(median(
@@ -90,6 +97,7 @@ private void aggregateCurrentSpeed(List<ReportFile> reportFiles) throws Exceptio
                 docsPerThread,
                 threadCounts,
                 latencyRows,
+                stageRows,
                 throughputRows,
                 totalBytesMedian,
                 "median",
@@ -126,12 +134,28 @@ private void aggregateCurrentSpeed(List<ReportFile> reportFiles) throws Exceptio
                                 format(row.avgMillisPerDoc())))
                         .toList());
 
+        Path stagesCsv = null;
+        if (!stageRows.isEmpty()) {
+            stagesCsv = artifacts.writeCsv(
+                    "stages",
+                    List.of("scenario", "compose_ms", "layout_ms", "render_ms", "total_ms"),
+                    stageRows.stream()
+                            .map(row -> List.of(
+                                    row.scenario(),
+                                    format(row.composeMillis()),
+                                    format(row.layoutMillis()),
+                                    format(row.renderMillis()),
+                                    format(row.totalMillis())))
+                            .toList());
+        }
+
         System.out.println("Median benchmark report");
         System.out.println("Suite: current-speed");
         System.out.println("Profile: " + profile);
         System.out.println("Source runs: " + reportFiles.size());
         System.out.println("Saved JSON median report to " + jsonPath);
-        System.out.println("Saved CSV median reports to " + latencyCsv + " and " + throughputCsv);
+        System.out.println("Saved CSV median reports to " + latencyCsv
+                + (stagesCsv != null ? ", " + stagesCsv : "") + " and " + throughputCsv);
     }
 
     private List<CurrentSpeedLatencyMedianRow> aggregateCurrentSpeedLatency(List<ReportFile> reportFiles) {
@@ -165,6 +189,42 @@ private List<CurrentSpeedLatencyMedianRow> aggregateCurrentSpeedLatency(List<Rep
                 .toList();
     }
 
+    private List<CurrentSpeedStageMedianRow> aggregateCurrentSpeedStages(List<ReportFile> reportFiles) {
+        // stages[] is optional: CurrentSpeedBenchmark only emits it when the run
+        // has enough measurement iterations (smoke < 20 emits none). Aggregate only
+        // when EVERY source report carries a non-empty stages[] with the same
+        // scenario set; otherwise return empty so the median report simply carries
+        // no stages — mirroring the benchmark's own conditional emission rather
+        // than throwing on an absent/partial optional field.
+        List<JsonNode> firstRows = iterable(reportFiles.get(0).report().path("stages"));
+        if (firstRows.isEmpty()) {
+            return List.of();
+        }
+        Map<String, JsonNode> firstByScenario = indexBy(firstRows, "scenario");
+        for (ReportFile reportFile : reportFiles) {
+            Map<String, JsonNode> currentByScenario = indexBy(iterable(reportFile.report().path("stages")), "scenario");
+            if (!firstByScenario.keySet().equals(currentByScenario.keySet())) {
+                System.out.println("Note: stages omitted from the median aggregate — "
+                        + "the stage-breakdown scenario set differs across the source runs.");
+                return List.of();
+            }
+        }
+
+        return firstByScenario.keySet().stream()
+                .map(scenario -> {
+                    List<JsonNode> rows = reportFiles.stream()
+                            .map(reportFile -> indexBy(iterable(reportFile.report().path("stages")), "scenario").get(scenario))
+                            .toList();
+                    return new CurrentSpeedStageMedianRow(
+                            scenario,
+                            median(rows, "composeMillis"),
+                            median(rows, "layoutMillis"),
+                            median(rows, "renderMillis"),
+                            median(rows, "totalMillis"));
+                })
+                .toList();
+    }
+
     private List<CurrentSpeedThroughputMedianRow> aggregateCurrentSpeedThroughput(List<ReportFile> reportFiles) {
         List<JsonNode> firstRows = iterable(reportFiles.get(0).report().path("throughput"));
         Map<String, JsonNode> firstByScenario = indexThroughput(firstRows);
@@ -393,6 +453,13 @@ private record CurrentSpeedThroughputMedianRow(String scenario,
                                                    double avgMillisPerDoc) {
     }
 
+    private record CurrentSpeedStageMedianRow(String scenario,
+                                              double composeMillis,
+                                              double layoutMillis,
+                                              double renderMillis,
+                                              double totalMillis) {
+    }
+
     private record CurrentSpeedMedianReport(String timestamp,
                                             String profile,
                                             int warmupIterations,
@@ -400,6 +467,7 @@ private record CurrentSpeedMedianReport(String timestamp,
                                             int docsPerThread,
                                             List<Integer> threadCounts,
                                             List<CurrentSpeedLatencyMedianRow> latency,
+                                            List<CurrentSpeedStageMedianRow> stages,
                                             List<CurrentSpeedThroughputMedianRow> throughput,
                                             long totalBytes,
                                             String aggregation,
diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
index 73e061d3d..51d2b2e42 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
@@ -60,6 +60,14 @@ Path writeCsv(String tableName, List<String> headers, List<List<String>> rows) t
             return archived;
         }
 
+        Path writeMarkdown(String name, String content) throws IOException {
+            Path latest = directory.resolve("latest-" + name + ".md");
+            Path archived = directory.resolve(name + "-" + timestamp + ".md");
+            Files.writeString(latest, content, StandardCharsets.UTF_8);
+            Files.writeString(archived, content, StandardCharsets.UTF_8);
+            return archived;
+        }
+
         Path directory() {
             return directory;
         }
diff --git a/benchmarks/src/main/java/com/demcha/compose/ChartAllocProbe.java b/benchmarks/src/main/java/com/demcha/compose/ChartAllocProbe.java
new file mode 100644
index 000000000..2921bde80
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ChartAllocProbe.java
@@ -0,0 +1,114 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.backend.fixed.pdf.PdfMeasurementResources;
+import com.demcha.compose.document.layout.DocumentGraph;
+import com.demcha.compose.document.layout.DocumentLayoutPassContext;
+import com.demcha.compose.document.layout.LayoutCanvas;
+import com.demcha.compose.document.layout.LayoutCompiler;
+import com.demcha.compose.document.layout.LayoutGraph;
+import com.demcha.compose.document.layout.NodeRegistry;
+import com.demcha.compose.document.node.DocumentNode;
+
+import java.lang.management.ManagementFactory;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Deterministic allocation probe for the v1.8 chart subsystem: warm
+ * (JIT-steady) bytes allocated by the layout-compile pass of a chart-heavy
+ * document (a grouped bar, a multi-series line, and a pie). Charts are resolved
+ * into engine primitives during compile, so this isolates the chart-resolve +
+ * geometry-emission allocation — the noise-free signal a develop-vs-branch A/B
+ * needs. No {@code src/main} changes.
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ChartAllocProbe {
+
+    private static final com.sun.management.ThreadMXBean THREAD_MX =
+            (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+
+    private static final int WARMUP = 60;
+    private static final int MEASURE = 11;
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+        enableAllocationMeasurement();
+
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(24, 24, 24, 24)
+                .create()) {
+            session.pageFlow(flow -> flow
+                    .chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle())
+                    .chart(ChartBenchmarkFixtures.lineSpec(), ChartBenchmarkFixtures.lineStyle())
+                    .chart(ChartBenchmarkFixtures.pieSpec()));
+
+            List<DocumentNode> roots = session.roots();
+            LayoutCanvas canvas = session.canvas();
+            NodeRegistry registry = session.registry();
+
+            try (PdfMeasurementResources resources = PdfMeasurementResources.open(List.of())) {
+                LayoutCompiler compiler = new LayoutCompiler(registry);
+                DocumentGraph graph = new DocumentGraph(roots);
+
+                int pages = 0;
+                // Warm up so the measured allocation is JIT steady state, not
+                // class-load / first-call cold start.
+                for (int i = 0; i < WARMUP; i++) {
+                    pages = compile(compiler, graph, registry, canvas, resources).totalPages();
+                }
+
+                long[] alloc = new long[MEASURE];
+                for (int m = 0; m < MEASURE; m++) {
+                    long before = currentThreadAllocatedBytes();
+                    LayoutGraph layout = compile(compiler, graph, registry, canvas, resources);
+                    alloc[m] = before < 0 ? -1 : currentThreadAllocatedBytes() - before;
+                    pages = layout.totalPages();
+                }
+                Arrays.sort(alloc);
+
+                System.out.println("GraphCompose chart layout-compile allocation probe");
+                System.out.printf("document: grouped bar + line (12 cats x 3 series) + 6-slice pie, pages: %d%n", pages);
+                System.out.printf("warm compile allocation (median of %d): %s%n",
+                        MEASURE, kb(alloc[MEASURE / 2]));
+                System.out.printf("  min %s / max %s%n", kb(alloc[0]), kb(alloc[MEASURE - 1]));
+            }
+        }
+    }
+
+    private static LayoutGraph compile(LayoutCompiler compiler, DocumentGraph graph,
+                                       NodeRegistry registry, LayoutCanvas canvas,
+                                       PdfMeasurementResources resources) {
+        DocumentLayoutPassContext context = new DocumentLayoutPassContext(
+                registry, canvas, resources.fontLibrary(), resources.textMeasurementSystem(), false);
+        return compiler.compile(graph, context, context);
+    }
+
+    private static String kb(long bytes) {
+        return bytes < 0 ? "n/a (allocation measurement unsupported)" : "%.1f KB".formatted(bytes / 1024.0);
+    }
+
+    private static void enableAllocationMeasurement() {
+        try {
+            if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                THREAD_MX.setThreadAllocatedMemoryEnabled(true);
+            }
+        } catch (UnsupportedOperationException ignored) {
+            // Allocation measurement unsupported on this JVM; the probe reports n/a.
+        }
+    }
+
+    private static long currentThreadAllocatedBytes() {
+        try {
+            if (!THREAD_MX.isThreadAllocatedMemorySupported() || !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                return -1;
+            }
+        } catch (UnsupportedOperationException ex) {
+            return -1;
+        }
+        return THREAD_MX.getCurrentThreadAllocatedBytes();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
new file mode 100644
index 000000000..1993acb36
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ChartBenchmarkFixtures.java
@@ -0,0 +1,134 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.chart.AxisSpec;
+import com.demcha.compose.document.chart.BarGrouping;
+import com.demcha.compose.document.chart.ChartData;
+import com.demcha.compose.document.chart.ChartSize;
+import com.demcha.compose.document.chart.ChartSpec;
+import com.demcha.compose.document.chart.ChartStyle;
+import com.demcha.compose.document.chart.LegendPosition;
+import com.demcha.compose.document.chart.PointMarker;
+import com.demcha.compose.document.chart.SliceLabelMode;
+import com.demcha.compose.document.chart.ValueLabelMode;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.style.DocumentStroke;
+
+/**
+ * Shared fixtures for the v1.8 chart benchmarks: a non-trivial grouped bar and
+ * multi-series line (12 categories × 3 series) plus a 6-slice pie. Charts
+ * compile at layout time into ordinary shapes / lines / polygons / labels, so
+ * these stress {@code ChartLayoutResolver} + per-primitive geometry + label
+ * text-metrics — the cost no text/table bench exercises.
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ChartBenchmarkFixtures {
+
+    private ChartBenchmarkFixtures() {
+    }
+
+    /** 12 categories × 3 series — a representative grouped-bar / line workload. */
+    public static ChartData monthlySeries() {
+        return ChartData.builder()
+                .categories("Jan", "Feb", "Mar", "Apr", "May", "Jun",
+                        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
+                .series("2023", 12.4, 15.1, 9.8, 14.2, 16.0, 13.3, 17.1, 18.4, 15.9, 14.0, 19.2, 21.1)
+                .series("2024", 14.0, 18.2, 11.3, 16.9, 17.5, 15.0, 19.0, 20.2, 17.1, 16.4, 21.0, 23.5)
+                .series("2025", 15.5, 19.0, 12.0, 18.0, 19.1, 16.2, 20.5, 22.0, 18.9, 17.7, 22.8, 25.0)
+                .build();
+    }
+
+    /** 6-slice single-series data for the pie. */
+    public static ChartData regionShare() {
+        return ChartData.builder()
+                .categories("EMEA", "Americas", "APAC", "LATAM", "MEA", "Other")
+                .series("Share", 31.0, 27.0, 19.0, 10.0, 8.0, 5.0)
+                .build();
+    }
+
+    public static ChartSpec barSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .valueLabels(ValueLabelMode.OUTSIDE)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    public static ChartStyle barStyle() {
+        return ChartStyle.builder()
+                .seriesPaint(0, DocumentPaint.solid(DocumentColor.rgb(20, 80, 95)))
+                .seriesPaint(1, DocumentPaint.solid(DocumentColor.rgb(196, 153, 76)))
+                .seriesPaint(2, DocumentPaint.solid(DocumentColor.rgb(120, 60, 140)))
+                .build();
+    }
+
+    public static ChartSpec lineSpec() {
+        return ChartSpec.line()
+                .data(monthlySeries())
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    public static ChartStyle lineStyle() {
+        return ChartStyle.builder()
+                .lineWidth(1.8)
+                .pointMarker(PointMarker.circle(5.0)
+                        .withStroke(DocumentStroke.of(DocumentColor.WHITE, 1.0)))
+                .build();
+    }
+
+    public static ChartSpec pieSpec() {
+        return ChartSpec.pie()
+                .data(regionShare())
+                .sliceLabels(SliceLabelMode.CATEGORY_PERCENT)
+                .size(ChartSize.fixedHeight(190))
+                .build();
+    }
+
+    /** Horizontal grouped bar — exercises the transposed (category-on-Y) layout branch. */
+    public static ChartSpec horizontalBarSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .horizontal(true)
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 9))
+                .build();
+    }
+
+    /** Stacked bar — exercises the cumulative-stacking layout branch. */
+    public static ChartSpec stackedBarSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .grouping(BarGrouping.STACKED)
+                .valueAxis(AxisSpec.builder().baselineAtZero(true).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    /** Bar with a non-zero value-axis minimum — exercises the lifted-baseline branch. */
+    public static ChartSpec axisMinBarSpec() {
+        return ChartSpec.bar()
+                .data(monthlySeries())
+                .valueAxis(AxisSpec.builder().min(8.0).build())
+                .legend(LegendPosition.BOTTOM)
+                .size(ChartSize.aspectRatio(16, 7))
+                .build();
+    }
+
+    /** Donut — exercises the pie's donut-ratio (inner-radius) branch. */
+    public static ChartSpec donutSpec() {
+        return ChartSpec.pie()
+                .data(regionShare())
+                .donutRatio(0.55)
+                .sliceLabels(SliceLabelMode.CATEGORY_PERCENT)
+                .size(ChartSize.fixedHeight(190))
+                .build();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
index 76cd87c70..b37215fcc 100644
--- a/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/ComparativeBenchmark.java
@@ -1,25 +1,36 @@
 package com.demcha.compose;
 
+import com.demcha.compose.document.api.DocumentPageSize;
 import com.demcha.compose.document.api.DocumentSession;
 import com.demcha.compose.document.node.ContainerNode;
 import com.demcha.compose.document.node.ParagraphNode;
 import com.demcha.compose.document.node.TextAlign;
 import com.demcha.compose.document.style.DocumentInsets;
 import com.demcha.compose.document.style.DocumentTextStyle;
-import com.itextpdf.text.Document;
-import com.itextpdf.text.Paragraph;
-import com.itextpdf.text.pdf.PdfPTable;
-import com.itextpdf.text.pdf.PdfWriter;
+import com.demcha.compose.document.table.DocumentTableColumn;
+import com.itextpdf.kernel.pdf.PdfDocument;
+import com.itextpdf.kernel.pdf.PdfWriter;
+import com.itextpdf.layout.Document;
+import com.itextpdf.layout.element.Cell;
+import com.itextpdf.layout.element.Paragraph;
+import com.itextpdf.layout.element.Table;
+import com.itextpdf.layout.properties.UnitValue;
 import net.sf.jasperreports.engine.*;
+import net.sf.jasperreports.engine.data.JRMapCollectionDataSource;
 import net.sf.jasperreports.engine.design.*;
+import net.sf.jasperreports.engine.type.TextAdjustEnum;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 
 import java.io.ByteArrayOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.time.LocalDateTime;
 import java.time.format.DateTimeFormatter;
 import java.lang.management.ManagementFactory;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 /**
  * Fair Comparative Benchmark (CPU & RAM)
@@ -32,8 +43,23 @@ public class ComparativeBenchmark {
     private static final int WARMUP_ITERATIONS = 50;
     private static final int MEASUREMENT_ITERATIONS = 100;
 
+    // Report-scaling sweep: the same title + prose + N-row table rendered through
+    // every library at growing row counts, so the numbers show how each engine
+    // SCALES (and whether GraphCompose's lead widens with document size) instead
+    // of at a single fixed size. The heavy sizes use fewer iterations to keep the
+    // on-demand run reasonable; this is a directional comparative, not a strict
+    // JMH measurement (see benchmarks/README.md).
+    private static final int[] SWEEP_SIZES = {40, 200, 1000};
+    private static final int SWEEP_WARMUP_ITERATIONS = 20;
+    private static final int SWEEP_MEASUREMENT_ITERATIONS = 30;
+
+    private static final String REPORT_PROSE =
+            ("GraphCompose lays out structured business documents across many pages "
+                    + "while keeping header and footer placement stable. ").repeat(6);
+
     // Предкомпилированный отчет для честного теста Jasper
     private static JasperReport compiledJasperReport;
+    private static JasperReport compiledJasperReportHeavy;
 
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
@@ -41,28 +67,69 @@ public static void main(String[] args) throws Exception {
         System.out.println("Timestamp: " + LocalDateTime.now().format(TIMESTAMP_FORMAT));
         System.out.println("------------------------------------------------------------");
 
-        // Подготавливаем Jasper 1 раз (как в Production)
+        // Per-thread allocation accounting backs the "Avg Heap (MB)" column and the
+        // heap-advantage ratios. Enable it explicitly (and bail loudly if the JVM
+        // does not support it) instead of trusting the platform default, matching
+        // the guard the other allocation probes in this module use.
+        com.sun.management.ThreadMXBean allocBean =
+                (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+        if (!allocBean.isThreadAllocatedMemorySupported()) {
+            throw new IllegalStateException("Thread allocated-memory measurement is not supported on this JVM");
+        }
+        allocBean.setThreadAllocatedMemoryEnabled(true);
+
+        // Подготавливаем оба отчета Jasper 1 раз (как в Production)
         setupJasper();
+        setupJasperReport();
 
-        // Прогрев JVM (JIT компилятор)
+        // Прогрев JVM (JIT компилятор) — оба сценария
         System.out.println("Warming up JVM...");
         for (int i = 0; i < WARMUP_ITERATIONS; i++) {
             benchmarkGraphComposeCanonical();
             benchmarkIText();
             benchmarkJasper();
         }
+        for (int i = 0; i < SWEEP_WARMUP_ITERATIONS; i++) {
+            for (int size : SWEEP_SIZES) {
+                benchmarkGraphComposeReport(size);
+                benchmarkITextReport(size);
+                benchmarkJasperReport(size);
+            }
+        }
+
+        // Замер — два сценария: дешёвый (фиксированные накладные) и масштабирование отчёта
+        System.out.println("Measuring performance...");
+        List<ComparativeRow> rows = new ArrayList<>();
 
-        // Замер
-        System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
         System.out.println();
-        System.out.printf("%-24s | %14s | %14s%n", "Library", "Avg Time (ms)", "Avg Heap (MB)");
-        System.out.println("-".repeat(60));
+        System.out.println("Scenario: small invoice (single page, ~3 lines), " + MEASUREMENT_ITERATIONS + " iterations");
+        printTableHeader();
+        rows.add(runBenchmark("GraphCompose Canonical", MEASUREMENT_ITERATIONS, ComparativeBenchmark::benchmarkGraphComposeCanonical).toRow());
+        rows.add(runBenchmark("iText 9", MEASUREMENT_ITERATIONS, ComparativeBenchmark::benchmarkIText).toRow());
+        rows.add(runBenchmark("JasperReports", MEASUREMENT_ITERATIONS, ComparativeBenchmark::benchmarkJasper).toRow());
 
-        List<ComparativeRow> rows = List.of(
-                runBenchmark("GraphCompose Canonical", ComparativeBenchmark::benchmarkGraphComposeCanonical),
-                runBenchmark("iText 5 (Old)", ComparativeBenchmark::benchmarkIText),
-                runBenchmark("JasperReports", ComparativeBenchmark::benchmarkJasper)
-        );
+        System.out.println();
+        System.out.println("Scenario: report scaling sweep (title + prose + N-row table), "
+                + SWEEP_MEASUREMENT_ITERATIONS + " iterations per size");
+        List<ScalingPoint> scaling = new ArrayList<>();
+        for (int size : SWEEP_SIZES) {
+            System.out.println();
+            System.out.println("  N = " + size + " rows");
+            printTableHeader();
+            Measured gc = runBenchmark("GraphCompose (" + size + " rows)", SWEEP_MEASUREMENT_ITERATIONS,
+                    () -> benchmarkGraphComposeReport(size));
+            Measured it = runBenchmark("iText 9 (" + size + " rows)", SWEEP_MEASUREMENT_ITERATIONS,
+                    () -> benchmarkITextReport(size));
+            Measured js = runBenchmark("JasperReports (" + size + " rows)", SWEEP_MEASUREMENT_ITERATIONS,
+                    () -> benchmarkJasperReport(size));
+            rows.add(gc.toRow());
+            rows.add(it.toRow());
+            rows.add(js.toRow());
+            // Ratios are computed from the full-precision averages, not the rounded
+            // report rows, so the advantage figures don't compound rounding error.
+            scaling.add(new ScalingPoint(size, gc, it, js));
+        }
+        printScalingSummary(scaling);
 
         BenchmarkReportWriter.BenchmarkArtifacts artifacts = BenchmarkReportWriter.prepare("comparative");
         ComparativeReport report = new ComparativeReport(
@@ -83,16 +150,46 @@ public static void main(String[] args) throws Exception {
         System.out.println("-".repeat(60));
         System.out.println("Saved JSON benchmark report to " + jsonPath);
         System.out.println("Saved CSV benchmark report to " + csvPath);
+
+        // After all measurement, dump one rendered PDF per library/scenario so the
+        // exact documents that were benchmarked can be inspected visually. This runs
+        // outside the measured region, so it cannot affect the timing/allocation numbers.
+        Path samples = writeSampleRenders(artifacts.directory().resolve("samples"));
+        System.out.println("Saved sample renders (one PDF per library/scenario) to " + samples);
+    }
+
+    /**
+     * Renders each library/scenario once more and writes the bytes to PDF files,
+     * so a reader can open the actual documents the benchmark measured.
+     */
+    private static Path writeSampleRenders(Path directory) throws Exception {
+        Files.createDirectories(directory);
+        Files.write(directory.resolve("graphcompose-small.pdf"), benchmarkGraphComposeCanonical());
+        Files.write(directory.resolve("itext-small.pdf"), benchmarkIText());
+        Files.write(directory.resolve("jasper-small.pdf"), benchmarkJasper());
+        // The smallest and largest sweep sizes, so the reader can see both a short
+        // report and the multi-page document that drives the scaling numbers.
+        for (int size : new int[]{SWEEP_SIZES[0], SWEEP_SIZES[SWEEP_SIZES.length - 1]}) {
+            Files.write(directory.resolve("graphcompose-report-" + size + ".pdf"), benchmarkGraphComposeReport(size));
+            Files.write(directory.resolve("itext-report-" + size + ".pdf"), benchmarkITextReport(size));
+            Files.write(directory.resolve("jasper-report-" + size + ".pdf"), benchmarkJasperReport(size));
+        }
+        return directory;
+    }
+
+    private static void printTableHeader() {
+        System.out.printf("%-24s | %14s | %14s%n", "Library", "Avg Time (ms)", "Avg Heap (MB)");
+        System.out.println("-".repeat(60));
     }
 
-    private static ComparativeRow runBenchmark(String name, BenchmarkTask task) throws Exception {
+    private static Measured runBenchmark(String name, int iterations, BenchmarkTask task) throws Exception {
         long totalTimeNs = 0;
         long totalAllocatedBytes = 0;
         long dummyAccumulator = 0; // Защита от Dead Code Elimination
 
         com.sun.management.ThreadMXBean bean = (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
 
-        for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
+        for (int i = 0; i < iterations; i++) {
             System.gc(); // Форсируем сборку мусора перед каждым замером для чистоты аллокации
 
             long startBytes = bean.getThreadAllocatedBytes(Thread.currentThread().getId());
@@ -109,19 +206,15 @@ private static ComparativeRow runBenchmark(String name, BenchmarkTask task) thro
             dummyAccumulator += pdfBytes.length;
         }
 
-        double avgTimeMs = (totalTimeNs / (double) MEASUREMENT_ITERATIONS) / 1_000_000.0;
-        double avgMemMb = (totalAllocatedBytes / (double) MEASUREMENT_ITERATIONS) / (1024.0 * 1024.0);
+        double avgTimeMs = (totalTimeNs / (double) iterations) / 1_000_000.0;
+        double avgMemMb = (totalAllocatedBytes / (double) iterations) / (1024.0 * 1024.0);
 
         System.out.printf("%-24s | %14.2f | %14.2f%n", name, avgTimeMs, avgMemMb);
 
         // Печатаем dummy-переменную, чтобы JIT не вырезал код генерации
         if (dummyAccumulator == 0) System.out.println("Error: No bytes generated");
 
-        return new ComparativeRow(
-                name,
-                round(avgTimeMs),
-                round(avgMemMb)
-        );
+        return new Measured(name, avgTimeMs, avgMemMb);
     }
 
     /**
@@ -145,24 +238,78 @@ private static byte[] benchmarkGraphComposeCanonical() throws Exception {
         }
     }
 
+    /**
+     * GraphCompose canonical, multi-page report: title + {@code rows}-row table +
+     * prose, authored through the public page-flow DSL (the realistic consumer path).
+     */
+    private static byte[] benchmarkGraphComposeReport(int rows) throws Exception {
+        // Equal full-width columns (page width minus the 32pt L/R margins, split
+        // four ways), so the table fills the page like iText (setWidthPercentage
+        // 100) and Jasper (full-column-width cells) rather than hugging its text.
+        final double columnWidth = (DocumentPageSize.A4.width() - 2 * 32) / 4.0;
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4).margin(DocumentInsets.of(32)).create()) {
+            session.pageFlow(flow -> {
+                flow.name("Report").spacing(8);
+                flow.addParagraph("Quarterly Business Report");
+                flow.addParagraph(REPORT_PROSE);
+                flow.addTable(t -> {
+                    t.columns(
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth))
+                            .header("Item", "Qty", "Unit", "Total").repeatHeader();
+                    for (int r = 1; r <= rows; r++) {
+                        t.row("Line item " + r, "3", "ea", "38.75");
+                    }
+                });
+                flow.addParagraph(REPORT_PROSE);
+            });
+            return session.toPdfBytes();
+        }
+    }
+
     /**
      * iText: Тестируем с таблицей, чтобы заставить его рассчитывать геометрию
      */
     private static byte[] benchmarkIText() throws Exception {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        Document document = new Document();
-        PdfWriter.getInstance(document, baos);
-        document.open();
-
-        // Используем таблицу, чтобы iText делал расчет ширины (как GraphCompose)
-        PdfPTable table = new PdfPTable(1);
-        table.setWidthPercentage(100);
-        table.addCell(new Paragraph("INVOICE #12345"));
-        table.addCell(new Paragraph("Customer: John Doe"));
-        table.addCell(new Paragraph("Amount: $1,000.00"));
-
-        document.add(table);
-        document.close();
+        // iText 9 (kernel + layout). A full-width 1-column table makes iText do
+        // the same width calculation GraphCompose does.
+        try (Document document = new Document(new PdfDocument(new PdfWriter(baos)))) {
+            Table table = new Table(UnitValue.createPercentArray(new float[]{1})).useAllAvailableWidth();
+            table.addCell(new Cell().add(new Paragraph("INVOICE #12345")));
+            table.addCell(new Cell().add(new Paragraph("Customer: John Doe")));
+            table.addCell(new Cell().add(new Paragraph("Amount: $1,000.00")));
+            document.add(table);
+        }
+        return baos.toByteArray();
+    }
+
+    /**
+     * iText, multi-page report: same title + {@code rows}-row table + prose. iText
+     * paginates the table natively, so this exercises real multi-page layout.
+     */
+    private static byte[] benchmarkITextReport(int rows) throws Exception {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (Document document = new Document(new PdfDocument(new PdfWriter(baos)))) {
+            document.add(new Paragraph("Quarterly Business Report"));
+            document.add(new Paragraph(REPORT_PROSE));
+
+            Table table = new Table(UnitValue.createPercentArray(new float[]{1, 1, 1, 1})).useAllAvailableWidth();
+            for (String header : new String[]{"Item", "Qty", "Unit", "Total"}) {
+                table.addHeaderCell(new Cell().add(new Paragraph(header)));
+            }
+            for (int r = 1; r <= rows; r++) {
+                table.addCell(new Cell().add(new Paragraph("Line item " + r)));
+                table.addCell(new Cell().add(new Paragraph("3")));
+                table.addCell(new Cell().add(new Paragraph("ea")));
+                table.addCell(new Cell().add(new Paragraph("38.75")));
+            }
+            document.add(table);
+            document.add(new Paragraph(REPORT_PROSE));
+        }
         return baos.toByteArray();
     }
 
@@ -199,6 +346,124 @@ private static void setupJasper() throws Exception {
         compiledJasperReport = JasperCompileManager.compileReport(jd);
     }
 
+    /**
+     * JasperReports, multi-page report: a 4-field detail band filled from a
+     * {@code rows}-row data source, with a title (+ prose) and column header.
+     * Compiled once here; the benchmark measures fill + PDF export.
+     */
+    private static byte[] benchmarkJasperReport(int rows) throws Exception {
+        List<Map<String, ?>> data = new ArrayList<>(rows);
+        for (int r = 1; r <= rows; r++) {
+            Map<String, Object> row = new HashMap<>();
+            row.put("item", "Line item " + r);
+            row.put("qty", "3");
+            row.put("unit", "ea");
+            row.put("total", "38.75");
+            data.add(row);
+        }
+        Map<String, Object> parameters = new HashMap<>();
+        parameters.put("prose", REPORT_PROSE);
+        JasperPrint jp = JasperFillManager.fillReport(
+                compiledJasperReportHeavy, parameters, new JRMapCollectionDataSource(data));
+        return JasperExportManager.exportReportToPdf(jp);
+    }
+
+    /** A full-width prose text field that wraps and grows, so all of {@code REPORT_PROSE} renders. */
+    private static JRDesignTextField proseField(int y) {
+        JRDesignTextField field = new JRDesignTextField();
+        field.setX(0);
+        field.setY(y);
+        field.setWidth(555);
+        field.setHeight(14);
+        field.setTextAdjust(TextAdjustEnum.STRETCH_HEIGHT);
+        JRDesignExpression expression = new JRDesignExpression();
+        expression.setText("$P{prose}");
+        field.setExpression(expression);
+        return field;
+    }
+
+    /** Compiles the multi-row Jasper report design once, before measurement. */
+    private static void setupJasperReport() throws Exception {
+        JasperDesign jd = new JasperDesign();
+        jd.setName("Report");
+        jd.setPageWidth(595);
+        jd.setPageHeight(842);
+        jd.setLeftMargin(20);
+        jd.setRightMargin(20);
+        jd.setTopMargin(20);
+        jd.setBottomMargin(20);
+        jd.setColumnWidth(555);
+
+        String[] fields = {"item", "qty", "unit", "total"};
+        for (String name : fields) {
+            JRDesignField field = new JRDesignField();
+            field.setName(name);
+            field.setValueClass(String.class);
+            jd.addField(field);
+        }
+
+        // Prose is a parameter rendered through a stretching text field, so all of
+        // REPORT_PROSE wraps and renders (a fixed static-text box would clip it),
+        // matching the full prose GraphCompose and iText lay out.
+        JRDesignParameter proseParameter = new JRDesignParameter();
+        proseParameter.setName("prose");
+        proseParameter.setValueClass(String.class);
+        jd.addParameter(proseParameter);
+
+        // Title band: heading + the first full prose block.
+        JRDesignBand title = new JRDesignBand();
+        title.setHeight(40);
+        JRDesignStaticText heading = new JRDesignStaticText();
+        heading.setX(0);
+        heading.setY(0);
+        heading.setWidth(555);
+        heading.setHeight(20);
+        heading.setText("Quarterly Business Report");
+        title.addElement(heading);
+        title.addElement(proseField(22));
+        jd.setTitle(title);
+
+        // Summary band: the second full prose block (the other two libs render
+        // prose both before and after the table).
+        JRDesignBand summary = new JRDesignBand();
+        summary.setHeight(16);
+        summary.addElement(proseField(0));
+        jd.setSummary(summary);
+
+        // Column header band.
+        String[] headers = {"Item", "Qty", "Unit", "Total"};
+        JRDesignBand columnHeader = new JRDesignBand();
+        columnHeader.setHeight(20);
+        for (int i = 0; i < headers.length; i++) {
+            JRDesignStaticText cell = new JRDesignStaticText();
+            cell.setX(i * 138);
+            cell.setY(0);
+            cell.setWidth(i == headers.length - 1 ? 555 - i * 138 : 138);
+            cell.setHeight(18);
+            cell.setText(headers[i]);
+            columnHeader.addElement(cell);
+        }
+        jd.setColumnHeader(columnHeader);
+
+        // Detail band: one row per data-source record.
+        JRDesignBand detail = new JRDesignBand();
+        detail.setHeight(18);
+        for (int i = 0; i < fields.length; i++) {
+            JRDesignTextField cell = new JRDesignTextField();
+            cell.setX(i * 138);
+            cell.setY(0);
+            cell.setWidth(i == fields.length - 1 ? 555 - i * 138 : 138);
+            cell.setHeight(16);
+            JRDesignExpression expression = new JRDesignExpression();
+            expression.setText("$F{" + fields[i] + "}");
+            cell.setExpression(expression);
+            detail.addElement(cell);
+        }
+        ((JRDesignSection) jd.getDetailSection()).addBand(detail);
+
+        compiledJasperReportHeavy = JasperCompileManager.compileReport(jd);
+    }
+
     @FunctionalInterface
     public interface BenchmarkTask {
         byte[] runAndGetBytes() throws Exception;
@@ -208,9 +473,49 @@ private static double round(double value) {
         return Math.round(value * 100.0) / 100.0;
     }
 
+    /**
+     * Prints how GraphCompose's time/memory advantage over iText and Jasper changes
+     * as the row count grows, so the "does the lead widen with document size?"
+     * question is answered by the numbers rather than asserted. A ratio above 1.0
+     * means GraphCompose is that many times faster / lighter at that size.
+     */
+    private static void printScalingSummary(List<ScalingPoint> scaling) {
+        System.out.println();
+        System.out.println("Scaling summary (GraphCompose advantage; >1.0 = GraphCompose faster / lighter)");
+        System.out.printf("%-8s | %16s | %16s | %16s | %16s%n",
+                "Rows", "Time vs iText", "Time vs Jasper", "Heap vs iText", "Heap vs Jasper");
+        System.out.println("-".repeat(86));
+        for (ScalingPoint p : scaling) {
+            System.out.printf("%-8d | %16s | %16s | %16s | %16s%n",
+                    p.rows(),
+                    ratio(p.iText().timeMs(), p.graphCompose().timeMs()),
+                    ratio(p.jasper().timeMs(), p.graphCompose().timeMs()),
+                    ratio(p.iText().heapMb(), p.graphCompose().heapMb()),
+                    ratio(p.jasper().heapMb(), p.graphCompose().heapMb()));
+        }
+    }
+
+    /** {@code other / graphCompose} as an "Nx" string; guards against divide-by-zero. */
+    private static String ratio(double other, double graphCompose) {
+        if (graphCompose <= 0.0) {
+            return "n/a";
+        }
+        return "%.2fx".formatted(other / graphCompose);
+    }
+
     private record ComparativeRow(String library, double avgTimeMs, double avgHeapMb) {
     }
 
+    /** Full-precision average for one library/scenario, before report rounding. */
+    private record Measured(String name, double timeMs, double heapMb) {
+        ComparativeRow toRow() {
+            return new ComparativeRow(name, round(timeMs), round(heapMb));
+        }
+    }
+
+    private record ScalingPoint(int rows, Measured graphCompose, Measured iText, Measured jasper) {
+    }
+
     private record ComparativeReport(String timestamp,
                                      int warmupIterations,
                                      int measurementIterations,
diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index 2858d64a6..46706038a 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -8,8 +8,10 @@
 import com.demcha.compose.document.backend.fixed.pdf.options.PdfWatermarkOptions;
 import com.demcha.compose.document.backend.fixed.pdf.options.PdfWatermarkPosition;
 import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
 import com.demcha.compose.document.style.DocumentTextDecoration;
 import com.demcha.compose.document.style.DocumentTextStyle;
+import com.demcha.compose.document.svg.SvgIcon;
 import com.demcha.compose.document.templates.api.DocumentTemplate;
 import com.demcha.compose.document.templates.builtins.InvoiceTemplateV1;
 import com.demcha.compose.document.templates.builtins.ProposalTemplateV1;
@@ -32,6 +34,7 @@
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+import java.util.function.Function;
 
 /**
  * Focused local benchmark harness for current GraphCompose performance.
@@ -47,6 +50,8 @@
  *     <li>the built-in CV template</li>
  *     <li>a longer multi-page proposal template</li>
  *     <li>a feature-rich document with QR/barcode, watermark, page break, and footer</li>
+ *     <li>long unbreakable tokens forcing character-level wrap</li>
+ *     <li>a v1.8 vector-rich document (bar/pie charts, SVG icons, gradient path)</li>
  * </ul>
  */
 public final class CurrentSpeedBenchmark {
@@ -55,7 +60,9 @@ public final class CurrentSpeedBenchmark {
     private static final int DEFAULT_FULL_WARMUP_ITERATIONS = 12;
     private static final int DEFAULT_FULL_MEASUREMENT_ITERATIONS = 40;
     private static final int DEFAULT_FULL_DOCS_PER_THREAD = 12;
-    private static final String DEFAULT_FULL_THREAD_COUNTS = "1,2,4,8";
+    // The 16-thread tier is absorbed from the removed ScalabilityBenchmark so the
+    // full profile keeps a thread-scaling data point (smoke runs no throughput).
+    private static final String DEFAULT_FULL_THREAD_COUNTS = "1,2,4,8,16";
     // Bumped from 2/5 to 30/100 so smoke runs reach a steady JIT state and the
     // p95 calculation actually has enough samples to interpolate rather than
     // collapsing to the maximum observed time. The smoke profile remains the
@@ -84,6 +91,43 @@ public final class CurrentSpeedBenchmark {
     private final InvoiceDocumentSpec invoice = CanonicalBenchmarkSupport.canonicalInvoice();
     private final ProposalDocumentSpec proposal = CanonicalBenchmarkSupport.canonicalProposal();
     private final CvSpec cv = CanonicalBenchmarkSupport.canonicalCv();
+    // Parsed/built once (like the template fixtures above) so the vector-rich
+    // scenario measures the render, not a per-iteration SVG re-parse.
+    private final SvgIcon vectorRichIcon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+    private final DocumentPaint vectorRichAccent = DocumentPaint.linear(
+            DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+
+    // Canonical scenario list, in table order. Declared statically (the
+    // renderer is bound to an instance at run time) so the gate-coverage guard
+    // test can read the scenario names without re-measuring: a scenario added
+    // here without a matching SMOKE threshold below would silently escape the
+    // perf gate, and CurrentSpeedScenarioGateTest fails loudly if that happens.
+    private static final List<ScenarioDef> SCENARIO_DEFS = List.of(
+            new ScenarioDef("engine-simple", "One-page engine composition",
+                    b -> b::renderEngineSimpleDocument),
+            new ScenarioDef("invoice-template", "Compose-first invoice template",
+                    b -> b::renderInvoiceTemplateDocument),
+            new ScenarioDef("cv-template", "Compose-first CV template",
+                    b -> b::renderCvTemplateDocument),
+            new ScenarioDef("proposal-template", "Long multi-page proposal template",
+                    b -> b::renderProposalTemplateDocument),
+            new ScenarioDef("feature-rich", "QR, barcode, watermark, header/footer, page break",
+                    b -> b::renderFeatureRichDocument),
+            new ScenarioDef("long-token", "Long unbreakable tokens (URLs/IDs) forcing character-level wrap",
+                    b -> b::renderLongTokenDocument),
+            new ScenarioDef("vector-rich", "v1.8 vector surface: bar + pie charts, SVG icons, gradient path",
+                    b -> b::renderVectorRichDocument)
+    );
+
+    /**
+     * Ordered scenario names. Read by {@code CurrentSpeedScenarioGateTest} to
+     * assert every scenario is covered by a SMOKE gate threshold.
+     *
+     * @return the canonical scenario names in table order
+     */
+    static List<String> scenarioNames() {
+        return SCENARIO_DEFS.stream().map(ScenarioDef::name).toList();
+    }
 
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
@@ -107,14 +151,9 @@ private void run() throws Exception {
         System.out.println("Perf gate: " + (enforceGate ? "enabled" : "disabled"));
         System.out.println();
 
-        List<Scenario> scenarios = List.of(
-                new Scenario("engine-simple", "One-page engine composition", this::renderEngineSimpleDocument),
-                new Scenario("invoice-template", "Compose-first invoice template", this::renderInvoiceTemplateDocument),
-                new Scenario("cv-template", "Compose-first CV template", this::renderCvTemplateDocument),
-                new Scenario("proposal-template", "Long multi-page proposal template", this::renderProposalTemplateDocument),
-                new Scenario("feature-rich", "QR, barcode, watermark, header/footer, page break", this::renderFeatureRichDocument),
-                new Scenario("long-token", "Long unbreakable tokens (URLs/IDs) forcing character-level wrap", this::renderLongTokenDocument)
-        );
+        List<Scenario> scenarios = SCENARIO_DEFS.stream()
+                .map(def -> new Scenario(def.name(), def.description(), def.renderer().apply(this)))
+                .toList();
 
         System.out.println("Latency benchmark");
         System.out.printf("%-18s | %10s | %10s | %10s | %10s | %11s | %10s | %10s%n",
@@ -141,20 +180,21 @@ private void run() throws Exception {
 
         // Stage breakdown: for each template scenario we time compose / layout
         // / render separately so consumers can attribute regressions to the
-        // engine vs. PDFBox. Engine-simple and feature-rich scenarios also
-        // use the canonical pipeline and benefit from the same probe.
+        // engine vs. PDFBox. Only the template scenarios are probed here; the
+        // latency table above still covers every scenario.
+        List<StageRow> stageRows = new ArrayList<>();
         if (profile != BenchmarkProfile.SMOKE || config.measurementIterations() >= 20) {
             System.out.println();
             System.out.println("Stage breakdown (median ms per stage)");
             System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
                     "Scenario", "Compose", "Layout", "Render", "Total");
             System.out.println("-".repeat(78));
-            runStageBreakdown("invoice-template", () -> openInvoiceSession(),
-                    s -> invoiceTemplate.compose(s, invoice), config.measurementIterations());
-            runStageBreakdown("cv-template", () -> openCvSession(),
-                    s -> cvTemplate.compose(s, cv), config.measurementIterations());
-            runStageBreakdown("proposal-template", () -> openProposalSession(),
-                    s -> proposalTemplate.compose(s, proposal), config.measurementIterations());
+            stageRows.add(runStageBreakdown("invoice-template", () -> openInvoiceSession(),
+                    s -> invoiceTemplate.compose(s, invoice), config.measurementIterations()));
+            stageRows.add(runStageBreakdown("cv-template", () -> openCvSession(),
+                    s -> cvTemplate.compose(s, cv), config.measurementIterations()));
+            stageRows.add(runStageBreakdown("proposal-template", () -> openProposalSession(),
+                    s -> proposalTemplate.compose(s, proposal), config.measurementIterations()));
         }
 
         List<ThroughputRow> throughputRows = new ArrayList<>();
@@ -199,10 +239,13 @@ private void run() throws Exception {
                 config.docsPerThread(),
                 config.threadCounts(),
                 latencyRows,
+                stageRows,
                 throughputRows,
                 totalBenchmarkBytes);
         System.out.println("Saved JSON benchmark report to " + summary.jsonPath());
-        System.out.println("Saved CSV benchmark reports to " + summary.latencyCsvPath() + " and " + summary.throughputCsvPath());
+        System.out.println("Saved CSV benchmark reports to " + summary.latencyCsvPath() + ", "
+                + summary.stagesCsvPath() + ", and " + summary.throughputCsvPath());
+        System.out.println("Saved markdown summary to " + summary.summaryMarkdownPath());
 
         if (enforceGate) {
             PerformanceGateResult gateResult = evaluatePerformanceGate(profile, latencyRows);
@@ -361,10 +404,10 @@ private interface SessionComposer {
      * median-ms-per-stage row so callers can attribute regressions to
      * compose / layout / render independently.
      */
-    private void runStageBreakdown(String scenario,
-                                   SessionFactory factory,
-                                   SessionComposer composer,
-                                   int iterations) throws Exception {
+    private StageRow runStageBreakdown(String scenario,
+                                       SessionFactory factory,
+                                       SessionComposer composer,
+                                       int iterations) throws Exception {
         int warmup = Math.max(2, Math.min(20, iterations / 5));
         for (int i = 0; i < warmup; i++) {
             try (DocumentSession session = factory.open()) {
@@ -396,12 +439,13 @@ private void runStageBreakdown(String scenario,
                 throw new AssertionError();
             }
         }
+        double composeMs = medianMs(composeNs);
+        double layoutMs = medianMs(layoutNs);
+        double renderMs = medianMs(renderNs);
+        double totalMs = medianMs(totalNs);
         System.out.printf("%-18s | %12.3f | %12.3f | %12.3f | %12.3f%n",
-                scenario,
-                medianMs(composeNs),
-                medianMs(layoutNs),
-                medianMs(renderNs),
-                medianMs(totalNs));
+                scenario, composeMs, layoutMs, renderMs, totalMs);
+        return new StageRow(scenario, round(composeMs), round(layoutMs), round(renderMs), round(totalMs));
     }
 
     private static double medianMs(long[] arr) {
@@ -473,6 +517,7 @@ static PerformanceGateResult evaluatePerformanceGate(BenchmarkProfile profile, L
         }
 
         List<String> failures = new ArrayList<>();
+        List<String> advisories = new ArrayList<>();
         for (LatencyRow row : latencyRows) {
             SmokeThreshold threshold = profile.smokeThresholds().get(row.scenario());
             if (threshold == null) {
@@ -490,17 +535,23 @@ static PerformanceGateResult evaluatePerformanceGate(BenchmarkProfile profile, L
                 failures.add(row.scenario() + " avg " + format(row.avgMillis()) + " ms > " + format(maxAvgMillis) + " ms");
             }
             if (row.peakHeapMb() > maxPeakHeapMb) {
-                failures.add(row.scenario() + " peak heap " + format(row.peakHeapMb()) + " MB > " + format(maxPeakHeapMb) + " MB");
+                // peakHeapMb is a GC-timing-noisy used-heap delta, so a breach is
+                // reported as advisory rather than failing the gate — matching
+                // BenchmarkVerdictTool and avoiding flaky CI from a GC blip. The
+                // deterministic memory signal is the allocation-bytes probes.
+                advisories.add(row.scenario() + " peak heap " + format(row.peakHeapMb()) + " MB > " + format(maxPeakHeapMb) + " MB");
             }
         }
 
+        String advisoryNote = advisories.isEmpty() ? "" : " (advisory: " + String.join("; ", advisories) + ")";
+
         if (failures.isEmpty()) {
-            return new PerformanceGateResult(true, "Performance gate passed for profile " + profile.id());
+            return new PerformanceGateResult(true, "Performance gate passed for profile " + profile.id() + advisoryNote);
         }
 
         return new PerformanceGateResult(
                 false,
-                "Performance gate failed for profile " + profile.id() + ": " + String.join("; ", failures));
+                "Performance gate failed for profile " + profile.id() + ": " + String.join("; ", failures) + advisoryNote);
     }
 
     private long usedHeapBytes() {
@@ -522,6 +573,25 @@ private byte[] renderEngineSimpleDocument() throws Exception {
                         + "a root flow container, heading text, paragraph layout, and final PDF serialization.");
     }
 
+    private byte[] renderVectorRichDocument() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
+                .margin(28, 28, 28, 28)
+                .create()) {
+            var flow = document.pageFlow().name("BenchmarkVectorRich").spacing(12);
+            flow.addParagraph("v1.8 vector-rich benchmark");
+            flow.chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle());
+            flow.chart(ChartBenchmarkFixtures.pieSpec());
+            for (int i = 0; i < 8; i++) {
+                flow.addSvgIcon(vectorRichIcon, 32);
+            }
+            flow.addPath(p -> p.size(220, 28)
+                    .moveTo(0.0, 0.5).curveTo(0.25, 1.0, 0.75, 0.0, 1.0, 0.5).fill(vectorRichAccent));
+            flow.build();
+            return document.toPdfBytes();
+        }
+    }
+
     private byte[] renderInvoiceTemplateDocument() throws Exception {
         try (DocumentSession document = GraphCompose.document()
                 .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
@@ -675,16 +745,19 @@ private PathSummary writeReports(BenchmarkReportWriter.BenchmarkArtifacts artifa
                                      int docsPerThread,
                                      int[] threadCounts,
                                      List<LatencyRow> latencyRows,
+                                     List<StageRow> stageRows,
                                      List<ThroughputRow> throughputRows,
                                      long totalBenchmarkBytes) throws Exception {
+        String timestamp = LocalDateTime.now().format(TIMESTAMP_FORMAT);
         CurrentSpeedReport report = new CurrentSpeedReport(
-                LocalDateTime.now().format(TIMESTAMP_FORMAT),
+                timestamp,
                 profileId,
                 warmupIterations,
                 measurementIterations,
                 docsPerThread,
                 Arrays.stream(threadCounts).boxed().toList(),
                 latencyRows,
+                stageRows,
                 throughputRows,
                 totalBenchmarkBytes);
 
@@ -715,8 +788,88 @@ private PathSummary writeReports(BenchmarkReportWriter.BenchmarkArtifacts artifa
                                 format(row.docsPerSecond()),
                                 format(row.avgMillisPerDoc())))
                         .toList());
+        var stagesCsvPath = artifacts.writeCsv(
+                "stages",
+                List.of("scenario", "compose_ms", "layout_ms", "render_ms", "total_ms"),
+                stageRows.stream()
+                        .map(row -> List.of(
+                                row.scenario(),
+                                format(row.composeMillis()),
+                                format(row.layoutMillis()),
+                                format(row.renderMillis()),
+                                format(row.totalMillis())))
+                        .toList());
+        var summaryMarkdownPath = artifacts.writeMarkdown(
+                "summary",
+                buildSummaryMarkdown(timestamp, profileId, latencyRows, stageRows,
+                        throughputRows, totalBenchmarkBytes));
+
+        return new PathSummary(jsonPath.toString(), latencyCsvPath.toString(),
+                stagesCsvPath.toString(), throughputCsvPath.toString(),
+                summaryMarkdownPath.toString());
+    }
+
+    /**
+     * Renders a single human-readable summary of the run — the latency table,
+     * the per-stage compose/layout/render split (the only place the suite
+     * attributes time to engine stages vs. PDFBox), and the throughput table
+     * when present — so a reviewer reads one file instead of stitching the JSON
+     * and several CSVs together.
+     */
+    private static String buildSummaryMarkdown(String timestamp,
+                                               String profileId,
+                                               List<LatencyRow> latencyRows,
+                                               List<StageRow> stageRows,
+                                               List<ThroughputRow> throughputRows,
+                                               long totalBenchmarkBytes) {
+        StringBuilder md = new StringBuilder();
+        md.append("# Current-speed benchmark — ").append(profileId).append(" profile\n\n");
+        md.append('`').append(timestamp).append("`\n\n");
+
+        md.append("## Latency (ms)\n\n");
+        md.append("| Scenario | Avg | p50 | p95 | Max | Docs/s | Avg KB | Peak MB |\n");
+        md.append("|---|---:|---:|---:|---:|---:|---:|---:|\n");
+        for (LatencyRow row : latencyRows) {
+            md.append("| ").append(row.scenario())
+                    .append(" | ").append(format(row.avgMillis()))
+                    .append(" | ").append(format(row.p50Millis()))
+                    .append(" | ").append(format(row.p95Millis()))
+                    .append(" | ").append(format(row.maxMillis()))
+                    .append(" | ").append(format(row.docsPerSecond()))
+                    .append(" | ").append(format(row.avgKilobytes()))
+                    .append(" | ").append(format(row.peakHeapMb()))
+                    .append(" |\n");
+        }
+
+        if (!stageRows.isEmpty()) {
+            md.append("\n## Stages — template scenarios (median ms — compose / layout / render)\n\n");
+            md.append("| Scenario | Compose | Layout | Render | Total |\n");
+            md.append("|---|---:|---:|---:|---:|\n");
+            for (StageRow row : stageRows) {
+                md.append("| ").append(row.scenario())
+                        .append(" | ").append(format(row.composeMillis()))
+                        .append(" | ").append(format(row.layoutMillis()))
+                        .append(" | ").append(format(row.renderMillis()))
+                        .append(" | ").append(format(row.totalMillis()))
+                        .append(" |\n");
+            }
+        }
 
-        return new PathSummary(jsonPath.toString(), latencyCsvPath.toString(), throughputCsvPath.toString());
+        if (!throughputRows.isEmpty()) {
+            md.append("\n## Throughput\n\n");
+            md.append("| Threads | Total docs | Docs/s | Avg doc ms |\n");
+            md.append("|---:|---:|---:|---:|\n");
+            for (ThroughputRow row : throughputRows) {
+                md.append("| ").append(row.threads())
+                        .append(" | ").append(row.totalDocs())
+                        .append(" | ").append(format(row.docsPerSecond()))
+                        .append(" | ").append(format(row.avgMillisPerDoc()))
+                        .append(" |\n");
+            }
+        }
+
+        md.append("\nByte guard: ").append(totalBenchmarkBytes).append('\n');
+        return md.toString();
     }
 
     private static double round(double value) {
@@ -730,6 +883,13 @@ private static String format(double value) {
     private record Scenario(String name, String description, Renderer renderer) {
     }
 
+    // Static scenario template: name + description + a factory that binds the
+    // renderer to a benchmark instance. Keeps the scenario list declarable as a
+    // static constant (so the gate-coverage test can read it) while the actual
+    // render still runs against per-run instance state.
+    private record ScenarioDef(String name, String description, Function<CurrentSpeedBenchmark, Renderer> renderer) {
+    }
+
     @FunctionalInterface
     private interface Renderer {
         byte[] render() throws Exception;
@@ -770,6 +930,18 @@ private record ThroughputRow(String scenario,
                                  double avgMillisPerDoc) {
     }
 
+    /**
+     * Per-scenario compose / layout / render split (median ms). Persisted so a
+     * diff can attribute a regression to an engine stage rather than only the
+     * blended total — previously this was printed to the console and discarded.
+     */
+    private record StageRow(String scenario,
+                            double composeMillis,
+                            double layoutMillis,
+                            double renderMillis,
+                            double totalMillis) {
+    }
+
     private record CurrentSpeedReport(String timestamp,
                                       String profile,
                                       int warmupIterations,
@@ -777,11 +949,13 @@ private record CurrentSpeedReport(String timestamp,
                                       int docsPerThread,
                                       List<Integer> threadCounts,
                                       List<LatencyRow> latency,
+                                      List<StageRow> stages,
                                       List<ThroughputRow> throughput,
                                       long totalBytes) {
     }
 
-    private record PathSummary(String jsonPath, String latencyCsvPath, String throughputCsvPath) {
+    private record PathSummary(String jsonPath, String latencyCsvPath, String stagesCsvPath,
+                               String throughputCsvPath, String summaryMarkdownPath) {
     }
 
     private record BenchmarkConfig(int warmupIterations,
@@ -805,12 +979,19 @@ enum BenchmarkProfile {
                 // (typically 1.5-2x slower) does not produce false positives
                 // while real regressions of 50% or more still trigger. The
                 // previous values (800-2600 ms) were 50-100x looser and would
-                // not have flagged even a 10x slowdown.
+                // not have flagged even a 10x slowdown. long-token (observed
+                // ~3.2 ms / ~94 MB) is gated too so every scenario in the
+                // latency table is covered — CurrentSpeedScenarioGateTest pins
+                // that invariant.
                 "engine-simple", new SmokeThreshold(8.0, 96.0),
                 "invoice-template", new SmokeThreshold(35.0, 384.0),
                 "cv-template", new SmokeThreshold(25.0, 192.0),
                 "proposal-template", new SmokeThreshold(45.0, 384.0),
-                "feature-rich", new SmokeThreshold(100.0, 256.0)
+                "feature-rich", new SmokeThreshold(100.0, 256.0),
+                "long-token", new SmokeThreshold(10.0, 256.0),
+                // vector-rich observed ~5-6 ms smoke avg; charts + SVG icons vary
+                // more than the text scenarios, so a wider ~4.5x band absorbs that.
+                "vector-rich", new SmokeThreshold(25.0, 256.0)
         ));
 
         private final String id;
diff --git a/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
deleted file mode 100644
index c035f96e3..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
+++ /dev/null
@@ -1,84 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.document.api.DocumentSession;
-import com.demcha.compose.document.templates.api.DocumentTemplate;
-import com.demcha.compose.document.templates.cv.presets.ModernProfessional;
-import com.demcha.compose.document.templates.cv.spec.CvSpec;
-import com.demcha.compose.document.theme.BusinessTheme;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.Arrays;
-
-public class FullCvBenchmark {
-
-    private static final int WARMUP_ITERATIONS = Integer.getInteger("graphcompose.benchmark.fullCv.warmup", 100);
-    private static final int MEASUREMENT_ITERATIONS = Integer.getInteger("graphcompose.benchmark.fullCv.iterations", 500);
-
-    public static void main(String[] args) {
-        BenchmarkSupport.configureQuietLogging();
-        System.out.println("Starting FullCvBenchmark...");
-
-        CvSpec cv = CanonicalBenchmarkSupport.canonicalCv();
-        DocumentTemplate<CvSpec> template = ModernProfessional.create(BusinessTheme.modern());
-
-        System.out.println("Warming up JVM (JIT compilation, font cache warmup)...");
-        for (int i = 0; i < WARMUP_ITERATIONS; i++) {
-            generateCvInMemory(template, cv);
-        }
-
-        System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
-        long[] durationsNs = new long[MEASUREMENT_ITERATIONS];
-
-        for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
-            long start = System.nanoTime();
-            generateCvInMemory(template, cv);
-            long end = System.nanoTime();
-            durationsNs[i] = end - start;
-        }
-
-        printStatistics(durationsNs);
-    }
-
-    private static void generateCvInMemory(DocumentTemplate<CvSpec> template, CvSpec cv) {
-        try (DocumentSession document = GraphCompose.document()
-                .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
-                .margin(15, 10, 15, 15)
-                .create()) {
-            template.compose(document, cv);
-            document.toPdfBytes();
-        } catch (Exception e) {
-            throw new RuntimeException("Failed to generate PDF", e);
-        }
-    }
-
-    private static void printStatistics(long[] durationsNs) {
-        Arrays.sort(durationsNs);
-
-        double[] durationsMs = Arrays.stream(durationsNs).mapToDouble(ns -> ns / 1_000_000.0).toArray();
-
-        double min = durationsMs[0];
-        double max = durationsMs[durationsMs.length - 1];
-        double avg = Arrays.stream(durationsMs).average().orElse(0.0);
-        double median = durationsMs[(int) (durationsMs.length * 0.5)];
-        double p95 = durationsMs[(int) (durationsMs.length * 0.95)];
-        double p99 = durationsMs[(int) (durationsMs.length * 0.99)];
-
-        System.out.println("\nBenchmark results (milliseconds):");
-        System.out.println("------------------------------------------------");
-        System.out.printf("Min time:           %.2f ms%n", min);
-        System.out.printf("Average time:       %.2f ms%n", avg);
-        System.out.printf("Median (50%%):       %.2f ms (typical response time)%n", median);
-        System.out.printf("95th percentile:    %.2f ms (95%% of runs finish within this)%n", p95);
-        System.out.printf("99th percentile:    %.2f ms (rare spikes or GC pressure)%n", p99);
-        System.out.printf("Max time:           %.2f ms%n", max);
-        System.out.println("------------------------------------------------");
-
-        if (median < 200) {
-            System.out.println("Verdict: Excellent. The engine is very fast for this scenario.");
-        } else if (median < 1000) {
-            System.out.println("Verdict: Good. This is a healthy speed for complex generation.");
-        } else {
-            System.out.println("Verdict: Slow enough to investigate with a profiler.");
-        }
-    }
-}
diff --git a/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
deleted file mode 100644
index f4717e66c..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.engine.components.style.Margin;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.Arrays;
-
-public class GraphComposeBenchmark {
-
-    private static final int WARMUP_ITERATIONS = Integer.getInteger("graphcompose.benchmark.coreEngine.warmup", 100);
-    private static final int MEASUREMENT_ITERATIONS = Integer.getInteger("graphcompose.benchmark.coreEngine.iterations", 500);
-
-    public static void main(String[] args) {
-        BenchmarkSupport.configureQuietLogging();
-        System.out.println("Starting GraphComposeBenchmark...");
-
-        System.out.println("Warming up JVM (JIT compilation, font cache warmup)...");
-        for (int i = 0; i < WARMUP_ITERATIONS; i++) {
-            generateCvInMemory();
-        }
-
-        System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
-        long[] durationsNs = new long[MEASUREMENT_ITERATIONS];
-
-        for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
-            long start = System.nanoTime();
-            generateCvInMemory();
-            long end = System.nanoTime();
-            durationsNs[i] = end - start;
-        }
-
-        printStatistics(durationsNs);
-    }
-
-    private static void generateCvInMemory() {
-        try {
-            CanonicalBenchmarkSupport.renderSimpleBenchmarkDocument(
-                    PDRectangle.A4,
-                    Margin.of(24),
-                    "CoreEngineRoot",
-                    "GraphCompose Core Benchmark",
-                    "Analytical engineer focused on reliable platform design. "
-                            + "Testing paragraph breaking and layout calculation engine.");
-        } catch (Exception e) {
-            throw new RuntimeException("Failed to generate PDF", e);
-        }
-    }
-
-    private static void printStatistics(long[] durationsNs) {
-        Arrays.sort(durationsNs);
-
-        double[] durationsMs = Arrays.stream(durationsNs).mapToDouble(ns -> ns / 1_000_000.0).toArray();
-
-        double min = durationsMs[0];
-        double max = durationsMs[durationsMs.length - 1];
-        double avg = Arrays.stream(durationsMs).average().orElse(0.0);
-        double median = durationsMs[(int) (durationsMs.length * 0.5)];
-        double p95 = durationsMs[(int) (durationsMs.length * 0.95)];
-        double p99 = durationsMs[(int) (durationsMs.length * 0.99)];
-
-        System.out.println("\nBenchmark results (milliseconds):");
-        System.out.println("------------------------------------------------");
-        System.out.printf("Min time:           %.2f ms%n", min);
-        System.out.printf("Average time:       %.2f ms%n", avg);
-        System.out.printf("Median (50%%):       %.2f ms (typical response time)%n", median);
-        System.out.printf("95th percentile:    %.2f ms (95%% of runs finish within this)%n", p95);
-        System.out.printf("99th percentile:    %.2f ms (rare spikes or GC pressure)%n", p99);
-        System.out.printf("Max time:           %.2f ms%n", max);
-        System.out.println("------------------------------------------------");
-
-        if (median < 100) {
-            System.out.println("Verdict: Excellent. The engine is very fast for this scenario.");
-        } else if (median < 500) {
-            System.out.println("Verdict: Good. This is a healthy speed for a synchronous REST API.");
-        } else {
-            System.out.println("Verdict: Slow enough to investigate with a profiler.");
-        }
-    }
-}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
new file mode 100644
index 000000000..dcf1ec97e
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
@@ -0,0 +1,98 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.image.DocumentImageData;
+
+import javax.imageio.ImageIO;
+import java.awt.BasicStroke;
+import java.awt.Color;
+import java.awt.GradientPaint;
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+
+/**
+ * Deterministic synthetic raster fixtures for the image embed/scale benches and
+ * the {@code PdfImageCache} reuse gate.
+ *
+ * <p>The images are generated in code (a fixed gradient placeholder, a few KB
+ * each) so the suite needs no committed binary asset and the bytes — hence the
+ * cache fingerprint — are stable. {@link #demoImage()} returns the same logical
+ * image every call; {@link #distinctImage(int)} returns visually distinct images
+ * with distinct fingerprints, to exercise the distinct-embed path.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ImageBenchmarkFixtures {
+
+    /** Native pixel size of every generated fixture. */
+    public static final int NATIVE_WIDTH_PX = 360;
+    /** Native pixel height of every generated fixture. */
+    public static final int NATIVE_HEIGHT_PX = 200;
+
+    /**
+     * Draw size (points) that keeps the original-embed path: at 144 DPI this is
+     * a {@code 360x200 px} target, i.e. &gt; 50% of native, so {@code PdfImageCache}
+     * does not build a downscaled variant and the embed count stays at one.
+     */
+    public static final double DRAW_WIDTH_PT = 180.0;
+    /** Companion draw height (points) for {@link #DRAW_WIDTH_PT}. */
+    public static final double DRAW_HEIGHT_PT = 100.0;
+
+    private ImageBenchmarkFixtures() {
+    }
+
+    /**
+     * One fixed gradient placeholder. Returns equal bytes every call, so all
+     * placements share a fingerprint and the cache treats them as one image.
+     *
+     * @return the shared demo image descriptor
+     */
+    public static DocumentImageData demoImage() {
+        return DocumentImageData.fromBytes(pngBytes(0));
+    }
+
+    /**
+     * The {@code index}-th of a family of visually distinct images, each with a
+     * distinct fingerprint so the cache embeds each one separately.
+     *
+     * @param index variant index (any non-negative int)
+     * @return a distinct image descriptor
+     */
+    public static DocumentImageData distinctImage(int index) {
+        return DocumentImageData.fromBytes(pngBytes(index + 1));
+    }
+
+    private static byte[] pngBytes(int seed) {
+        BufferedImage image = new BufferedImage(NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX, BufferedImage.TYPE_INT_RGB);
+        Graphics2D g = image.createGraphics();
+        try {
+            int r = 20 + (seed * 23) % 200;
+            int b = 95 + (seed * 17) % 150;
+            g.setPaint(new GradientPaint(0, 0, new Color(r, 45, 80),
+                    NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX, new Color(20, 80, b)));
+            g.fillRect(0, 0, NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX);
+            g.setPaint(new Color(196, 153, 76));
+            g.setStroke(new BasicStroke(6f));
+            g.drawLine(0, 170, NATIVE_WIDTH_PX, 110 - (seed % 40));
+            // A seed-positioned 1px marker guarantees byte-distinct content per
+            // seed — the modular gradient/line colours above can repeat at large
+            // seeds, but a unique x keeps distinctImage(i) fingerprints distinct
+            // for i in [0, NATIVE_WIDTH_PX - 1].
+            if (seed < NATIVE_WIDTH_PX) {
+                g.setPaint(new Color(0, 0, 0));
+                g.fillRect(seed, 0, 1, 6);
+            }
+        } finally {
+            g.dispose();
+        }
+        ByteArrayOutputStream png = new ByteArrayOutputStream();
+        try {
+            ImageIO.write(image, "png", png);
+        } catch (IOException e) {
+            throw new UncheckedIOException("Failed to encode synthetic benchmark image", e);
+        }
+        return png.toByteArray();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
new file mode 100644
index 000000000..6e8d84847
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
@@ -0,0 +1,119 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.image.DocumentImageData;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.pdfparser.PDFStreamParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Consumer;
+
+/**
+ * Deterministic content-stream probe for the {@code PdfImageCache} dedup path:
+ * the same raster image is placed {@code N} times and counted against {@code N}
+ * distinct images, so the embed structure isolates exactly what the cache saves.
+ *
+ * <p>Placing one logical image {@code N} times must embed a single image XObject
+ * (referenced by {@code N} {@code Do} draws), while {@code N} distinct images must
+ * embed {@code N} XObjects. Counting the distinct image XObjects in the output PDF
+ * proves the cache reuses by fingerprint and catches a regression where embeds
+ * scale with placements (PDF bloat). Byte-deterministic — no A/B build needed.
+ * The image render/scale hot path is also entirely uncovered without this and the
+ * companion {@code ImageJmhBenchmark}.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ImageCacheOperatorProbe {
+
+    private static final int PLACEMENTS = 30;
+
+    /** Distinct image XObjects embedded in a PDF, and the number of {@code Do} draws. */
+    record EmbedCounts(int embeds, int draws) {
+    }
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+
+        System.out.println("GraphCompose image-cache embed probe (" + PLACEMENTS + " placements each)");
+        System.out.printf("%-22s | %8s | %8s%n", "Mode", "Embeds", "Draws");
+        System.out.println("-".repeat(44));
+        report("same image x N", countPdf(renderSameImage(PLACEMENTS)));
+        report("N distinct images", countPdf(renderDistinctImages(PLACEMENTS)));
+        System.out.println();
+        System.out.println("Embeds = distinct image XObjects in the PDF, Draws = Do operators. "
+                + "PdfImageCache must hold embeds at 1 for the same image regardless of placements; "
+                + "distinct images embed once each.");
+    }
+
+    private static void report(String mode, EmbedCounts counts) {
+        System.out.printf("%-22s | %8d | %8d%n", mode, counts.embeds(), counts.draws());
+    }
+
+    /** Renders {@code count} placements of one shared image (cache should embed it once). */
+    static byte[] renderSameImage(int count) throws Exception {
+        DocumentImageData image = ImageBenchmarkFixtures.demoImage();
+        return render(flow -> {
+            for (int i = 0; i < count; i++) {
+                flow.addImage(spec -> spec.source(image)
+                        .size(ImageBenchmarkFixtures.DRAW_WIDTH_PT, ImageBenchmarkFixtures.DRAW_HEIGHT_PT));
+            }
+        });
+    }
+
+    /** Renders {@code count} distinct images (cache embeds each once). */
+    static byte[] renderDistinctImages(int count) throws Exception {
+        return render(flow -> {
+            for (int i = 0; i < count; i++) {
+                DocumentImageData image = ImageBenchmarkFixtures.distinctImage(i);
+                flow.addImage(spec -> spec.source(image)
+                        .size(ImageBenchmarkFixtures.DRAW_WIDTH_PT, ImageBenchmarkFixtures.DRAW_HEIGHT_PT));
+            }
+        });
+    }
+
+    private static byte[] render(Consumer<PageFlowBuilder> author) throws Exception {
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
+            session.pageFlow(flow -> {
+                flow.name("ImageCacheProbe").spacing(8);
+                author.accept(flow);
+            });
+            return session.toPdfBytes();
+        }
+    }
+
+    /** Counts distinct embedded image XObjects (by COS identity) and {@code Do} draws. */
+    static EmbedCounts countPdf(byte[] pdf) throws IOException {
+        try (PDDocument document = Loader.loadPDF(pdf)) {
+            Set<COSBase> embeds = Collections.newSetFromMap(new IdentityHashMap<>());
+            int draws = 0;
+            for (PDPage page : document.getPages()) {
+                for (var name : page.getResources().getXObjectNames()) {
+                    PDXObject xobject = page.getResources().getXObject(name);
+                    if (xobject instanceof PDImageXObject image) {
+                        embeds.add(image.getCOSObject());
+                    }
+                }
+                List<Object> tokens = new PDFStreamParser(page).parse();
+                for (Object token : tokens) {
+                    if (token instanceof Operator operator && "Do".equals(operator.getName())) {
+                        draws++;
+                    }
+                }
+            }
+            return new EmbedCounts(embeds.size(), draws);
+        }
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java
index b4b585d53..926921ad3 100644
--- a/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/MeasurementCountBenchmark.java
@@ -70,6 +70,16 @@ public final class MeasurementCountBenchmark {
             "Prefix text before an unbreakable token " + "x".repeat(600)
                     + " and several trailing words that must still wrap onto the following lines here.";
 
+    // High-glyph-diversity accented-Latin (Latin-1) passage: many distinct
+    // diacritic glyphs and varied words, unlike the single repeated ASCII
+    // sentence above, so distinctWidthRequests / repeat-rate reflect a non-ASCII,
+    // high-diversity workload. Standard-14 Helvetica covers Latin-1; true
+    // CJK / Cyrillic would need an embedded font and is out of scope here.
+    private static final String ACCENTED_LATIN_PARAGRAPH =
+            ("Le café à Genève - résumé naïve, façon piñata. Über die Größe schön: "
+                    + "coração São, mañana señor. Déjà brûlée crème, fjörð Århus Tromsø "
+                    + "Köln Zürich Besançon, garçon élève hôtel. ").repeat(40);
+
     public static void main(String[] args) throws Exception {
         BenchmarkSupport.configureQuietLogging();
         new MeasurementCountBenchmark().run();
@@ -87,6 +97,8 @@ private void run() throws Exception {
                 flow.addParagraph(p -> p.text(LONG_PARAGRAPH).textStyle(BODY_STYLE));
         Consumer<PageFlowBuilder> longToken = flow ->
                 flow.addParagraph(p -> p.text(LONG_TOKEN_PARAGRAPH).textStyle(BODY_STYLE));
+        Consumer<PageFlowBuilder> accentedText = flow ->
+                flow.addParagraph(p -> p.text(ACCENTED_LATIN_PARAGRAPH).textStyle(BODY_STYLE));
         Consumer<PageFlowBuilder> largeTable = MeasurementCountBenchmark::authorLargeTable;
 
         // Warm up the JVM (class loading + JIT) BEFORE the allocation window so the
@@ -98,12 +110,14 @@ private void run() throws Exception {
         for (int warmup = 0; warmup < 5; warmup++) {
             measureScenario("warmup", longText);
             measureScenario("warmup", longToken);
+            measureScenario("warmup", accentedText);
             measureScenario("warmup", largeTable);
         }
 
         List<Result> results = new ArrayList<>();
         results.add(measureScenario("long-text", longText));
         results.add(measureScenario("long-token", longToken));
+        results.add(measureScenario("accented-latin", accentedText));
         results.add(measureScenario("large-table", largeTable));
 
         System.out.printf("%-14s | %11s | %9s | %9s | %11s | %8s | %11s | %10s | %6s%n",
diff --git a/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
index 94cafb25e..016f4ea9e 100644
--- a/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
+++ b/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
@@ -70,6 +70,29 @@ public static void main(String[] args) throws Exception {
     }
 
     private static void report(String scenario, Consumer<com.demcha.compose.document.dsl.PageFlowBuilder> author) throws Exception {
+        OpCounts counts = countOperators(author);
+        int saved = Math.max(0, counts.draws() - counts.tf()) + Math.max(0, counts.draws() - counts.rg());
+        double reduction = counts.draws() == 0 ? 0
+                : 100.0 * (2.0 * counts.draws() - counts.tf() - counts.rg()) / (2.0 * counts.draws());
+        System.out.printf("%-22s | %8d | %8d | %8d | %12d | %8.1f%%%n",
+                scenario, counts.draws(), counts.tf(), counts.rg(), saved, reduction);
+    }
+
+    /** Text-show ({@code Tj}/{@code TJ}), {@code setFont} ({@code Tf}) and non-stroking-colour op counts. */
+    record OpCounts(int draws, int tf, int rg) {
+    }
+
+    /**
+     * Renders {@code author} and counts the text-show, font and colour operators.
+     * Exposed (package-visible) so {@code RenderOperatorGateTest} can pin the F5
+     * coalescing invariant: post-F5 the font/colour ops no longer scale 1:1 with
+     * text draws, so {@code tf} and {@code rg} stay below {@code draws}.
+     *
+     * @param author flow author
+     * @return the operator counts of the rendered document
+     * @throws Exception if rendering fails
+     */
+    static OpCounts countOperators(Consumer<com.demcha.compose.document.dsl.PageFlowBuilder> author) throws Exception {
         byte[] pdf;
         try (DocumentSession session = GraphCompose.document()
                 .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
@@ -80,10 +103,7 @@ private static void report(String scenario, Consumer<com.demcha.compose.document
             int draws = count(document, "Tj") + count(document, "TJ");
             int tf = count(document, "Tf");
             int rg = count(document, "rg") + count(document, "g") + count(document, "sc") + count(document, "scn");
-            int saved = Math.max(0, draws - tf) + Math.max(0, draws - rg);
-            double reduction = draws == 0 ? 0 : 100.0 * (2.0 * draws - tf - rg) / (2.0 * draws);
-            System.out.printf("%-22s | %8d | %8d | %8d | %12d | %8.1f%%%n",
-                    scenario, draws, tf, rg, saved, reduction);
+            return new OpCounts(draws, tf, rg);
         }
     }
 
diff --git a/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java
deleted file mode 100644
index b8e945ef6..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java
+++ /dev/null
@@ -1,88 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.engine.components.style.Margin;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.*;
-
-/**
- * Linear Scalability Test
- * Measures throughput (documents per second) as thread count increases.
- */
-public class ScalabilityBenchmark {
-
-    private static final int DOCUMENTS_PER_THREAD = Integer.getInteger("graphcompose.scalability.documentsPerThread", 100);
-    private static final int WARMUP_DOCS = Integer.getInteger("graphcompose.scalability.warmupDocs", 100);
-    private static final String THREAD_COUNTS = System.getProperty("graphcompose.scalability.threads", "1,2,4,8,16");
-
-    public static void main(String[] args) throws Exception {
-        BenchmarkSupport.configureQuietLogging();
-        System.out.println("Starting Scalability Benchmark: Linear Scalability");
-        System.out.println("------------------------------------------------------------");
-
-        // Warmup
-        for (int i = 0; i < WARMUP_DOCS; i++) {
-            generateOne();
-        }
-
-        int[] threadCounts = parseThreadCounts(THREAD_COUNTS);
-        System.out.println(String.format("%-10s | %-15s | %-12s", "Threads", "Total Docs", "Throughput (docs/sec)"));
-        System.out.println("------------------------------------------------------------");
-
-        for (int threads : threadCounts) {
-            runScalabilityTest(threads);
-        }
-    }
-
-    private static void runScalabilityTest(int threads) throws Exception {
-        int totalDocs = threads * DOCUMENTS_PER_THREAD;
-        ExecutorService executor = Executors.newFixedThreadPool(threads);
-        
-        long startTime = System.nanoTime();
-        
-        List<Future<?>> futures = new ArrayList<>();
-        for (int i = 0; i < totalDocs; i++) {
-            futures.add(executor.submit(() -> {
-                try {
-                    generateOne();
-                } catch (Exception e) {
-                    e.printStackTrace();
-                }
-            }));
-        }
-
-        for (Future<?> future : futures) {
-            future.get();
-        }
-
-        long endTime = System.nanoTime();
-        executor.shutdown();
-        executor.awaitTermination(1, TimeUnit.MINUTES);
-
-        double durationSec = (endTime - startTime) / 1_000_000_000.0;
-        double throughput = totalDocs / durationSec;
-
-        System.out.println(String.format("%-10d | %-15d | %12.2f", threads, totalDocs, throughput));
-    }
-
-    private static void generateOne() throws Exception {
-        CanonicalBenchmarkSupport.renderSimpleBenchmarkDocument(
-                PDRectangle.A4,
-                Margin.of(24),
-                "ScalabilityRoot",
-                "Scalability",
-                "Scalability test message.");
-    }
-
-    private static int[] parseThreadCounts(String raw) {
-        return Arrays.stream(raw.split(","))
-                .map(String::trim)
-                .filter(value -> !value.isEmpty())
-                .mapToInt(Integer::parseInt)
-                .filter(value -> value > 0)
-                .toArray();
-    }
-}
diff --git a/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
new file mode 100644
index 000000000..120741433
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
@@ -0,0 +1,55 @@
+package com.demcha.compose;
+
+/**
+ * Shared SVG fixtures for the v1.8 vector-import benchmarks (path parse, whole
+ * icon read, icon → node build).
+ *
+ * <p>Self-contained on purpose: the benchmarks module cannot reach the
+ * main-module test constants or the examples module, so the heart path is
+ * vendored here (it also lives in {@code SvgPathTest} / {@code VectorPathExample}
+ * in their own modules). The icon is a synthetic but realistic multi-layer
+ * document — a gradient-filled background, a {@code translate}+{@code scale}
+ * group of filled paths and a stroked circle, and a {@code rotate} group with a
+ * polygon and a quadratic-curve stroke — so it exercises XML parse, {@code <g>}
+ * transform accumulation, gradient resolution and per-layer path lowering the
+ * way a real exporter file would, while staying entirely within the reader's
+ * supported subset (so it never throws).</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class SvgBenchmarkFixtures {
+
+    /** Material "favorite" heart — the same {@code d} used in the SVG tests/examples. */
+    public static final String MATERIAL_HEART_D =
+            "M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 2 5.42 4.42 3 7.5 3"
+            + "c1.74 0 3.41.81 4.5 2.09C13.09 3.81 14.76 3 16.5 3 19.58 3 22 5.42 22 8.5"
+            + "c0 3.78-3.4 6.86-8.55 11.54L12 21.35z";
+
+    /** Heart viewBox edge (square 24×24), passed to {@code SvgPath.parse}. */
+    public static final double HEART_VIEWBOX = 24.0;
+
+    /** A realistic multi-layer icon: gradient bg + transformed groups + stroked curves. */
+    public static final String MULTI_LAYER_ICON_SVG = """
+            <svg viewBox="0 0 48 48" xmlns="http://www.w3.org/2000/svg">
+              <defs>
+                <linearGradient id="sky" x1="0" y1="0" x2="0" y2="48" gradientUnits="userSpaceOnUse">
+                  <stop offset="0" stop-color="#3b82f6"/>
+                  <stop offset="1" stop-color="#1e3a8a"/>
+                </linearGradient>
+              </defs>
+              <rect x="0" y="0" width="48" height="48" rx="6" fill="url(#sky)"/>
+              <g transform="translate(6 6) scale(1.1)">
+                <path d="M0 24 L12 4 L24 24 Z" fill="#fbbf24"/>
+                <path d="M6 24 L16 10 L26 24 Z" fill="#f59e0b"/>
+                <circle cx="20" cy="8" r="4" fill="#fde68a" stroke="#92400e" stroke-width="1.5"/>
+              </g>
+              <g transform="rotate(8 24 40)">
+                <polygon points="4,40 44,40 40,46 8,46" fill="#10b981"/>
+                <path d="M10 42 Q24 38 38 42" fill="none" stroke="#065f46" stroke-width="2"/>
+              </g>
+            </svg>
+            """;
+
+    private SvgBenchmarkFixtures() {
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java b/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
new file mode 100644
index 000000000..b8df62a2b
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
@@ -0,0 +1,93 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.svg.SvgIcon;
+import com.demcha.compose.document.svg.SvgPath;
+
+import java.lang.management.ManagementFactory;
+import java.util.Arrays;
+import java.util.function.Supplier;
+
+/**
+ * Deterministic allocation probe for the v1.8 SVG-import path: warm
+ * (JIT-steady) bytes allocated per {@link SvgPath#parse}, per
+ * {@link SvgIcon#parse}, and per {@link SvgIcon#node} — the three operations
+ * with no analogue in the rest of the suite (which is text / table only).
+ *
+ * <p>Allocation counts are noise-free (unlike wall-clock or {@code peakHeapMb}),
+ * so this is the signal the "optimize the engine, not benchmarks" rule wants:
+ * a develop-vs-branch A/B shows a parse/read/node allocation change directly.
+ * No {@code src/main} changes.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class SvgParseAllocProbe {
+
+    private static final com.sun.management.ThreadMXBean THREAD_MX =
+            (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+
+    private static final int WARMUP = 60;
+    private static final int MEASURE = 11;
+
+    /** Escape sink so the JIT cannot elide the measured allocations. */
+    private static long sink;
+
+    public static void main(String[] args) {
+        BenchmarkSupport.configureQuietLogging();
+        enableAllocationMeasurement();
+
+        SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+        double parseKb = measureAllocKb(() -> SvgPath.parse(
+                SvgBenchmarkFixtures.MATERIAL_HEART_D,
+                0, 0, SvgBenchmarkFixtures.HEART_VIEWBOX, SvgBenchmarkFixtures.HEART_VIEWBOX));
+        double readKb = measureAllocKb(() -> SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG));
+        double nodeKb = measureAllocKb(() -> icon.node(48.0));
+
+        System.out.println("GraphCompose SVG-import allocation probe (median of " + MEASURE + ")");
+        System.out.printf("  SvgPath.parse (heart d)     : %s%n", kb(parseKb));
+        System.out.printf("  SvgIcon.parse (multi-layer) : %s%n", kb(readKb));
+        System.out.printf("  SvgIcon.node(48)            : %s%n", kb(nodeKb));
+        System.out.println("alloc sink: " + sink);
+    }
+
+    private static double measureAllocKb(Supplier<Object> op) {
+        for (int i = 0; i < WARMUP; i++) {
+            sink += System.identityHashCode(op.get());
+        }
+        long[] alloc = new long[MEASURE];
+        for (int m = 0; m < MEASURE; m++) {
+            long before = currentThreadAllocatedBytes();
+            Object result = op.get();
+            long after = currentThreadAllocatedBytes();
+            sink += System.identityHashCode(result);
+            alloc[m] = before < 0 ? -1 : after - before;
+        }
+        Arrays.sort(alloc);
+        return alloc[MEASURE / 2] / 1024.0;
+    }
+
+    private static String kb(double value) {
+        return value < 0 ? "n/a (allocation measurement unsupported)" : "%.1f KB/op".formatted(value);
+    }
+
+    private static void enableAllocationMeasurement() {
+        try {
+            if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                THREAD_MX.setThreadAllocatedMemoryEnabled(true);
+            }
+        } catch (UnsupportedOperationException ignored) {
+            // Allocation measurement unsupported on this JVM; the probe reports n/a.
+        }
+    }
+
+    private static long currentThreadAllocatedBytes() {
+        try {
+            if (!THREAD_MX.isThreadAllocatedMemorySupported() || !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+                return -1;
+            }
+        } catch (UnsupportedOperationException ex) {
+            return -1;
+        }
+        return THREAD_MX.getCurrentThreadAllocatedBytes();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
new file mode 100644
index 000000000..cc3c79dcc
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/VectorRenderOperatorProbe.java
@@ -0,0 +1,140 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.style.DocumentStroke;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.pdfparser.PDFStreamParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Deterministic content-stream operator probe for the v1.8 vector-paint render
+ * paths (S5/S6): the same {@code N} curved blob paths rendered three ways —
+ * flat solid fill, linear gradient, and translucent (alpha) fill — so the
+ * operator deltas isolate exactly what each paint mode costs at the PDF level.
+ *
+ * <p>A flat path takes the fast {@code fillAndStrokePath} route (just curve +
+ * fill operators). A gradient fill clips to the path and paints a shading
+ * ({@code q} / {@code W n} clip / {@code sh} / {@code Q} per shape); a
+ * translucent fill sets an ExtGState alpha ({@code gs}). Counting {@code sh} /
+ * {@code gs} / {@code W} against the flat baseline proves the per-shape cost
+ * structure and catches a regression where a flat path accidentally takes the
+ * heavier gradient branch. Byte-deterministic — no A/B build needed.</p>
+ *
+ * @author Artem Demchyshyn
+ */
+public final class VectorRenderOperatorProbe {
+
+    static final int PATHS = 40;
+
+    enum PaintMode { FLAT, GRADIENT, ALPHA, STROKED, DASHED }
+
+    /**
+     * PDF operator counts for one paint mode: cubic curves ({@code c}), shadings
+     * ({@code sh}), ExtGState sets ({@code gs}), clips ({@code W}), strokes
+     * ({@code S}/{@code s}) and dash-array sets ({@code d}).
+     */
+    record OperatorCounts(int curves, int shadings, int extGStates, int clips, int strokes, int dashes) {
+    }
+
+    public static void main(String[] args) throws Exception {
+        BenchmarkSupport.configureQuietLogging();
+
+        System.out.println("GraphCompose vector-paint render-operator probe (" + PATHS + " blob paths each)");
+        System.out.printf("%-10s | %6s | %6s | %6s | %6s | %6s | %6s%n", "Mode", "c", "sh", "gs", "W", "S", "d");
+        System.out.println("-".repeat(64));
+        for (PaintMode mode : PaintMode.values()) {
+            report(mode);
+        }
+        System.out.println();
+        System.out.println("c=cubic curve, sh=shading fill, gs=ExtGState (alpha), W=clip, S=stroke, d=dash set. "
+                + "Flat takes the fast fill path (no sh/gs/W/S/d); gradient adds sh+W per shape; alpha adds gs; "
+                + "stroked adds S per shape; dashed adds d+S per shape.");
+    }
+
+    private static void report(PaintMode mode) throws Exception {
+        OperatorCounts counts = countOperators(mode);
+        System.out.printf("%-10s | %6d | %6d | %6d | %6d | %6d | %6d%n",
+                mode.name().toLowerCase(),
+                counts.curves(),
+                counts.shadings(),
+                counts.extGStates(),
+                counts.clips(),
+                counts.strokes(),
+                counts.dashes());
+    }
+
+    /**
+     * Renders {@link #PATHS} blob paths in the given paint mode and counts the PDF
+     * operators. Exposed (package-visible) so {@code VectorRenderOperatorGateTest}
+     * can pin the per-mode cost structure: flat takes the fast fill path (no
+     * shading / alpha / clip), gradient adds a shading + clip per shape, alpha
+     * adds an ExtGState per shape — and a flat path must never take the heavier
+     * gradient branch.
+     *
+     * @param mode the paint mode to exercise
+     * @return the operator counts of the rendered document
+     * @throws Exception if rendering fails
+     */
+    static OperatorCounts countOperators(PaintMode mode) throws Exception {
+        byte[] pdf;
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
+            session.pageFlow(flow -> authorBlobs(flow, mode));
+            pdf = session.toPdfBytes();
+        }
+        try (PDDocument document = Loader.loadPDF(pdf)) {
+            return new OperatorCounts(
+                    count(document, "c"),
+                    count(document, "sh"),
+                    count(document, "gs"),
+                    count(document, "W"),
+                    count(document, "S") + count(document, "s"),
+                    count(document, "d"));
+        }
+    }
+
+    private static void authorBlobs(PageFlowBuilder flow, PaintMode mode) {
+        DocumentPaint gradient = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        DocumentColor flat = DocumentColor.rgb(40, 90, 160);
+        DocumentColor translucent = DocumentColor.rgb(40, 90, 160).withOpacity(0.5);
+        DocumentStroke stroke = DocumentStroke.of(DocumentColor.rgb(40, 90, 160), 2.0);
+        for (int i = 0; i < PATHS; i++) {
+            flow.addPath(p -> {
+                p.size(60, 36)
+                        .moveTo(0.0, 0.5)
+                        .curveTo(0.25, 1.0, 0.75, 1.0, 1.0, 0.5)
+                        .curveTo(0.75, 0.0, 0.25, 0.0, 0.0, 0.5)
+                        .closePath();
+                switch (mode) {
+                    case FLAT -> p.fillColor(flat);
+                    case GRADIENT -> p.fill(gradient);
+                    case ALPHA -> p.fillColor(translucent);
+                    case STROKED -> p.stroke(stroke);
+                    case DASHED -> p.stroke(stroke).dashed(4.0, 2.0);
+                }
+            });
+        }
+    }
+
+    private static int count(PDDocument document, String op) throws IOException {
+        int n = 0;
+        for (var page : document.getPages()) {
+            List<Object> tokens = new PDFStreamParser(page).parse();
+            for (Object token : tokens) {
+                if (token instanceof Operator operator && op.equals(operator.getName())) {
+                    n++;
+                }
+            }
+        }
+        return n;
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ChartJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartJmhBenchmark.java
new file mode 100644
index 000000000..760592853
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartJmhBenchmark.java
@@ -0,0 +1,79 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.ChartBenchmarkFixtures;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of a chart-heavy document — a
+ * grouped bar, a multi-series line (both 12 categories × 3 series) and a 6-slice
+ * pie — to PDF bytes. Charts compile into engine primitives at layout time, so
+ * this exercises {@code ChartLayoutResolver} + per-primitive geometry + label
+ * text-metrics on top of the normal compose / layout / render pipeline.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar Chart
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class ChartJmhBenchmark {
+
+    /**
+     * Builds the three-chart document and renders it to PDF bytes.
+     *
+     * @param blackhole JMH sink that consumes the rendered bytes
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderChartDocument(Blackhole blackhole) throws Exception {
+        blackhole.consume(renderDocument());
+    }
+
+    private static byte[] renderDocument() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            document.pageFlow()
+                    .name("ChartBenchmark")
+                    .spacing(12)
+                    .chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle())
+                    .chart(ChartBenchmarkFixtures.lineSpec(), ChartBenchmarkFixtures.lineStyle())
+                    .chart(ChartBenchmarkFixtures.pieSpec())
+                    .build();
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ChartVariantJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartVariantJmhBenchmark.java
new file mode 100644
index 000000000..efdc1ff67
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ChartVariantJmhBenchmark.java
@@ -0,0 +1,94 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.ChartBenchmarkFixtures;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.chart.ChartSpec;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of a single chart, parameterized
+ * over the chart-layout branches the resolver takes — grouped bar, horizontal
+ * bar, stacked bar, a non-zero value-axis minimum (lifted baseline), line, pie,
+ * and donut. {@code ChartJmhBenchmark} renders one grouped-bar + line + pie
+ * document; this isolates each distinct {@code ChartLayoutResolver} branch so a
+ * regression in, say, the stacking or horizontal-transpose geometry shows up on
+ * its own row rather than blended into a three-chart total.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar ChartVariant
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class ChartVariantJmhBenchmark {
+
+    @Param({"grouped-bar", "horizontal-bar", "stacked-bar", "axis-min-bar", "line", "pie", "donut"})
+    public String variant;
+
+    /** Resolved once per trial so the bench measures the render, not spec assembly. */
+    private ChartSpec spec;
+
+    @Setup
+    public void setUp() {
+        spec = switch (variant) {
+            case "grouped-bar" -> ChartBenchmarkFixtures.barSpec();
+            case "horizontal-bar" -> ChartBenchmarkFixtures.horizontalBarSpec();
+            case "stacked-bar" -> ChartBenchmarkFixtures.stackedBarSpec();
+            case "axis-min-bar" -> ChartBenchmarkFixtures.axisMinBarSpec();
+            case "line" -> ChartBenchmarkFixtures.lineSpec();
+            case "pie" -> ChartBenchmarkFixtures.pieSpec();
+            case "donut" -> ChartBenchmarkFixtures.donutSpec();
+            default -> throw new IllegalArgumentException("Unknown chart variant: " + variant);
+        };
+    }
+
+    /**
+     * Renders a one-chart document of the parameterized variant to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderChartVariant(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            document.pageFlow().name("ChartVariant").spacing(12).chart(spec).build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java
new file mode 100644
index 000000000..a21e3ddbc
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java
@@ -0,0 +1,141 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.CanonicalBenchmarkSupport;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.templates.builtins.InvoiceTemplateV1;
+import com.demcha.compose.document.templates.cv.presets.ModernProfessional;
+import com.demcha.compose.document.templates.cv.spec.CvSpec;
+import com.demcha.compose.document.templates.data.invoice.InvoiceDocumentSpec;
+import com.demcha.compose.document.theme.BusinessTheme;
+import com.demcha.compose.document.templates.api.DocumentTemplate;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH <em>single-shot</em> benchmark: the JIT-cold cost of the first PDF
+ * render in a fresh JVM. Every other JMH bench in this module reports
+ * steady-state ({@code AverageTime} after warmup), which is what a long-lived
+ * server pays — but a short-lived CLI invocation or a serverless (Lambda)
+ * cold-start pays the <em>first</em> render, with the layout and PDFBox classes
+ * unloaded and uncompiled. This bench measures exactly that.
+ *
+ * <p>{@code Mode.SingleShotTime} with {@code @Warmup(0)} and {@code @Measurement(1)}
+ * times a single invocation; {@code @Fork(10)} repeats it in ten fresh JVMs so the
+ * reported number is a distribution of cold first-renders, not one lucky start.
+ * The spec/template objects are built in {@link #setUp()} so the measured shot is
+ * the cold render path, not fixture assembly. Same workloads as the warm benches
+ * ({@code engine-simple} inline, {@code InvoiceTemplateV1}, {@code ModernProfessional})
+ * so cold and warm numbers are directly comparable.</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar ColdStart
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode(Mode.SingleShotTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 0)
+@Measurement(iterations = 1)
+@Fork(10)
+public class ColdStartJmhBenchmark {
+
+    private InvoiceDocumentSpec invoice;
+    private InvoiceTemplateV1 invoiceTemplate;
+    private CvSpec cv;
+    private DocumentTemplate<CvSpec> cvTemplate;
+
+    /** Builds the specs and templates once per fork, outside the measured cold shot. */
+    @Setup
+    public void setUp() {
+        invoice = CanonicalBenchmarkSupport.canonicalInvoice();
+        invoiceTemplate = new InvoiceTemplateV1();
+        cv = CanonicalBenchmarkSupport.canonicalCv();
+        cvTemplate = ModernProfessional.create(BusinessTheme.modern());
+    }
+
+    /**
+     * Cold first render of a small inline engine document.
+     *
+     * @return the rendered PDF bytes (consumed by JMH)
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public byte[] coldEngineSimple() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            document.pageFlow()
+                    .name("ColdEngineSimple")
+                    .spacing(10)
+                    .addParagraph("GraphCompose cold-start check")
+                    .addSection("Summary", section -> section
+                            .addParagraph("First render in a fresh JVM, layout and PDFBox classes cold."))
+                    .addSection("Body", section -> section
+                            .addParagraph("Structured business document composition.")
+                            .addParagraph("Semantic layout, pagination, deterministic output."))
+                    .build();
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Cold first render of the canonical invoice through {@code InvoiceTemplateV1}.
+     *
+     * @return the rendered PDF bytes (consumed by JMH)
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public byte[] coldInvoiceTemplate() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            invoiceTemplate.compose(document, invoice);
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Cold first render of the canonical CV through the {@code ModernProfessional} preset.
+     *
+     * @return the rendered PDF bytes (consumed by JMH)
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public byte[] coldCvTemplate() throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(36))
+                .create()) {
+            cvTemplate.compose(document, cv);
+            return document.toPdfBytes();
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/IconRampJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/IconRampJmhBenchmark.java
new file mode 100644
index 000000000..ec655616d
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/IconRampJmhBenchmark.java
@@ -0,0 +1,82 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.SvgBenchmarkFixtures;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.svg.SvgIcon;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: an "icon ramp" — place {@code N} copies of a
+ * multi-layer SVG icon (the realistic icon-grid / skills-ribbon workload) and
+ * render to PDF. Parameterized over N so the trend (node-build + layout +
+ * render per icon) is visible; the icon is parsed once in setup so the ramp
+ * measures placement scaling, not re-parsing.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar IconRamp
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class IconRampJmhBenchmark {
+
+    @Param({"8", "32", "128"})
+    public int iconCount;
+
+    /** Parsed once: the ramp measures node-build + layout + render scaling, not re-parsing. */
+    private final SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+    /**
+     * Places {@code iconCount} icons in a flow and renders the document.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderIconRamp(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(24))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("IconRamp").spacing(4);
+            for (int i = 0; i < iconCount; i++) {
+                flow.addSvgIcon(icon, 32);
+            }
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
new file mode 100644
index 000000000..2b05b1d09
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
@@ -0,0 +1,92 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.ImageBenchmarkFixtures;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.image.DocumentImageData;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.IntStream;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of an image-heavy document — a
+ * dozen distinct raster images placed at thumbnail size — to PDF bytes. Drawing
+ * below 50% of native resolution drives {@code PdfImageCache}'s downscale path
+ * ({@code ImageIO} decode + bicubic rescale + re-encode + embed), so this covers
+ * the raster embed/scale hot path that no other bench touches.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar Image
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class ImageJmhBenchmark {
+
+    private static final int IMAGES = 12;
+
+    /** Distinct images built once in setup; the bench measures render, not image synthesis. */
+    private List<DocumentImageData> images;
+
+    @Setup
+    public void setUp() {
+        images = IntStream.range(0, IMAGES)
+                .mapToObj(ImageBenchmarkFixtures::distinctImage)
+                .toList();
+    }
+
+    /**
+     * Renders the image-heavy document to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderImageDocument(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("ImageBenchmark").spacing(8);
+            for (DocumentImageData image : images) {
+                // 60x33 pt -> ~120x66 px target at 144 DPI, i.e. <50% of the
+                // 360x200 native, so the cache builds a downscaled variant.
+                flow.addImage(spec -> spec.source(image).size(60, 33));
+            }
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java
new file mode 100644
index 000000000..82f45a20a
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/LargeTableJmhBenchmark.java
@@ -0,0 +1,96 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.table.DocumentTableColumn;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of a production-scale priced
+ * table that paginates across many pages, parameterized over row count so the
+ * scaling trend of large-table pagination + render is visible.
+ *
+ * <p>The rest of the suite renders small documents; nothing measured how the
+ * engine handles a genuinely large multi-page table (the existing
+ * {@code TablePaginationAllocProbe} measures layout-compile allocation only, not
+ * end-to-end render). The header repeats on every page (the realistic report
+ * layout), so this exercises per-page header re-emission as well as row layout
+ * and slicing at scale.</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar LargeTable
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class LargeTableJmhBenchmark {
+
+    @Param({"100", "500", "1000"})
+    public int rows;
+
+    /**
+     * Renders the priced table document to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderLargeTable(Blackhole blackhole) throws Exception {
+        // Equal full-width columns (page width minus the 28pt L/R margins, split
+        // five ways) so the table fills the page like a real report, not its text.
+        final double columnWidth = (DocumentPageSize.A4.width() - 2 * 28) / 5.0;
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            document.pageFlow(flow -> {
+                flow.name("LargeTable").spacing(8);
+                flow.addParagraph("Priced line items");
+                flow.addTable(t -> {
+                    t.columns(
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth),
+                            DocumentTableColumn.fixed(columnWidth))
+                            .header("#", "Item", "Qty", "Unit", "Total").repeatHeader();
+                    for (int r = 1; r <= rows; r++) {
+                        t.row(String.valueOf(r), "Line item " + r, "3", "12.50", "37.50");
+                    }
+                });
+            });
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/MixedShowcaseJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/MixedShowcaseJmhBenchmark.java
new file mode 100644
index 000000000..ae139a705
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/MixedShowcaseJmhBenchmark.java
@@ -0,0 +1,95 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.ChartBenchmarkFixtures;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.SvgBenchmarkFixtures;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.svg.SvgIcon;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: a representative "v1.8 showcase" document that
+ * mixes every new vector feature in one render — running prose with two inline
+ * sparklines, a grouped bar chart and a pie chart, a row of SVG icons, and
+ * gradient accent paths. This is the integration canary: it answers "did adding
+ * any v1.8 feature blow up a realistic document?" in one number.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar MixedShowcase
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class MixedShowcaseJmhBenchmark {
+
+    private static final int ICONS = 8;
+
+    /** Parsed once; the bench measures the mixed render, not icon parsing. */
+    private final SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+    /**
+     * Renders the mixed v1.8 showcase document to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderMixedShowcase(Blackhole blackhole) throws Exception {
+        DocumentPaint accent = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(32))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("MixedShowcase").spacing(12);
+            flow.addParagraph("v1.8 feature showcase");
+            flow.addRich(r -> r
+                    .plain("Revenue ")
+                    .sparkline(42, 9, DocumentColor.rgb(20, 80, 95), 65.2, 69.8, 74.1, 81.3, 88.2)
+                    .plain("   profit ")
+                    .sparklineLine(42, 9, 1.6, DocumentColor.rgb(196, 153, 76), 28.1, 30.7, 32.9, 36.4, 39.5));
+            flow.chart(ChartBenchmarkFixtures.barSpec(), ChartBenchmarkFixtures.barStyle());
+            flow.chart(ChartBenchmarkFixtures.pieSpec());
+            for (int i = 0; i < ICONS; i++) {
+                flow.addSvgIcon(icon, 32);
+            }
+            flow.addPath(p -> p.size(220, 28)
+                    .moveTo(0.0, 0.5).curveTo(0.25, 1.0, 0.75, 0.0, 1.0, 0.5).fill(accent));
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/SparklineRampJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/SparklineRampJmhBenchmark.java
new file mode 100644
index 000000000..492aba9f1
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/SparklineRampJmhBenchmark.java
@@ -0,0 +1,83 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: a "sparkline ramp" — a rich paragraph carrying
+ * {@code N} inline sparklines — rendered to PDF, parameterized over N so the
+ * per-sparkline inline-fragment cost (build + layout + vector draw) is visible.
+ * Sparklines were otherwise only exercised once inside
+ * {@code MixedShowcaseJmhBenchmark}, where a regression would dilute into the
+ * surrounding charts and icons; this isolates and scales them.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar SparklineRamp
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class SparklineRampJmhBenchmark {
+
+    private static final DocumentColor ACCENT = DocumentColor.rgb(20, 80, 95);
+
+    @Param({"8", "32", "128"})
+    public int sparklineCount;
+
+    /**
+     * Renders a paragraph of {@code sparklineCount} inline sparklines to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderSparklineRamp(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("SparklineRamp").spacing(4);
+            flow.addRich(r -> {
+                for (int i = 0; i < sparklineCount; i++) {
+                    r.plain("m ").sparkline(42, 9, ACCENT, 65.2, 69.8, 74.1, 81.3, 88.2);
+                }
+            });
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
new file mode 100644
index 000000000..58ed3f99f
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
@@ -0,0 +1,98 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.SvgBenchmarkFixtures;
+import com.demcha.compose.document.svg.SvgIcon;
+import com.demcha.compose.document.svg.SvgPath;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark for the v1.8 SVG-import surface — the first
+ * feature-object benchmark (the rest of the suite renders text / tables only).
+ *
+ * <p>Three measured operations, all pure CPU + allocation (no
+ * {@code DocumentSession}, no PDF render):</p>
+ * <ul>
+ *   <li>{@code parseSvgPath} — {@link SvgPath#parse} of a real Material icon
+ *       {@code d} string (tokenize, relative/absolute resolution, cubic/line
+ *       lowering, viewBox normalization).</li>
+ *   <li>{@code readSvgIcon} — {@link SvgIcon#parse} of a multi-layer icon (XML
+ *       parse, {@code <g>} transform accumulation, gradient resolution, one
+ *       {@link SvgPath} per layer).</li>
+ *   <li>{@code svgIconToNode} — {@link SvgIcon#node} on a pre-parsed icon (the
+ *       {@code PathNode} / layer-stack allocation done once per placement).</li>
+ * </ul>
+ *
+ * <p>Microsecond-scale work, so it needs the forked, JIT-stable JMH harness
+ * (an {@code exec:java} run cannot fork). Build the runner jar and run:</p>
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar Svg
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class SvgJmhBenchmark {
+
+    /** Parsed once so {@code svgIconToNode} measures only the node-build cost. */
+    private final SvgIcon icon = SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG);
+
+    /**
+     * Parses a real icon path-data string into normalized segments.
+     *
+     * @param blackhole JMH sink
+     */
+    @Benchmark
+    public void parseSvgPath(Blackhole blackhole) {
+        blackhole.consume(SvgPath.parse(
+                SvgBenchmarkFixtures.MATERIAL_HEART_D,
+                0, 0, SvgBenchmarkFixtures.HEART_VIEWBOX, SvgBenchmarkFixtures.HEART_VIEWBOX));
+    }
+
+    /**
+     * Reads a whole multi-layer SVG icon (XML parse → layers).
+     *
+     * @param blackhole JMH sink
+     */
+    @Benchmark
+    public void readSvgIcon(Blackhole blackhole) {
+        blackhole.consume(SvgIcon.parse(SvgBenchmarkFixtures.MULTI_LAYER_ICON_SVG));
+    }
+
+    /**
+     * Builds a placeable node (path nodes + layer stack) from a parsed icon.
+     *
+     * @param blackhole JMH sink
+     */
+    @Benchmark
+    public void svgIconToNode(Blackhole blackhole) {
+        blackhole.consume(icon.node(48.0));
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java
new file mode 100644
index 000000000..382ad4d57
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/VectorPaintJmhBenchmark.java
@@ -0,0 +1,109 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.style.DocumentColor;
+import com.demcha.compose.document.style.DocumentPaint;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH micro-benchmark: render {@code N} identical curved blob paths in one
+ * paint mode — flat solid fill, linear gradient, or translucent (alpha) fill —
+ * parameterized over the mode, so the render-<em>time</em> cost of each vector
+ * paint branch is isolated. This is the timing complement to
+ * {@code VectorRenderOperatorProbe} (which counts the PDF operators each mode
+ * emits): gradient shading and alpha ExtGState are heavier than the flat fast
+ * fill path, and this puts a millisecond number on that.
+ *
+ * <pre>
+ *   ./mvnw -f benchmarks/pom.xml clean package -DskipTests
+ *   java -jar benchmarks/target/benchmarks.jar VectorPaint
+ * </pre>
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class VectorPaintJmhBenchmark {
+
+    private static final int PATHS = 40;
+
+    @Param({"flat", "gradient", "alpha"})
+    public String paint;
+
+    private DocumentPaint gradient;
+    private DocumentColor flat;
+    private DocumentColor translucent;
+
+    /** Paint objects built once per trial, outside the measured render. */
+    @Setup
+    public void setUp() {
+        gradient = DocumentPaint.linear(
+                DocumentColor.rgb(167, 139, 250), DocumentColor.rgb(97, 40, 217));
+        flat = DocumentColor.rgb(40, 90, 160);
+        translucent = DocumentColor.rgb(40, 90, 160).withOpacity(0.5);
+    }
+
+    /**
+     * Renders {@code PATHS} blob paths in the parameterized paint mode to PDF bytes.
+     *
+     * @param blackhole JMH sink
+     * @throws Exception if rendering fails
+     */
+    @Benchmark
+    public void renderVectorPaint(Blackhole blackhole) throws Exception {
+        try (DocumentSession document = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4)
+                .margin(DocumentInsets.of(28))
+                .create()) {
+            PageFlowBuilder flow = document.pageFlow().name("VectorPaint").spacing(4);
+            for (int i = 0; i < PATHS; i++) {
+                flow.addPath(p -> {
+                    p.size(60, 36)
+                            .moveTo(0.0, 0.5)
+                            .curveTo(0.25, 1.0, 0.75, 1.0, 1.0, 0.5)
+                            .curveTo(0.75, 0.0, 0.25, 0.0, 0.0, 0.5)
+                            .closePath();
+                    switch (paint) {
+                        case "flat" -> p.fillColor(flat);
+                        case "gradient" -> p.fill(gradient);
+                        case "alpha" -> p.fillColor(translucent);
+                        default -> throw new IllegalArgumentException("Unknown paint mode: " + paint);
+                    }
+                });
+            }
+            flow.build();
+            blackhole.consume(document.toPdfBytes());
+        }
+    }
+
+    /**
+     * Runs the JMH harness over this benchmark.
+     *
+     * @param args JMH CLI arguments
+     * @throws Exception if the JMH runner fails
+     */
+    public static void main(String[] args) throws Exception {
+        org.openjdk.jmh.Main.main(args);
+    }
+}
diff --git a/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java b/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
index 783ad2479..d3319131c 100644
--- a/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
@@ -93,6 +93,35 @@ void currentSpeedDiffKeepsOnlyScenariosPresentInBothRuns() throws Exception {
         assertThat(diff.path("throughput").get(0).path("scenario").asText()).isEqualTo("shared");
     }
 
+    @Test
+    void currentSpeedDiffSurfacesAddedRemovedScenariosAndStageDeltas() throws Exception {
+        System.setProperty("graphcompose.benchmark.root", tempDir.toString());
+        Path baseline = write("baseline.json", currentSpeedWithStages("full",
+                latency("shared", 10.0, 10.0, 100.0, 1.0, 100.0) + ","
+                        + latency("only-in-baseline", 10.0, 10.0, 100.0, 1.0, 100.0),
+                stage("shared", 1.0, 2.0, 4.0, 7.0),
+                throughput("shared", 1, 50.0, 20.0)));
+        Path candidate = write("candidate.json", currentSpeedWithStages("full",
+                latency("shared", 10.0, 10.0, 100.0, 1.0, 100.0) + ","
+                        + latency("only-in-candidate", 5.0, 5.0, 200.0, 0.5, 90.0),
+                stage("shared", 1.0, 2.0, 8.0, 11.0),
+                throughput("shared", 1, 50.0, 20.0)));
+
+        BenchmarkDiffTool.main(new String[]{baseline.toString(), candidate.toString()});
+
+        JsonNode diff = readDiff("current-speed");
+        // Loud set-changes: one-sided scenarios are surfaced, not silently dropped.
+        assertThat(toStrings(diff.path("addedScenarios"))).containsExactly("only-in-candidate");
+        assertThat(toStrings(diff.path("removedScenarios"))).containsExactly("only-in-baseline");
+        // The shared scenario is still the only intersected latency delta row.
+        assertThat(diff.path("latency").size()).isEqualTo(1);
+        // Stage diff: render 4 -> 8 = +100%, compose unchanged.
+        JsonNode stageDiff = diff.path("stages").get(0);
+        assertThat(stageDiff.path("scenario").asText()).isEqualTo("shared");
+        assertThat(stageDiff.path("renderDeltaPct").asDouble()).isCloseTo(100.0, within(EPS));
+        assertThat(stageDiff.path("composeDeltaPct").asDouble()).isCloseTo(0.0, within(EPS));
+    }
+
     @Test
     void currentSpeedDiffTreatsZeroBaselineAsHundredPercentAndZeroToZeroAsZero() throws Exception {
         System.setProperty("graphcompose.benchmark.root", tempDir.toString());
@@ -228,6 +257,38 @@ private static String latency(String scenario,
                 """.formatted(scenario, scenario, avgMillis, p95Millis, docsPerSecond, avgKilobytes, peakHeapMb);
     }
 
+    private static String currentSpeedWithStages(String profile, String latencyItems,
+                                                 String stageItems, String throughputItems) {
+        return """
+                {
+                  "timestamp": "2026-04-14 21:00:00",
+                  "profile": "%s",
+                  "latency": [%s],
+                  "stages": [%s],
+                  "throughput": [%s]
+                }
+                """.formatted(profile, latencyItems, stageItems, throughputItems);
+    }
+
+    private static String stage(String scenario, double composeMs, double layoutMs,
+                                double renderMs, double totalMs) {
+        return """
+                {
+                  "scenario": "%s",
+                  "composeMillis": %s,
+                  "layoutMillis": %s,
+                  "renderMillis": %s,
+                  "totalMillis": %s
+                }
+                """.formatted(scenario, composeMs, layoutMs, renderMs, totalMs);
+    }
+
+    private static java.util.List<String> toStrings(JsonNode array) {
+        java.util.List<String> values = new java.util.ArrayList<>();
+        array.forEach(node -> values.add(node.asText()));
+        return values;
+    }
+
     private static String throughput(String scenario, int threads, double docsPerSecond, double avgMillisPerDoc) {
         return """
                 {
diff --git a/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java b/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
index ee449b4b7..c1bc150b8 100644
--- a/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/BenchmarkMedianToolTest.java
@@ -148,6 +148,11 @@ void shouldWriteMedianCurrentSpeedAggregateForRepeatedRuns() throws Exception {
         assertThat(aggregate.path("latency").get(0).path("peakHeapMb").asDouble()).isEqualTo(120.0);
         assertThat(aggregate.path("throughput").get(0).path("docsPerSecond").asDouble()).isEqualTo(40.0);
         assertThat(aggregate.path("totalBytes").asLong()).isEqualTo(2000L);
+        // None of these runs carried a stages[] (smoke < 20 iters emits none), so the
+        // lenient aggregation must omit stages without throwing.
+        assertThat(aggregate.path("stages").isEmpty())
+                .as("median omits stages when no source run carries them")
+                .isTrue();
     }
 
     @Test
@@ -209,4 +214,52 @@ void shouldWriteMedianComparativeAggregateForRepeatedRuns() throws Exception {
         assertThat(aggregate.path("libraries").get(1).path("avgHeapMb").asDouble()).isEqualTo(0.25);
     }
 
+    @Test
+    void shouldMedianStagesWhenSourceRunsCarryThem() throws Exception {
+        System.setProperty("graphcompose.benchmark.root", tempDir.toString());
+
+        Path suiteDir = Files.createDirectories(tempDir.resolve("current-speed"));
+        // Three runs whose render stage is 10 / 20 / 30 (median 20) and total
+        // 13 / 23 / 33 (median 23); compose/layout are constant (median 1 / 2).
+        double[] renders = {10.0, 20.0, 30.0};
+        String[] paths = new String[renders.length];
+        for (int i = 0; i < renders.length; i++) {
+            double render = renders[i];
+            double total = render + 3.0;
+            Path run = suiteDir.resolve("run-20260415-2200" + i + "0.json");
+            Files.writeString(run, """
+                    {
+                      "profile": "full",
+                      "warmupIterations": 12,
+                      "measurementIterations": 40,
+                      "docsPerThread": 12,
+                      "threadCounts": [1],
+                      "latency": [
+                        {"scenario": "invoice-template", "description": "Invoice", "avgMillis": %1$s,
+                         "p50Millis": 0.0, "p95Millis": 0.0, "maxMillis": 0.0, "docsPerSecond": 0.0,
+                         "avgKilobytes": 0.0, "peakHeapMb": 0.0}
+                      ],
+                      "stages": [
+                        {"scenario": "invoice-template", "composeMillis": 1.0, "layoutMillis": 2.0,
+                         "renderMillis": %1$s, "totalMillis": %2$s}
+                      ],
+                      "throughput": [],
+                      "totalBytes": 1000
+                    }
+                    """.formatted(render, total));
+            paths[i] = run.toString();
+        }
+
+        BenchmarkMedianTool.main(new String[]{"current-speed", paths[0], paths[1], paths[2]});
+
+        JsonNode aggregate = JSON.readTree(
+                Files.readAllBytes(tempDir.resolve("aggregates/current-speed/full/latest.json")));
+
+        JsonNode stage = aggregate.path("stages").get(0);
+        assertThat(stage.path("scenario").asText()).isEqualTo("invoice-template");
+        assertThat(stage.path("composeMillis").asDouble()).isEqualTo(1.0);
+        assertThat(stage.path("renderMillis").asDouble()).isEqualTo(20.0);
+        assertThat(stage.path("totalMillis").asDouble()).isEqualTo(23.0);
+    }
+
 }
diff --git a/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java
index cae8d91f0..6a1efc07a 100644
--- a/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedBenchmarkPerfGateTest.java
@@ -53,14 +53,16 @@ void failsWhenAverageLatencyExceedsThreshold() {
     }
 
     @Test
-    void failsWhenPeakHeapExceedsThreshold() {
+    void treatsPeakHeapAsAdvisoryNotAGateFailure() {
         CurrentSpeedBenchmark.PerformanceGateResult result =
                 CurrentSpeedBenchmark.evaluatePerformanceGate(
                         CurrentSpeedBenchmark.BenchmarkProfile.SMOKE,
-                        List.of(latency(ENGINE_SIMPLE, 1.0, 999.0))); // 999 > 96
+                        List.of(latency(ENGINE_SIMPLE, 1.0, 999.0))); // heap 999 > 96, avg 1.0 ok
 
-        assertThat(result.passed()).isFalse();
-        assertThat(result.message()).contains("peak heap");
+        assertThat(result.passed())
+                .as("peak heap is GC-noisy and advisory — a heap-only breach must not fail the gate")
+                .isTrue();
+        assertThat(result.message()).contains("peak heap").contains("advisory");
     }
 
     @Test
diff --git a/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedScenarioGateTest.java b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedScenarioGateTest.java
new file mode 100644
index 000000000..da7296d45
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/CurrentSpeedScenarioGateTest.java
@@ -0,0 +1,35 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Guards that every CurrentSpeed latency scenario is covered by a SMOKE gate
+ * threshold.
+ *
+ * <p>The smoke perf gate silently ignores a scenario that has no configured
+ * threshold (by design — see
+ * {@link CurrentSpeedBenchmarkPerfGateTest#ignoresScenariosWithoutAConfiguredThreshold()}).
+ * That defensive behaviour means a newly added scenario would escape the gate
+ * unnoticed. This test makes the omission fail loudly instead: adding a scenario
+ * to {@code SCENARIO_DEFS} without a matching {@code SMOKE} threshold breaks the
+ * build.</p>
+ */
+class CurrentSpeedScenarioGateTest {
+
+    @Test
+    void everyScenarioHasASmokeGateThreshold() {
+        var gated = CurrentSpeedBenchmark.BenchmarkProfile.SMOKE.smokeThresholds().keySet();
+
+        List<String> ungated = CurrentSpeedBenchmark.scenarioNames().stream()
+                .filter(name -> !gated.contains(name))
+                .toList();
+
+        assertThat(ungated)
+                .as("CurrentSpeed scenarios missing a SMOKE gate threshold")
+                .isEmpty();
+    }
+}
diff --git a/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java b/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java
new file mode 100644
index 000000000..e28a2d9c9
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java
@@ -0,0 +1,49 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Deterministic regression gate for {@code PdfImageCache} dedup, driving
+ * {@link ImageCacheOperatorProbe}'s render + count helpers.
+ *
+ * <p>The cache keys embedded image XObjects by content fingerprint, so the same
+ * image placed many times must embed once (referenced by many draws) while
+ * distinct images embed once each. Counting the embedded XObjects in the output
+ * PDF makes that structural invariant a build-failing assertion — a regression
+ * that re-embeds the same image per placement (PDF bloat) breaks this test
+ * rather than silently passing CI.</p>
+ */
+class ImageCacheGateTest {
+
+    @Test
+    void sameImageEmbedsOnceRegardlessOfPlacements() throws Exception {
+        int placements = 30;
+
+        ImageCacheOperatorProbe.EmbedCounts counts =
+                ImageCacheOperatorProbe.countPdf(ImageCacheOperatorProbe.renderSameImage(placements));
+
+        assertThat(counts.embeds())
+                .as("the same image placed %d times must embed exactly one XObject", placements)
+                .isEqualTo(1);
+        assertThat(counts.draws())
+                .as("each placement must still draw the cached image")
+                .isGreaterThanOrEqualTo(placements);
+    }
+
+    @Test
+    void distinctImagesEachEmbedOnce() throws Exception {
+        int distinct = 8;
+
+        ImageCacheOperatorProbe.EmbedCounts counts =
+                ImageCacheOperatorProbe.countPdf(ImageCacheOperatorProbe.renderDistinctImages(distinct));
+
+        assertThat(counts.embeds())
+                .as("%d distinct images must embed %d XObjects (no over-dedup)", distinct, distinct)
+                .isEqualTo(distinct);
+        assertThat(counts.draws())
+                .as("each distinct image must be drawn")
+                .isGreaterThanOrEqualTo(distinct);
+    }
+}
diff --git a/benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java b/benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java
new file mode 100644
index 000000000..01807d4bf
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java
@@ -0,0 +1,41 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Deterministic regression gate for the F5 render-operator coalescing, driving
+ * {@link RenderOperatorProbe#countOperators}.
+ *
+ * <p>Before F5 the paragraph handler emitted one {@code setFont} (Tf) and one
+ * non-stroking-colour op per text-show, so font/colour ops scaled 1:1 with the
+ * per-line {@code Tj}/{@code TJ} draws. After F5 they are coalesced, so a single
+ * styled paragraph that wraps to many lines emits far fewer Tf/colour ops than
+ * draws. Asserting {@code tf < draws} and {@code rg < draws} pins that
+ * structural win as a build-failing check — a regression back to per-span font
+ * ops (bloated content streams) breaks this test instead of passing CI. The
+ * assertion is content-independent: it does not hardcode brittle exact counts.</p>
+ */
+class RenderOperatorGateTest {
+
+    private static final String LONG_PARAGRAPH =
+            ("GraphCompose lays out structured business documents across many pages "
+                    + "while keeping header and footer placement stable. ").repeat(30);
+
+    @Test
+    void fontAndColourOpsStayCoalescedBelowTextDraws() throws Exception {
+        RenderOperatorProbe.OpCounts counts =
+                RenderOperatorProbe.countOperators(flow -> flow.addParagraph(LONG_PARAGRAPH));
+
+        assertThat(counts.draws())
+                .as("a long paragraph must wrap to many text-show ops")
+                .isGreaterThanOrEqualTo(10);
+        assertThat(counts.tf())
+                .as("setFont ops must be coalesced below the per-line draw count (F5), not 1:1")
+                .isLessThan(counts.draws());
+        assertThat(counts.rg())
+                .as("non-stroking colour ops must be coalesced below the per-line draw count (F5)")
+                .isLessThan(counts.draws());
+    }
+}
diff --git a/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java b/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java
new file mode 100644
index 000000000..5aa3d91ef
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/VectorRenderOperatorGateTest.java
@@ -0,0 +1,85 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Deterministic regression gate for the v1.8 vector-paint render branches,
+ * driving {@link VectorRenderOperatorProbe#countOperators}.
+ *
+ * <p>A flat fill takes the fast path (no shading / alpha / clip); a linear
+ * gradient clips to the shape and paints a shading (one {@code W} clip + one
+ * {@code sh} per shape); a translucent fill sets an ExtGState alpha (one
+ * {@code gs} per shape). Pinning these operator counts makes a regression — a
+ * flat path accidentally taking the heavier gradient branch, or the gradient
+ * clip/shading being dropped — a build failure rather than a silent CI pass.</p>
+ */
+class VectorRenderOperatorGateTest {
+
+    @Test
+    void flatFillTakesTheFastPathWithNoShadingAlphaOrClip() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts flat =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.FLAT);
+
+        assertThat(flat.curves()).as("flat paths still emit curve operators").isGreaterThan(0);
+        assertThat(flat.shadings()).as("flat fill must not paint a shading").isZero();
+        assertThat(flat.extGStates()).as("flat fill must not set an ExtGState alpha").isZero();
+        assertThat(flat.clips()).as("flat fill must not clip").isZero();
+        assertThat(flat.strokes()).as("a flat fill must not stroke").isZero();
+        assertThat(flat.dashes()).as("a flat fill must not set a dash array").isZero();
+    }
+
+    @Test
+    void strokedPathStrokesOncePerShapeWithoutFillPaint() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts stroked =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.STROKED);
+
+        assertThat(stroked.strokes())
+                .as("a stroked path strokes once per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(stroked.dashes()).as("a solid stroke sets no dash array").isZero();
+        assertThat(stroked.shadings()).as("a stroke must not paint a shading").isZero();
+        assertThat(stroked.extGStates()).as("a stroke must not set an ExtGState alpha").isZero();
+    }
+
+    @Test
+    void dashedStrokeSetsADashArrayPerShape() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts dashed =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.DASHED);
+
+        assertThat(dashed.dashes())
+                .as("a dashed stroke sets a dash array once per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(dashed.strokes())
+                .as("a dashed path still strokes once per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(dashed.shadings()).as("a dashed stroke must not paint a shading").isZero();
+    }
+
+    @Test
+    void gradientFillClipsAndShadesOncePerShape() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts gradient =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.GRADIENT);
+
+        assertThat(gradient.shadings())
+                .as("a linear gradient paints one shading per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(gradient.clips())
+                .as("a gradient clips to each shape before shading")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+    }
+
+    @Test
+    void translucentFillSetsOneExtGStatePerShape() throws Exception {
+        VectorRenderOperatorProbe.OperatorCounts alpha =
+                VectorRenderOperatorProbe.countOperators(VectorRenderOperatorProbe.PaintMode.ALPHA);
+
+        assertThat(alpha.extGStates())
+                .as("a translucent fill sets one ExtGState alpha per shape")
+                .isEqualTo(VectorRenderOperatorProbe.PATHS);
+        assertThat(alpha.shadings())
+                .as("a translucent solid fill must not paint a shading")
+                .isZero();
+    }
+}
diff --git a/docs/operations/benchmarks.md b/docs/operations/benchmarks.md
index 315f4d523..1c6bd6d75 100644
--- a/docs/operations/benchmarks.md
+++ b/docs/operations/benchmarks.md
@@ -36,15 +36,17 @@ The script prints numbered sections so you can map console output to the pipelin
 1. `01-build-classpath`
    Builds the test classpath once and writes `target/benchmark.classpath`.
 2. `02-current-speed`
-   Runs `CurrentSpeedBenchmark` in the selected profile.
+   Runs `CurrentSpeedBenchmark` in the selected profile. The full profile also
+   runs the thread-scaling throughput sweep (1 → 16 threads).
 3. `03-comparative`
-   Runs the GraphCompose canonical vs iText 5 vs JasperReports comparison.
-4. `04-core-engine`
-   Runs `GraphComposeBenchmark`.
-5. `05-full-cv`
-   Runs `FullCvBenchmark`.
-6. `06-scalability`
-   Runs the thread-scaling throughput benchmark.
+   Runs the GraphCompose canonical vs iText 9 vs JasperReports comparison: a
+   small-invoice tier plus a report-scaling sweep (40 / 200 / 1000 rows) that
+   prints a per-size GraphCompose-advantage ratio and dumps a sample PDF per
+   library/size.
+
+   _Steps 04–06 (`core-engine`, `full-cv`, `scalability`) were retired. The
+   surviving steps keep their original `NN-` console prefixes, so the labels
+   jump from `03-` to `07-`._
 7. `07-stress`
    Runs the concurrent stability stress test.
 8. `08-endurance`
diff --git a/docs/operations/performance.md b/docs/operations/performance.md
index ecf02c5b7..7fc02d480 100644
--- a/docs/operations/performance.md
+++ b/docs/operations/performance.md
@@ -1,7 +1,13 @@
 # Performance — v1.4 numbers
 
-All numbers below come from `scripts/run-benchmarks.ps1` — the full local
-benchmark workflow that builds the test classpath once and runs
+> **Historical snapshot (v1.4).** The numbers and suite list below are frozen
+> as captured for v1.4 and are kept for reference. The pipeline has since
+> changed: the `core-engine`, `full-cv`, and `scalability` suites were retired,
+> and current numbers come from the `current-speed` / `comparative` / `stress`
+> pipeline plus the JMH suite. See [docs/operations/benchmarks.md](./benchmarks.md).
+
+All numbers below were captured from `scripts/run-benchmarks.ps1` — the full
+local benchmark workflow that built the test classpath once and ran
 `current-speed`, `comparative`, `core-engine`, `full-cv`, `scalability`,
 and `stress` suites in sequence. They were captured on a developer
 laptop; CI machines are typically 1.5–2× slower. The benchmark
@@ -93,5 +99,9 @@ snapshots.
 
 ## Engine-only timings
 
+_The `GraphComposeBenchmark` and `FullCvBenchmark` mains below were retired
+after v1.4. Equivalent timings now come from the `CurrentSpeedBenchmark`
+`engine-simple` scenario and the JMH `TemplateCvJmhBenchmark`._
+
 - `GraphComposeBenchmark` (engine-only, no PDF render): avg **1.04 ms**, p50 **0.97 ms**, p95 **1.64 ms**.
 - `FullCvBenchmark` (full CV template, including render): avg **4.14 ms**, p50 **3.80 ms**, p95 **6.37 ms**.
diff --git a/scripts/ab-bench.ps1 b/scripts/ab-bench.ps1
index 5a3e4eb42..a237ec203 100644
--- a/scripts/ab-bench.ps1
+++ b/scripts/ab-bench.ps1
@@ -110,21 +110,10 @@ function Parse-Comparative($jsonPath) {
 }
 function Parse-Logs($logsDir) {
     $o = @{}
-    $scal = Join-Path $logsDir "06-scalability.log"
-    if (Test-Path $scal) {
-        foreach ($line in (Get-Content $scal)) {
-            if ($line -match '^\s*(\d+)\s*\|\s*\d+\s*\|\s*([\d.]+)\s*$') {
-                $o["scalability | $($matches[1])t | docs/s"] = [double]$matches[2]
-            }
-        }
-    }
-    foreach ($pair in @(@("04-core-engine.log", "core-engine"), @("05-full-cv.log", "full-cv"))) {
-        $p = Join-Path $logsDir $pair[0]
-        if (Test-Path $p) {
-            $txt = Get-Content $p -Raw
-            if ($txt -match 'Median[^\r\n]*?:\s*([\d.]+)\s*ms') { $o["$($pair[1]) | median ms"] = [double]$matches[1] }
-        }
-    }
+    # Steps 04-06 (core-engine, full-cv, scalability) were retired, so their logs
+    # are no longer produced. Current-speed throughput — including the
+    # thread-scaling series — is read from the JSON report by Parse-CurrentSpeed;
+    # only the surviving stress log is parsed here.
     $stress = Join-Path $logsDir "07-stress.log"
     if (Test-Path $stress) {
         $txt = Get-Content $stress -Raw
diff --git a/scripts/run-benchmarks.ps1 b/scripts/run-benchmarks.ps1
index dbe162c08..a0dd2c777 100644
--- a/scripts/run-benchmarks.ps1
+++ b/scripts/run-benchmarks.ps1
@@ -5,8 +5,10 @@ Runs the local GraphCompose benchmark pipeline and stores timestamped logs and r
 
 .DESCRIPTION
 The wrapper performs a staged local run:
-01 build classpath, 02 current-speed, 03 comparative, 04 core engine, 05 full CV, 06 scalability,
-07 stress, optional 08 endurance, then 09/10 diff steps.
+01 build classpath, 02 current-speed, 03 comparative, 07 stress,
+optional 08 endurance, then 09/10 diff and 11 verdict steps. Steps 04-06
+(core-engine, full-cv, scalability) were retired; the surviving steps keep
+their original numeric prefixes, so the numbering jumps from 03 to 07.
 
 Current-speed diffs are profile-aware. The wrapper only compares reports
 from the same current-speed profile (`smoke` or `full`) and skips the
@@ -368,9 +370,6 @@ try {
                 -InputPaths $comparativeRuns | Out-Null
         }
 
-        Invoke-JavaMain -Name "04-core-engine" -Classpath $javaClasspath -MainClass "com.demcha.compose.GraphComposeBenchmark"
-        Invoke-JavaMain -Name "05-full-cv" -Classpath $javaClasspath -MainClass "com.demcha.compose.FullCvBenchmark"
-        Invoke-JavaMain -Name "06-scalability" -Classpath $javaClasspath -MainClass "com.demcha.compose.ScalabilityBenchmark"
         Invoke-JavaMain -Name "07-stress" -Classpath $javaClasspath -MainClass "com.demcha.compose.GraphComposeStressTest"
 
         if ($IncludeEndurance) {