From 917f3ce440b30b7a0d51f911f5e4b7fc0381ecb9 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Sun, 14 Jun 2026 19:02:44 +0100
Subject: [PATCH 01/36] chore(benchmarks): remove three redundant benchmark
mains
FullCvBenchmark duplicated the JMH TemplateCvJmhBenchmark (CV through
ModernProfessional) with a hand-rolled, JIT-noisier loop and no report.
GraphComposeBenchmark was an early-engine relic measuring the same
title+body+divider doc as CurrentSpeedBenchmark's engine-simple scenario.
ScalabilityBenchmark's thread-scaling sweep is folded into
CurrentSpeedBenchmark's full-profile throughput run (thread counts now
1,2,4,8,16).
Drop the matching run-benchmarks.ps1 steps and the benchmarks.md /
benchmarks/README.md entries. ComparativeBenchmark, the JMH benches, the
deterministic probes, and the soak/stress runners stay. Benchmark module
compiles; its 28 tests pass.
---
CHANGELOG.md | 7 ++
benchmarks/README.md | 6 +-
.../demcha/compose/CurrentSpeedBenchmark.java | 4 +-
.../com/demcha/compose/FullCvBenchmark.java | 84 ------------------
.../demcha/compose/GraphComposeBenchmark.java | 79 -----------------
.../demcha/compose/ScalabilityBenchmark.java | 88 -------------------
docs/operations/benchmarks.md | 9 +-
scripts/run-benchmarks.ps1 | 7 +-
8 files changed, 15 insertions(+), 269 deletions(-)
delete mode 100644 benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
delete mode 100644 benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
delete mode 100644 benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19c44ff5f..e9f7124c2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -337,6 +337,13 @@ Entries land here as they merge.
### Internal
+- **Benchmark suite cleanup (not shipped).** Removed three redundant
+ benchmark mains: `FullCvBenchmark` (superseded by the JMH
+ `TemplateCvJmhBenchmark`), `GraphComposeBenchmark` (early-engine relic
+ duplicating `CurrentSpeedBenchmark`'s `engine-simple` scenario), and
+ `ScalabilityBenchmark` (its thread-scaling sweep folded into
+ `CurrentSpeedBenchmark`'s full-profile throughput run, now `1,2,4,8,16`).
+ Dropped the matching `run-benchmarks.ps1` steps and doc entries.
- **Removed the `java.awt.*` / `java.util.*` co-wildcard in four files.**
`InvoiceTemplateComposer`, `ProposalTemplateComposer`,
`WeeklyScheduleTemplateComposer`, and the engine `PdfRenderingSystemECS`
diff --git a/benchmarks/README.md b/benchmarks/README.md
index f6041365c..e232c6e21 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -62,15 +62,12 @@
| File | Role |
|---|---|
| `CurrentSpeedBenchmark` | Default scenario runner — what CI's `perf-smoke` job exercises. Takes a `-Dgraphcompose.benchmark.profile=smoke\|full\|stress` switch. |
-| `ComparativeBenchmark` | Renders the same fixtures through GraphCompose, iText, openHTMLToPDF, JasperReports. **Rough local comparison only** — see "When not to use" above. |
-| `FullCvBenchmark`, `ScalabilityBenchmark` | Fixture-specific runners for CV and table-heavy scenarios. |
-| `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
+| `ComparativeBenchmark` | Renders the same fixtures through GraphCompose, iText, openHTMLToPDF, JasperReports. **Rough local comparison only** — see "When not to use" above. |
+| `CanonicalBenchmarkSupport`, `BenchmarkSupport` | Shared fixture builders + measurement helpers. |
| `BenchmarkReportWriter` | Writes JSON / CSV / text reports under `benchmarks/target/benchmarks/`. |
| `BenchmarkDiffTool` | Compares two JSON reports and prints a delta table. Useful for pre/post comparisons. |
| `BenchmarkMedianTool` | Median + dispersion across N runs of the same scenario. |
| `GraphComposeStressTest`, `EnduranceTest` | Long-running stress / endurance harnesses. |
-| `GraphComposeBenchmark` | Legacy entry point preserved for one downstream caller. New work should target `CurrentSpeedBenchmark`. |
-
## Running
From the repo root:
diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index 2858d64a6..bbda30b8f 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -55,7 +55,9 @@ public final class CurrentSpeedBenchmark {
private static final int DEFAULT_FULL_WARMUP_ITERATIONS = 12;
private static final int DEFAULT_FULL_MEASUREMENT_ITERATIONS = 40;
private static final int DEFAULT_FULL_DOCS_PER_THREAD = 12;
- private static final String DEFAULT_FULL_THREAD_COUNTS = "1,2,4,8";
+ // The 16-thread tier is absorbed from the removed ScalabilityBenchmark so the
+ // full profile keeps a thread-scaling data point (smoke runs no throughput).
+ private static final String DEFAULT_FULL_THREAD_COUNTS = "1,2,4,8,16";
// Bumped from 2/5 to 30/100 so smoke runs reach a steady JIT state and the
// p95 calculation actually has enough samples to interpolate rather than
// collapsing to the maximum observed time. The smoke profile remains the
diff --git a/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
deleted file mode 100644
index c035f96e3..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/FullCvBenchmark.java
+++ /dev/null
@@ -1,84 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.document.api.DocumentSession;
-import com.demcha.compose.document.templates.api.DocumentTemplate;
-import com.demcha.compose.document.templates.cv.presets.ModernProfessional;
-import com.demcha.compose.document.templates.cv.spec.CvSpec;
-import com.demcha.compose.document.theme.BusinessTheme;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.Arrays;
-
-public class FullCvBenchmark {
-
- private static final int WARMUP_ITERATIONS = Integer.getInteger("graphcompose.benchmark.fullCv.warmup", 100);
- private static final int MEASUREMENT_ITERATIONS = Integer.getInteger("graphcompose.benchmark.fullCv.iterations", 500);
-
- public static void main(String[] args) {
- BenchmarkSupport.configureQuietLogging();
- System.out.println("Starting FullCvBenchmark...");
-
- CvSpec cv = CanonicalBenchmarkSupport.canonicalCv();
- DocumentTemplate template = ModernProfessional.create(BusinessTheme.modern());
-
- System.out.println("Warming up JVM (JIT compilation, font cache warmup)...");
- for (int i = 0; i < WARMUP_ITERATIONS; i++) {
- generateCvInMemory(template, cv);
- }
-
- System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
- long[] durationsNs = new long[MEASUREMENT_ITERATIONS];
-
- for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
- long start = System.nanoTime();
- generateCvInMemory(template, cv);
- long end = System.nanoTime();
- durationsNs[i] = end - start;
- }
-
- printStatistics(durationsNs);
- }
-
- private static void generateCvInMemory(DocumentTemplate template, CvSpec cv) {
- try (DocumentSession document = GraphCompose.document()
- .pageSize(com.demcha.compose.document.api.DocumentPageSize.A4)
- .margin(15, 10, 15, 15)
- .create()) {
- template.compose(document, cv);
- document.toPdfBytes();
- } catch (Exception e) {
- throw new RuntimeException("Failed to generate PDF", e);
- }
- }
-
- private static void printStatistics(long[] durationsNs) {
- Arrays.sort(durationsNs);
-
- double[] durationsMs = Arrays.stream(durationsNs).mapToDouble(ns -> ns / 1_000_000.0).toArray();
-
- double min = durationsMs[0];
- double max = durationsMs[durationsMs.length - 1];
- double avg = Arrays.stream(durationsMs).average().orElse(0.0);
- double median = durationsMs[(int) (durationsMs.length * 0.5)];
- double p95 = durationsMs[(int) (durationsMs.length * 0.95)];
- double p99 = durationsMs[(int) (durationsMs.length * 0.99)];
-
- System.out.println("\nBenchmark results (milliseconds):");
- System.out.println("------------------------------------------------");
- System.out.printf("Min time: %.2f ms%n", min);
- System.out.printf("Average time: %.2f ms%n", avg);
- System.out.printf("Median (50%%): %.2f ms (typical response time)%n", median);
- System.out.printf("95th percentile: %.2f ms (95%% of runs finish within this)%n", p95);
- System.out.printf("99th percentile: %.2f ms (rare spikes or GC pressure)%n", p99);
- System.out.printf("Max time: %.2f ms%n", max);
- System.out.println("------------------------------------------------");
-
- if (median < 200) {
- System.out.println("Verdict: Excellent. The engine is very fast for this scenario.");
- } else if (median < 1000) {
- System.out.println("Verdict: Good. This is a healthy speed for complex generation.");
- } else {
- System.out.println("Verdict: Slow enough to investigate with a profiler.");
- }
- }
-}
diff --git a/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
deleted file mode 100644
index f4717e66c..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/GraphComposeBenchmark.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.engine.components.style.Margin;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.Arrays;
-
-public class GraphComposeBenchmark {
-
- private static final int WARMUP_ITERATIONS = Integer.getInteger("graphcompose.benchmark.coreEngine.warmup", 100);
- private static final int MEASUREMENT_ITERATIONS = Integer.getInteger("graphcompose.benchmark.coreEngine.iterations", 500);
-
- public static void main(String[] args) {
- BenchmarkSupport.configureQuietLogging();
- System.out.println("Starting GraphComposeBenchmark...");
-
- System.out.println("Warming up JVM (JIT compilation, font cache warmup)...");
- for (int i = 0; i < WARMUP_ITERATIONS; i++) {
- generateCvInMemory();
- }
-
- System.out.println("Measuring performance (" + MEASUREMENT_ITERATIONS + " iterations)...");
- long[] durationsNs = new long[MEASUREMENT_ITERATIONS];
-
- for (int i = 0; i < MEASUREMENT_ITERATIONS; i++) {
- long start = System.nanoTime();
- generateCvInMemory();
- long end = System.nanoTime();
- durationsNs[i] = end - start;
- }
-
- printStatistics(durationsNs);
- }
-
- private static void generateCvInMemory() {
- try {
- CanonicalBenchmarkSupport.renderSimpleBenchmarkDocument(
- PDRectangle.A4,
- Margin.of(24),
- "CoreEngineRoot",
- "GraphCompose Core Benchmark",
- "Analytical engineer focused on reliable platform design. "
- + "Testing paragraph breaking and layout calculation engine.");
- } catch (Exception e) {
- throw new RuntimeException("Failed to generate PDF", e);
- }
- }
-
- private static void printStatistics(long[] durationsNs) {
- Arrays.sort(durationsNs);
-
- double[] durationsMs = Arrays.stream(durationsNs).mapToDouble(ns -> ns / 1_000_000.0).toArray();
-
- double min = durationsMs[0];
- double max = durationsMs[durationsMs.length - 1];
- double avg = Arrays.stream(durationsMs).average().orElse(0.0);
- double median = durationsMs[(int) (durationsMs.length * 0.5)];
- double p95 = durationsMs[(int) (durationsMs.length * 0.95)];
- double p99 = durationsMs[(int) (durationsMs.length * 0.99)];
-
- System.out.println("\nBenchmark results (milliseconds):");
- System.out.println("------------------------------------------------");
- System.out.printf("Min time: %.2f ms%n", min);
- System.out.printf("Average time: %.2f ms%n", avg);
- System.out.printf("Median (50%%): %.2f ms (typical response time)%n", median);
- System.out.printf("95th percentile: %.2f ms (95%% of runs finish within this)%n", p95);
- System.out.printf("99th percentile: %.2f ms (rare spikes or GC pressure)%n", p99);
- System.out.printf("Max time: %.2f ms%n", max);
- System.out.println("------------------------------------------------");
-
- if (median < 100) {
- System.out.println("Verdict: Excellent. The engine is very fast for this scenario.");
- } else if (median < 500) {
- System.out.println("Verdict: Good. This is a healthy speed for a synchronous REST API.");
- } else {
- System.out.println("Verdict: Slow enough to investigate with a profiler.");
- }
- }
-}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java
deleted file mode 100644
index b8e945ef6..000000000
--- a/benchmarks/src/main/java/com/demcha/compose/ScalabilityBenchmark.java
+++ /dev/null
@@ -1,88 +0,0 @@
-package com.demcha.compose;
-
-import com.demcha.compose.engine.components.style.Margin;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.*;
-
-/**
- * Linear Scalability Test
- * Measures throughput (documents per second) as thread count increases.
- */
-public class ScalabilityBenchmark {
-
- private static final int DOCUMENTS_PER_THREAD = Integer.getInteger("graphcompose.scalability.documentsPerThread", 100);
- private static final int WARMUP_DOCS = Integer.getInteger("graphcompose.scalability.warmupDocs", 100);
- private static final String THREAD_COUNTS = System.getProperty("graphcompose.scalability.threads", "1,2,4,8,16");
-
- public static void main(String[] args) throws Exception {
- BenchmarkSupport.configureQuietLogging();
- System.out.println("Starting Scalability Benchmark: Linear Scalability");
- System.out.println("------------------------------------------------------------");
-
- // Warmup
- for (int i = 0; i < WARMUP_DOCS; i++) {
- generateOne();
- }
-
- int[] threadCounts = parseThreadCounts(THREAD_COUNTS);
- System.out.println(String.format("%-10s | %-15s | %-12s", "Threads", "Total Docs", "Throughput (docs/sec)"));
- System.out.println("------------------------------------------------------------");
-
- for (int threads : threadCounts) {
- runScalabilityTest(threads);
- }
- }
-
- private static void runScalabilityTest(int threads) throws Exception {
- int totalDocs = threads * DOCUMENTS_PER_THREAD;
- ExecutorService executor = Executors.newFixedThreadPool(threads);
-
- long startTime = System.nanoTime();
-
- List<Future<?>> futures = new ArrayList<>();
- for (int i = 0; i < totalDocs; i++) {
- futures.add(executor.submit(() -> {
- try {
- generateOne();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }));
- }
-
- for (Future<?> future : futures) {
- future.get();
- }
-
- long endTime = System.nanoTime();
- executor.shutdown();
- executor.awaitTermination(1, TimeUnit.MINUTES);
-
- double durationSec = (endTime - startTime) / 1_000_000_000.0;
- double throughput = totalDocs / durationSec;
-
- System.out.println(String.format("%-10d | %-15d | %12.2f", threads, totalDocs, throughput));
- }
-
- private static void generateOne() throws Exception {
- CanonicalBenchmarkSupport.renderSimpleBenchmarkDocument(
- PDRectangle.A4,
- Margin.of(24),
- "ScalabilityRoot",
- "Scalability",
- "Scalability test message.");
- }
-
- private static int[] parseThreadCounts(String raw) {
- return Arrays.stream(raw.split(","))
- .map(String::trim)
- .filter(value -> !value.isEmpty())
- .mapToInt(Integer::parseInt)
- .filter(value -> value > 0)
- .toArray();
- }
-}
diff --git a/docs/operations/benchmarks.md b/docs/operations/benchmarks.md
index 315f4d523..775483384 100644
--- a/docs/operations/benchmarks.md
+++ b/docs/operations/benchmarks.md
@@ -36,15 +36,10 @@ The script prints numbered sections so you can map console output to the pipelin
1. `01-build-classpath`
Builds the test classpath once and writes `target/benchmark.classpath`.
2. `02-current-speed`
- Runs `CurrentSpeedBenchmark` in the selected profile.
+ Runs `CurrentSpeedBenchmark` in the selected profile. The full profile also
+ runs the thread-scaling throughput sweep (1 → 16 threads).
3. `03-comparative`
Runs the GraphCompose canonical vs iText 5 vs JasperReports comparison.
-4. `04-core-engine`
- Runs `GraphComposeBenchmark`.
-5. `05-full-cv`
- Runs `FullCvBenchmark`.
-6. `06-scalability`
- Runs the thread-scaling throughput benchmark.
7. `07-stress`
Runs the concurrent stability stress test.
8. `08-endurance`
diff --git a/scripts/run-benchmarks.ps1 b/scripts/run-benchmarks.ps1
index dbe162c08..e3d3947b6 100644
--- a/scripts/run-benchmarks.ps1
+++ b/scripts/run-benchmarks.ps1
@@ -5,8 +5,8 @@ Runs the local GraphCompose benchmark pipeline and stores timestamped logs and r
.DESCRIPTION
The wrapper performs a staged local run:
-01 build classpath, 02 current-speed, 03 comparative, 04 core engine, 05 full CV, 06 scalability,
-07 stress, optional 08 endurance, then 09/10 diff steps.
+01 build classpath, 02 current-speed, 03 comparative, 07 stress,
+optional 08 endurance, then 09/10 diff steps.
Current-speed diffs are profile-aware. The wrapper only compares reports
from the same current-speed profile (`smoke` or `full`) and skips the
@@ -368,9 +368,6 @@ try {
-InputPaths $comparativeRuns | Out-Null
}
- Invoke-JavaMain -Name "04-core-engine" -Classpath $javaClasspath -MainClass "com.demcha.compose.GraphComposeBenchmark"
- Invoke-JavaMain -Name "05-full-cv" -Classpath $javaClasspath -MainClass "com.demcha.compose.FullCvBenchmark"
- Invoke-JavaMain -Name "06-scalability" -Classpath $javaClasspath -MainClass "com.demcha.compose.ScalabilityBenchmark"
Invoke-JavaMain -Name "07-stress" -Classpath $javaClasspath -MainClass "com.demcha.compose.GraphComposeStressTest"
if ($IncludeEndurance) {
From 019f64b32cd23aa44a0694cd43604e11d2c88818 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Sun, 14 Jun 2026 19:26:31 +0100
Subject: [PATCH 02/36] perf(benchmarks): persist compose/layout/render stages
+ a run summary.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The stage breakdown (per-template compose / layout / render medians) was
printed to the console and discarded. Promote it into the report:
runStageBreakdown returns a StageRow, CurrentSpeedReport carries a stages[]
array, and a stages CSV is written — so a diff can attribute a regression to
an engine stage, not just the blended total. Also write a per-run summary.md
(latency + stages + throughput tables) so a reviewer reads one file instead
of the JSON plus several CSVs.
Additive output only: diff/verdict/median read the report by field and ignore
the new array. Benchmark module compiles; 28 tests pass; verified on a smoke
run (stages[] present, summary.md readable, perf gate passes).
---
.../demcha/compose/BenchmarkReportWriter.java | 8 +
.../demcha/compose/CurrentSpeedBenchmark.java | 144 +++++++++++++++---
2 files changed, 131 insertions(+), 21 deletions(-)
diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
index 73e061d3d..51d2b2e42 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkReportWriter.java
@@ -60,6 +60,14 @@ Path writeCsv(String tableName, List<String> headers, List<List<String>> rows) t
return archived;
}
+ Path writeMarkdown(String name, String content) throws IOException {
+ Path latest = directory.resolve("latest-" + name + ".md");
+ Path archived = directory.resolve(name + "-" + timestamp + ".md");
+ Files.writeString(latest, content, StandardCharsets.UTF_8);
+ Files.writeString(archived, content, StandardCharsets.UTF_8);
+ return archived;
+ }
+
Path directory() {
return directory;
}
diff --git a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
index bbda30b8f..e3d877943 100644
--- a/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/CurrentSpeedBenchmark.java
@@ -143,20 +143,21 @@ private void run() throws Exception {
// Stage breakdown: for each template scenario we time compose / layout
// / render separately so consumers can attribute regressions to the
- // engine vs. PDFBox. Engine-simple and feature-rich scenarios also
- // use the canonical pipeline and benefit from the same probe.
+ // engine vs. PDFBox. Only the template scenarios are probed here; the
+ // latency table above still covers every scenario.
+ List<StageRow> stageRows = new ArrayList<>();
if (profile != BenchmarkProfile.SMOKE || config.measurementIterations() >= 20) {
System.out.println();
System.out.println("Stage breakdown (median ms per stage)");
System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
"Scenario", "Compose", "Layout", "Render", "Total");
System.out.println("-".repeat(78));
- runStageBreakdown("invoice-template", () -> openInvoiceSession(),
- s -> invoiceTemplate.compose(s, invoice), config.measurementIterations());
- runStageBreakdown("cv-template", () -> openCvSession(),
- s -> cvTemplate.compose(s, cv), config.measurementIterations());
- runStageBreakdown("proposal-template", () -> openProposalSession(),
- s -> proposalTemplate.compose(s, proposal), config.measurementIterations());
+ stageRows.add(runStageBreakdown("invoice-template", () -> openInvoiceSession(),
+ s -> invoiceTemplate.compose(s, invoice), config.measurementIterations()));
+ stageRows.add(runStageBreakdown("cv-template", () -> openCvSession(),
+ s -> cvTemplate.compose(s, cv), config.measurementIterations()));
+ stageRows.add(runStageBreakdown("proposal-template", () -> openProposalSession(),
+ s -> proposalTemplate.compose(s, proposal), config.measurementIterations()));
}
List<ThroughputRow> throughputRows = new ArrayList<>();
@@ -201,10 +202,13 @@ private void run() throws Exception {
config.docsPerThread(),
config.threadCounts(),
latencyRows,
+ stageRows,
throughputRows,
totalBenchmarkBytes);
System.out.println("Saved JSON benchmark report to " + summary.jsonPath());
- System.out.println("Saved CSV benchmark reports to " + summary.latencyCsvPath() + " and " + summary.throughputCsvPath());
+ System.out.println("Saved CSV benchmark reports to " + summary.latencyCsvPath() + ", "
+ + summary.stagesCsvPath() + ", and " + summary.throughputCsvPath());
+ System.out.println("Saved markdown summary to " + summary.summaryMarkdownPath());
if (enforceGate) {
PerformanceGateResult gateResult = evaluatePerformanceGate(profile, latencyRows);
@@ -363,10 +367,10 @@ private interface SessionComposer {
* median-ms-per-stage row so callers can attribute regressions to
* compose / layout / render independently.
*/
- private void runStageBreakdown(String scenario,
- SessionFactory factory,
- SessionComposer composer,
- int iterations) throws Exception {
+ private StageRow runStageBreakdown(String scenario,
+ SessionFactory factory,
+ SessionComposer composer,
+ int iterations) throws Exception {
int warmup = Math.max(2, Math.min(20, iterations / 5));
for (int i = 0; i < warmup; i++) {
try (DocumentSession session = factory.open()) {
@@ -398,12 +402,13 @@ private void runStageBreakdown(String scenario,
throw new AssertionError();
}
}
+ double composeMs = medianMs(composeNs);
+ double layoutMs = medianMs(layoutNs);
+ double renderMs = medianMs(renderNs);
+ double totalMs = medianMs(totalNs);
System.out.printf("%-18s | %12.3f | %12.3f | %12.3f | %12.3f%n",
- scenario,
- medianMs(composeNs),
- medianMs(layoutNs),
- medianMs(renderNs),
- medianMs(totalNs));
+ scenario, composeMs, layoutMs, renderMs, totalMs);
+ return new StageRow(scenario, round(composeMs), round(layoutMs), round(renderMs), round(totalMs));
}
private static double medianMs(long[] arr) {
@@ -677,16 +682,19 @@ private PathSummary writeReports(BenchmarkReportWriter.BenchmarkArtifacts artifa
int docsPerThread,
int[] threadCounts,
List<LatencyRow> latencyRows,
+ List<StageRow> stageRows,
List<ThroughputRow> throughputRows,
long totalBenchmarkBytes) throws Exception {
+ String timestamp = LocalDateTime.now().format(TIMESTAMP_FORMAT);
CurrentSpeedReport report = new CurrentSpeedReport(
- LocalDateTime.now().format(TIMESTAMP_FORMAT),
+ timestamp,
profileId,
warmupIterations,
measurementIterations,
docsPerThread,
Arrays.stream(threadCounts).boxed().toList(),
latencyRows,
+ stageRows,
throughputRows,
totalBenchmarkBytes);
@@ -717,8 +725,88 @@ private PathSummary writeReports(BenchmarkReportWriter.BenchmarkArtifacts artifa
format(row.docsPerSecond()),
format(row.avgMillisPerDoc())))
.toList());
+ var stagesCsvPath = artifacts.writeCsv(
+ "stages",
+ List.of("scenario", "compose_ms", "layout_ms", "render_ms", "total_ms"),
+ stageRows.stream()
+ .map(row -> List.of(
+ row.scenario(),
+ format(row.composeMillis()),
+ format(row.layoutMillis()),
+ format(row.renderMillis()),
+ format(row.totalMillis())))
+ .toList());
+ var summaryMarkdownPath = artifacts.writeMarkdown(
+ "summary",
+ buildSummaryMarkdown(timestamp, profileId, latencyRows, stageRows,
+ throughputRows, totalBenchmarkBytes));
+
+ return new PathSummary(jsonPath.toString(), latencyCsvPath.toString(),
+ stagesCsvPath.toString(), throughputCsvPath.toString(),
+ summaryMarkdownPath.toString());
+ }
+
+ /**
+ * Renders a single human-readable summary of the run — the latency table,
+ * the per-stage compose/layout/render split (the only place the suite
+ * attributes time to engine stages vs. PDFBox), and the throughput table
+ * when present — so a reviewer reads one file instead of stitching the JSON
+ * and several CSVs together.
+ */
+ private static String buildSummaryMarkdown(String timestamp,
+ String profileId,
+ List<LatencyRow> latencyRows,
+ List<StageRow> stageRows,
+ List<ThroughputRow> throughputRows,
+ long totalBenchmarkBytes) {
+ StringBuilder md = new StringBuilder();
+ md.append("# Current-speed benchmark — ").append(profileId).append(" profile\n\n");
+ md.append('`').append(timestamp).append("`\n\n");
+
+ md.append("## Latency (ms)\n\n");
+ md.append("| Scenario | Avg | p50 | p95 | Max | Docs/s | Avg KB | Peak MB |\n");
+ md.append("|---|---:|---:|---:|---:|---:|---:|---:|\n");
+ for (LatencyRow row : latencyRows) {
+ md.append("| ").append(row.scenario())
+ .append(" | ").append(format(row.avgMillis()))
+ .append(" | ").append(format(row.p50Millis()))
+ .append(" | ").append(format(row.p95Millis()))
+ .append(" | ").append(format(row.maxMillis()))
+ .append(" | ").append(format(row.docsPerSecond()))
+ .append(" | ").append(format(row.avgKilobytes()))
+ .append(" | ").append(format(row.peakHeapMb()))
+ .append(" |\n");
+ }
- return new PathSummary(jsonPath.toString(), latencyCsvPath.toString(), throughputCsvPath.toString());
+ if (!stageRows.isEmpty()) {
+ md.append("\n## Stages — template scenarios (median ms — compose / layout / render)\n\n");
+ md.append("| Scenario | Compose | Layout | Render | Total |\n");
+ md.append("|---|---:|---:|---:|---:|\n");
+ for (StageRow row : stageRows) {
+ md.append("| ").append(row.scenario())
+ .append(" | ").append(format(row.composeMillis()))
+ .append(" | ").append(format(row.layoutMillis()))
+ .append(" | ").append(format(row.renderMillis()))
+ .append(" | ").append(format(row.totalMillis()))
+ .append(" |\n");
+ }
+ }
+
+ if (!throughputRows.isEmpty()) {
+ md.append("\n## Throughput\n\n");
+ md.append("| Threads | Total docs | Docs/s | Avg doc ms |\n");
+ md.append("|---:|---:|---:|---:|\n");
+ for (ThroughputRow row : throughputRows) {
+ md.append("| ").append(row.threads())
+ .append(" | ").append(row.totalDocs())
+ .append(" | ").append(format(row.docsPerSecond()))
+ .append(" | ").append(format(row.avgMillisPerDoc()))
+ .append(" |\n");
+ }
+ }
+
+ md.append("\nByte guard: ").append(totalBenchmarkBytes).append('\n');
+ return md.toString();
}
private static double round(double value) {
@@ -772,6 +860,18 @@ private record ThroughputRow(String scenario,
double avgMillisPerDoc) {
}
+ /**
+ * Per-scenario compose / layout / render split (median ms). Persisted so a
+ * diff can attribute a regression to an engine stage rather than only the
+ * blended total — previously this was printed to the console and discarded.
+ */
+ private record StageRow(String scenario,
+ double composeMillis,
+ double layoutMillis,
+ double renderMillis,
+ double totalMillis) {
+ }
+
private record CurrentSpeedReport(String timestamp,
String profile,
int warmupIterations,
@@ -779,11 +879,13 @@ private record CurrentSpeedReport(String timestamp,
int docsPerThread,
List<Integer> threadCounts,
List<LatencyRow> latency,
+ List<StageRow> stages,
List<ThroughputRow> throughput,
long totalBytes) {
}
- private record PathSummary(String jsonPath, String latencyCsvPath, String throughputCsvPath) {
+ private record PathSummary(String jsonPath, String latencyCsvPath, String stagesCsvPath,
+ String throughputCsvPath, String summaryMarkdownPath) {
}
private record BenchmarkConfig(int warmupIterations,
From 2d2785208a73d5fd4a3337cf63d72b4a869be487 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Sun, 14 Jun 2026 19:37:13 +0100
Subject: [PATCH 03/36] perf(benchmarks): diff consumes stages[] and reports
added/removed scenarios
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
BenchmarkDiffTool now (1) surfaces scenario set changes — addedScenarios /
removedScenarios — instead of silently intersecting, so a newly-added (or
dropped) scenario can no longer vanish from a diff unnoticed; and (2) diffs
the stages[] array, emitting per-scenario compose/layout/render/total percent
deltas (console block + stages-diff CSV) so a regression can be attributed to
an engine stage.
Backward-compatible: a report without stages[] yields an empty stage diff
(MissingNode iterates empty); latency/throughput delta rows stay
intersection-only; the diff report is terminal (median/verdict read producer
reports, not diffs). Adds a DiffToolTest case; 29 bench tests pass.
---
.../com/demcha/compose/BenchmarkDiffTool.java | 100 +++++++++++++++++-
.../demcha/compose/BenchmarkDiffToolTest.java | 61 +++++++++++
2 files changed, 160 insertions(+), 1 deletion(-)
diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
index 9b99d272f..0fb058bf8 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
@@ -93,6 +93,31 @@ private void diffCurrentSpeed(DiffInput input,
signedPercent(row.peakHeapMbDeltaPct()));
}
+ if (!report.addedScenarios().isEmpty() || !report.removedScenarios().isEmpty()) {
+ System.out.println();
+ System.out.println("Scenario set changes");
+ System.out.println(" Added in candidate: "
+ + (report.addedScenarios().isEmpty() ? "(none)" : String.join(", ", report.addedScenarios())));
+ System.out.println(" Removed from baseline: "
+ + (report.removedScenarios().isEmpty() ? "(none)" : String.join(", ", report.removedScenarios())));
+ }
+
+ if (!report.stages().isEmpty()) {
+ System.out.println();
+ System.out.println("Stage diff (pct delta per stage)");
+ System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
+ "Scenario", "Compose pct", "Layout pct", "Render pct", "Total pct");
+ System.out.println("-".repeat(78));
+ for (StageDiff row : report.stages()) {
+ System.out.printf("%-18s | %12s | %12s | %12s | %12s%n",
+ row.scenario(),
+ signedPercent(row.composeDeltaPct()),
+ signedPercent(row.layoutDeltaPct()),
+ signedPercent(row.renderDeltaPct()),
+ signedPercent(row.totalDeltaPct()));
+ }
+ }
+
System.out.println();
System.out.println("Throughput diff");
System.out.printf("%-18s | %8s | %12s | %14s%n",
@@ -143,10 +168,29 @@ private void diffCurrentSpeed(DiffInput input,
format(row.candidateAvgMillisPerDoc()),
format(row.avgMillisPerDocDeltaPct())))
.toList());
+ Path stagesCsv = artifacts.writeCsv(
+ "stages-diff",
+ List.of("scenario", "baseline_compose_ms", "candidate_compose_ms", "compose_delta_pct", "baseline_layout_ms", "candidate_layout_ms", "layout_delta_pct", "baseline_render_ms", "candidate_render_ms", "render_delta_pct", "baseline_total_ms", "candidate_total_ms", "total_delta_pct"),
+ report.stages().stream()
+ .map(row -> List.of(
+ row.scenario(),
+ format(row.baselineComposeMillis()),
+ format(row.candidateComposeMillis()),
+ format(row.composeDeltaPct()),
+ format(row.baselineLayoutMillis()),
+ format(row.candidateLayoutMillis()),
+ format(row.layoutDeltaPct()),
+ format(row.baselineRenderMillis()),
+ format(row.candidateRenderMillis()),
+ format(row.renderDeltaPct()),
+ format(row.baselineTotalMillis()),
+ format(row.candidateTotalMillis()),
+ format(row.totalDeltaPct())))
+ .toList());
System.out.println();
System.out.println("Saved JSON diff report to " + jsonPath);
- System.out.println("Saved CSV diff reports to " + latencyCsv + " and " + throughputCsv);
+ System.out.println("Saved CSV diff reports to " + latencyCsv + ", " + throughputCsv + ", and " + stagesCsv);
}
private void diffComparative(DiffInput input,
@@ -214,6 +258,29 @@ private CurrentSpeedDiffReport buildCurrentSpeedDiff(DiffInput input, JsonNode b
})
.toList();
+ Map baselineStages = indexBy(baseline.path("stages"), "scenario");
+ Map candidateStages = indexBy(candidate.path("stages"), "scenario");
+ List stageDiffs = intersectKeys(baselineStages, candidateStages).stream()
+ .map(key -> {
+ JsonNode before = baselineStages.get(key);
+ JsonNode after = candidateStages.get(key);
+ return new StageDiff(
+ key,
+ before.path("composeMillis").asDouble(),
+ after.path("composeMillis").asDouble(),
+ percentDelta(before.path("composeMillis").asDouble(), after.path("composeMillis").asDouble()),
+ before.path("layoutMillis").asDouble(),
+ after.path("layoutMillis").asDouble(),
+ percentDelta(before.path("layoutMillis").asDouble(), after.path("layoutMillis").asDouble()),
+ before.path("renderMillis").asDouble(),
+ after.path("renderMillis").asDouble(),
+ percentDelta(before.path("renderMillis").asDouble(), after.path("renderMillis").asDouble()),
+ before.path("totalMillis").asDouble(),
+ after.path("totalMillis").asDouble(),
+ percentDelta(before.path("totalMillis").asDouble(), after.path("totalMillis").asDouble()));
+ })
+ .toList();
+
Map baselineThroughput = indexThroughput(baseline.path("throughput"));
Map candidateThroughput = indexThroughput(candidate.path("throughput"));
List throughputDiffs = intersectKeys(baselineThroughput, candidateThroughput).stream()
@@ -237,7 +304,10 @@ private CurrentSpeedDiffReport buildCurrentSpeedDiff(DiffInput input, JsonNode b
input.candidatePath().toString(),
baseline.path("timestamp").asText(),
candidate.path("timestamp").asText(),
+ addedKeys(baselineLatency, candidateLatency),
+ removedKeys(baselineLatency, candidateLatency),
latencyDiffs,
+ stageDiffs,
throughputDiffs
);
}
@@ -294,6 +364,16 @@ private static List intersectKeys(Map left, Map addedKeys(Map baseline, Map candidate) {
+ return candidate.keySet().stream().filter(key -> !baseline.containsKey(key)).sorted().toList();
+ }
+
+ /** Keys present in {@code baseline} but not {@code candidate} (dropped scenarios). */
+ private static List removedKeys(Map baseline, Map candidate) {
+ return baseline.keySet().stream().filter(key -> !candidate.containsKey(key)).sorted().toList();
+ }
+
private static Iterable iterable(JsonNode array) {
return () -> new Iterator<>() {
private final Iterator delegate = array.iterator();
@@ -477,11 +557,29 @@ private record CurrentSpeedThroughputDiff(String scenario,
double avgMillisPerDocDeltaPct) {
}
+ private record StageDiff(String scenario,
+ double baselineComposeMillis,
+ double candidateComposeMillis,
+ double composeDeltaPct,
+ double baselineLayoutMillis,
+ double candidateLayoutMillis,
+ double layoutDeltaPct,
+ double baselineRenderMillis,
+ double candidateRenderMillis,
+ double renderDeltaPct,
+ double baselineTotalMillis,
+ double candidateTotalMillis,
+ double totalDeltaPct) {
+ }
+
private record CurrentSpeedDiffReport(String baselinePath,
String candidatePath,
String baselineTimestamp,
String candidateTimestamp,
+ List addedScenarios,
+ List removedScenarios,
List latency,
+ List stages,
List throughput) {
}
diff --git a/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java b/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
index 783ad2479..d3319131c 100644
--- a/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
+++ b/benchmarks/src/test/java/com/demcha/compose/BenchmarkDiffToolTest.java
@@ -93,6 +93,35 @@ void currentSpeedDiffKeepsOnlyScenariosPresentInBothRuns() throws Exception {
assertThat(diff.path("throughput").get(0).path("scenario").asText()).isEqualTo("shared");
}
+ @Test
+ void currentSpeedDiffSurfacesAddedRemovedScenariosAndStageDeltas() throws Exception {
+ System.setProperty("graphcompose.benchmark.root", tempDir.toString());
+ Path baseline = write("baseline.json", currentSpeedWithStages("full",
+ latency("shared", 10.0, 10.0, 100.0, 1.0, 100.0) + ","
+ + latency("only-in-baseline", 10.0, 10.0, 100.0, 1.0, 100.0),
+ stage("shared", 1.0, 2.0, 4.0, 7.0),
+ throughput("shared", 1, 50.0, 20.0)));
+ Path candidate = write("candidate.json", currentSpeedWithStages("full",
+ latency("shared", 10.0, 10.0, 100.0, 1.0, 100.0) + ","
+ + latency("only-in-candidate", 5.0, 5.0, 200.0, 0.5, 90.0),
+ stage("shared", 1.0, 2.0, 8.0, 11.0),
+ throughput("shared", 1, 50.0, 20.0)));
+
+ BenchmarkDiffTool.main(new String[]{baseline.toString(), candidate.toString()});
+
+ JsonNode diff = readDiff("current-speed");
+ // Loud set-changes: one-sided scenarios are surfaced, not silently dropped.
+ assertThat(toStrings(diff.path("addedScenarios"))).containsExactly("only-in-candidate");
+ assertThat(toStrings(diff.path("removedScenarios"))).containsExactly("only-in-baseline");
+ // The shared scenario is still the only intersected latency delta row.
+ assertThat(diff.path("latency").size()).isEqualTo(1);
+ // Stage diff: render 4 -> 8 = +100%, compose unchanged.
+ JsonNode stageDiff = diff.path("stages").get(0);
+ assertThat(stageDiff.path("scenario").asText()).isEqualTo("shared");
+ assertThat(stageDiff.path("renderDeltaPct").asDouble()).isCloseTo(100.0, within(EPS));
+ assertThat(stageDiff.path("composeDeltaPct").asDouble()).isCloseTo(0.0, within(EPS));
+ }
+
@Test
void currentSpeedDiffTreatsZeroBaselineAsHundredPercentAndZeroToZeroAsZero() throws Exception {
System.setProperty("graphcompose.benchmark.root", tempDir.toString());
@@ -228,6 +257,38 @@ private static String latency(String scenario,
""".formatted(scenario, scenario, avgMillis, p95Millis, docsPerSecond, avgKilobytes, peakHeapMb);
}
+ private static String currentSpeedWithStages(String profile, String latencyItems,
+ String stageItems, String throughputItems) {
+ return """
+ {
+ "timestamp": "2026-04-14 21:00:00",
+ "profile": "%s",
+ "latency": [%s],
+ "stages": [%s],
+ "throughput": [%s]
+ }
+ """.formatted(profile, latencyItems, stageItems, throughputItems);
+ }
+
+ private static String stage(String scenario, double composeMs, double layoutMs,
+ double renderMs, double totalMs) {
+ return """
+ {
+ "scenario": "%s",
+ "composeMillis": %s,
+ "layoutMillis": %s,
+ "renderMillis": %s,
+ "totalMillis": %s
+ }
+ """.formatted(scenario, composeMs, layoutMs, renderMs, totalMs);
+ }
+
+    private static java.util.List<String> toStrings(JsonNode array) { // text value of each element, in array order
+        java.util.List<String> values = new java.util.ArrayList<>();
+        array.forEach(node -> values.add(node.asText()));
+        return values;
+    }
+
private static String throughput(String scenario, int threads, double docsPerSecond, double avgMillisPerDoc) {
return """
{
From faec9e3f23c02eb54e2fa5fa5d6ab9fc94d1ae9c Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Sun, 14 Jun 2026 19:55:24 +0100
Subject: [PATCH 04/36] perf(benchmarks): add SVG-import feature benches (parse
/ read / node)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
First feature-object benchmarks for the v1.8 vector surface (the rest of the
suite is text/table only):
- SvgJmhBenchmark (forked JMH): SvgPath.parse of a real Material heart d,
SvgIcon.parse of a multi-layer icon, SvgIcon.node on a pre-parsed icon.
- SvgParseAllocProbe (deterministic ThreadMXBean alloc, median of 11): KB/op
for the same three operations.
- SvgBenchmarkFixtures: the heart d (vendored — the benchmark module can't
reach the test/example copies) and a synthetic multi-layer icon (gradient
bg + transformed groups + stroked curves) within the reader's supported
subset, so it always parses.
Run on demand, not per-PR: java -jar benchmarks/target/benchmarks.jar Svg.
Verified: compiles; both benches run — path parse ~3.6 us/op, icon read
~308 us/op (DOM-parse dominated, 114 KB/op), node build ~0.4 us/op / 2 KB/op.
---
.../demcha/compose/SvgBenchmarkFixtures.java | 55 +++++++++++
.../demcha/compose/SvgParseAllocProbe.java | 93 ++++++++++++++++++
.../demcha/compose/jmh/SvgJmhBenchmark.java | 97 +++++++++++++++++++
3 files changed, 245 insertions(+)
create mode 100644 benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
create mode 100644 benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
diff --git a/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
new file mode 100644
index 000000000..120741433
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/SvgBenchmarkFixtures.java
@@ -0,0 +1,55 @@
+package com.demcha.compose;
+
+/**
+ * Shared SVG fixtures for the v1.8 vector-import benchmarks (path parse, whole
+ * icon read, icon → node build).
+ *
+ * <p>Self-contained on purpose: the benchmarks module cannot reach the
+ * main-module test constants or the examples module, so the heart path is
+ * vendored here (it also lives in {@code SvgPathTest} / {@code VectorPathExample}
+ * in their own modules). The icon is a synthetic but realistic multi-layer
+ * document — a gradient-filled background, a {@code translate}+{@code scale}
+ * group of filled paths and a stroked circle, and a {@code rotate} group with a
+ * polygon and a quadratic-curve stroke — so it exercises XML parse, {@code <g>}
+ * transform accumulation, gradient resolution and per-layer path lowering the
+ * way a real exporter file would, while staying entirely within the reader's
+ * supported subset (so it never throws).
+ *
+ * @author Artem Demchyshyn
+ */
+public final class SvgBenchmarkFixtures {
+
+ /** Material "favorite" heart — the same {@code d} used in the SVG tests/examples. */
+ public static final String MATERIAL_HEART_D =
+ "M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 2 5.42 4.42 3 7.5 3"
+ + "c1.74 0 3.41.81 4.5 2.09C13.09 3.81 14.76 3 16.5 3 19.58 3 22 5.42 22 8.5"
+ + "c0 3.78-3.4 6.86-8.55 11.54L12 21.35z";
+
+ /** Heart viewBox edge (square 24×24), passed to {@code SvgPath.parse}. */
+ public static final double HEART_VIEWBOX = 24.0;
+
+ /** A realistic multi-layer icon: gradient bg + transformed groups + stroked curves. */
+ public static final String MULTI_LAYER_ICON_SVG = """
+
+ """;
+
+ private SvgBenchmarkFixtures() {
+ }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java b/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
new file mode 100644
index 000000000..b8df62a2b
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/SvgParseAllocProbe.java
@@ -0,0 +1,93 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.svg.SvgIcon;
+import com.demcha.compose.document.svg.SvgPath;
+
+import java.lang.management.ManagementFactory;
+import java.util.Arrays;
+import java.util.function.Supplier;
+
+/**
+ * Deterministic allocation probe for the v1.8 SVG-import path: warm
+ * (JIT-steady) bytes allocated per {@link SvgPath#parse}, per
+ * {@link SvgIcon#parse}, and per {@link SvgIcon#node} — the three operations
+ * with no analogue in the rest of the suite (which is text / table only).
+ *
+ * <p>Allocation counts are noise-free (unlike wall-clock or {@code peakHeapMb}),
+ * so this is the signal the "optimize the engine, not benchmarks" rule wants:
+ * a develop-vs-branch A/B shows a parse/read/node allocation change directly.
+ * No {@code src/main} changes.
The current-speed per-stage breakdown ({@code stages[]}) is not
+ * carried into the median aggregate — only latency and throughput are medianed.
+ * A median-vs-median diff therefore shows no compose/layout/render stage deltas;
+ * diff a single-run pair when you need stage attribution.
*/
public final class BenchmarkMedianTool {
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
index f7a63b30c..58ed3f99f 100644
--- a/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/SvgJmhBenchmark.java
@@ -24,7 +24,8 @@
* {@code DocumentSession}, no PDF render):
*
*
{@code parseSvgPath} — {@link SvgPath#parse} of a real Material icon
- * {@code d} string (arc→cubic conversion, normalization).
{@code readSvgIcon} — {@link SvgIcon#parse} of a multi-layer icon (XML
* parse, {@code <g>} transform accumulation, gradient resolution, one
* {@link SvgPath} per layer).
diff --git a/docs/operations/benchmarks.md b/docs/operations/benchmarks.md
index 775483384..3611d877e 100644
--- a/docs/operations/benchmarks.md
+++ b/docs/operations/benchmarks.md
@@ -40,6 +40,10 @@ The script prints numbered sections so you can map console output to the pipelin
runs the thread-scaling throughput sweep (1 → 16 threads).
3. `03-comparative`
Runs the GraphCompose canonical vs iText 5 vs JasperReports comparison.
+
+ _Steps 04–06 (`core-engine`, `full-cv`, `scalability`) were retired. The
+ surviving steps keep their original `NN-` console prefixes, so the labels
+ jump from `03-` to `07-`._
7. `07-stress`
Runs the concurrent stability stress test.
8. `08-endurance`
diff --git a/docs/operations/performance.md b/docs/operations/performance.md
index ecf02c5b7..7fc02d480 100644
--- a/docs/operations/performance.md
+++ b/docs/operations/performance.md
@@ -1,7 +1,13 @@
# Performance — v1.4 numbers
-All numbers below come from `scripts/run-benchmarks.ps1` — the full local
-benchmark workflow that builds the test classpath once and runs
+> **Historical snapshot (v1.4).** The numbers and suite list below are frozen
+> as captured for v1.4 and are kept for reference. The pipeline has since
+> changed: the `core-engine`, `full-cv`, and `scalability` suites were retired,
+> and current numbers come from the `current-speed` / `comparative` / `stress`
+> pipeline plus the JMH suite. See [docs/operations/benchmarks.md](./benchmarks.md).
+
+All numbers below were captured from `scripts/run-benchmarks.ps1` — the full
+local benchmark workflow that built the test classpath once and ran
`current-speed`, `comparative`, `core-engine`, `full-cv`, `scalability`,
and `stress` suites in sequence. They were captured on a developer
laptop; CI machines are typically 1.5–2× slower. The benchmark
@@ -93,5 +99,9 @@ snapshots.
## Engine-only timings
+_The `GraphComposeBenchmark` and `FullCvBenchmark` mains below were retired
+after v1.4. Equivalent timings now come from the `CurrentSpeedBenchmark`
+`engine-simple` scenario and the JMH `TemplateCvJmhBenchmark`._
+
- `GraphComposeBenchmark` (engine-only, no PDF render): avg **1.04 ms**, p50 **0.97 ms**, p95 **1.64 ms**.
- `FullCvBenchmark` (full CV template, including render): avg **4.14 ms**, p50 **3.80 ms**, p95 **6.37 ms**.
diff --git a/scripts/ab-bench.ps1 b/scripts/ab-bench.ps1
index 5a3e4eb42..a237ec203 100644
--- a/scripts/ab-bench.ps1
+++ b/scripts/ab-bench.ps1
@@ -110,21 +110,10 @@ function Parse-Comparative($jsonPath) {
}
function Parse-Logs($logsDir) {
$o = @{}
- $scal = Join-Path $logsDir "06-scalability.log"
- if (Test-Path $scal) {
- foreach ($line in (Get-Content $scal)) {
- if ($line -match '^\s*(\d+)\s*\|\s*\d+\s*\|\s*([\d.]+)\s*$') {
- $o["scalability | $($matches[1])t | docs/s"] = [double]$matches[2]
- }
- }
- }
- foreach ($pair in @(@("04-core-engine.log", "core-engine"), @("05-full-cv.log", "full-cv"))) {
- $p = Join-Path $logsDir $pair[0]
- if (Test-Path $p) {
- $txt = Get-Content $p -Raw
- if ($txt -match 'Median[^\r\n]*?:\s*([\d.]+)\s*ms') { $o["$($pair[1]) | median ms"] = [double]$matches[1] }
- }
- }
+ # Steps 04-06 (core-engine, full-cv, scalability) were retired, so their logs
+ # are no longer produced. Current-speed throughput — including the
+ # thread-scaling series — is read from the JSON report by Parse-CurrentSpeed;
+ # only the surviving stress log is parsed here.
$stress = Join-Path $logsDir "07-stress.log"
if (Test-Path $stress) {
$txt = Get-Content $stress -Raw
diff --git a/scripts/run-benchmarks.ps1 b/scripts/run-benchmarks.ps1
index e3d3947b6..a0dd2c777 100644
--- a/scripts/run-benchmarks.ps1
+++ b/scripts/run-benchmarks.ps1
@@ -6,7 +6,9 @@ Runs the local GraphCompose benchmark pipeline and stores timestamped logs and r
.DESCRIPTION
The wrapper performs a staged local run:
01 build classpath, 02 current-speed, 03 comparative, 07 stress,
-optional 08 endurance, then 09/10 diff steps.
+optional 08 endurance, then 09/10 diff and 11 verdict steps. Steps 04-06
+(core-engine, full-cv, scalability) were retired; the surviving steps keep
+their original numeric prefixes, so the numbering jumps from 03 to 07.
Current-speed diffs are profile-aware. The wrapper only compares reports
from the same current-speed profile (`smoke` or `full`) and skips the
From b93c44ec62ce1a386889302cec4383f3b3f31405 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Sun, 14 Jun 2026 23:15:51 +0100
Subject: [PATCH 10/36] docs(changelog): note the v1.8 feature-object benches,
stage output, and gate coverage
---
CHANGELOG.md | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e9f7124c2..6cb0e7074 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -344,6 +344,28 @@ Entries land here as they merge.
`ScalabilityBenchmark` (its thread-scaling sweep folded into
`CurrentSpeedBenchmark`'s full-profile throughput run, now `1,2,4,8,16`).
Dropped the matching `run-benchmarks.ps1` steps and doc entries.
+- **Feature-object benchmarks for the v1.8 vector surface (not shipped).**
+ The suite previously exercised only text/table primitives. Added JMH render
+ benches and deterministic probes over the new vector features:
+ `SvgJmhBenchmark` (path parse / whole-file icon read / icon→node) plus a
+ `SvgParseAllocProbe`; `ChartJmhBenchmark` (bar + line + pie render) plus a
+ `ChartAllocProbe` (layout-compile allocation); `VectorRenderOperatorProbe`
+ (the same paths drawn flat vs. gradient vs. translucent, counted as PDF
+ content-stream operators); `IconRampJmhBenchmark` (icon-placement scaling,
+ `@Param` 8/32/128); and `MixedShowcaseJmhBenchmark` (one document combining
+ prose, inline sparklines, bar + pie charts, SVG icons and a gradient path).
+ Shared `SvgBenchmarkFixtures` / `ChartBenchmarkFixtures` hold the inputs so
+ each bench and its probe measure identical data.
+- **Current-speed report carries a stage breakdown and a run summary (not
+ shipped).** `CurrentSpeedBenchmark` persists a per-scenario compose / layout /
+ render split (`stages[]`, median ms) to the JSON and a `stages` CSV, and
+ writes a readable `summary.md`. `BenchmarkDiffTool` consumes `stages[]`,
+ prints a per-stage delta table, and reports the scenarios added/removed
+ between two runs.
+- **Every current-speed scenario is now covered by the smoke perf gate (not
+ shipped).** The `long-token` scenario previously had no SMOKE threshold and
+ silently escaped the gate; it now has one, and `CurrentSpeedScenarioGateTest`
+ fails the build if any scenario lacks a threshold.
- **Removed the `java.awt.*` / `java.util.*` co-wildcard in four files.**
`InvoiceTemplateComposer`, `ProposalTemplateComposer`,
`WeeklyScheduleTemplateComposer`, and the engine `PdfRenderingSystemECS`
From 7e74b555ff0015ac9a5fe750efc1d020c9ba7ac2 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Sun, 14 Jun 2026 23:35:01 +0100
Subject: [PATCH 11/36] fix(benchmarks): widen comparative-diff Library column
so GraphCompose Canonical fits
The comparative-diff table printed the Library column as %-20s, but "GraphCompose Canonical" is 22 chars, so it overflowed the field and pushed the | separator right, misaligning that row. Widen to %-24s (matching the comparative run table in ComparativeBenchmark) and extend the rule to 56 so the column fits the longest library label.
---
.../src/main/java/com/demcha/compose/BenchmarkDiffTool.java | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
index 0fb058bf8..ce99ce16e 100644
--- a/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
+++ b/benchmarks/src/main/java/com/demcha/compose/BenchmarkDiffTool.java
@@ -200,11 +200,11 @@ private void diffComparative(DiffInput input,
ComparativeDiffReport report = buildComparativeDiff(input, baseline, candidate);
System.out.println("Comparative diff");
- System.out.printf("%-20s | %12s | %12s%n",
+ System.out.printf("%-24s | %12s | %12s%n",
"Library", "Time pct", "Heap pct");
- System.out.println("-".repeat(52));
+ System.out.println("-".repeat(56));
for (ComparativeLibraryDiff row : report.libraries()) {
- System.out.printf("%-20s | %12s | %12s%n",
+ System.out.printf("%-24s | %12s | %12s%n",
row.library(),
signedPercent(row.avgTimeDeltaPct()),
signedPercent(row.avgHeapDeltaPct()));
From 87ebe8400c2eada70f21b2223b5761f55089de7d Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Mon, 15 Jun 2026 09:57:47 +0100
Subject: [PATCH 12/36] perf(benchmarks): add image embed/scale coverage + a
PdfImageCache reuse gate
The suite had no image coverage at all: no bench or probe placed a raster image, so the embed/scale hot path and PdfImageCache dedup could regress unmeasured.
ImageBenchmarkFixtures builds deterministic in-code synthetic PNGs (a shared demoImage plus distinctImage(i)), so no binary asset is committed. ImageCacheOperatorProbe places one image N times vs N distinct images and counts embedded image XObjects + Do draws (same image x30 -> 1 embed/30 draws; 30 distinct -> 30/30). ImageCacheGateTest turns that reuse invariant into a build-failing assertion (1 embed for the same image regardless of placements; N for N distinct), so a dedup regression cannot pass silently. ImageJmhBenchmark renders a 12-image thumbnail document, driving the ImageIO decode + bicubic rescale + embed path that nothing else exercised.
---
.../compose/ImageBenchmarkFixtures.java | 90 +++++++++++++
.../compose/ImageCacheOperatorProbe.java | 119 ++++++++++++++++++
.../demcha/compose/jmh/ImageJmhBenchmark.java | 92 ++++++++++++++
.../demcha/compose/ImageCacheGateTest.java | 49 ++++++++
4 files changed, 350 insertions(+)
create mode 100644 benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
create mode 100644 benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
create mode 100644 benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java
diff --git a/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
new file mode 100644
index 000000000..c9f95b739
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ImageBenchmarkFixtures.java
@@ -0,0 +1,90 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.image.DocumentImageData;
+
+import javax.imageio.ImageIO;
+import java.awt.BasicStroke;
+import java.awt.Color;
+import java.awt.GradientPaint;
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+
+/**
+ * Deterministic synthetic raster fixtures for the image embed/scale benches and
+ * the {@code PdfImageCache} reuse gate.
+ *
+ * <p>The images are generated in code (a fixed gradient placeholder, a few KB
+ * each) so the suite needs no committed binary asset and the bytes — hence the
+ * cache fingerprint — are stable. {@link #demoImage()} returns the same logical
+ * image every call; {@link #distinctImage(int)} returns visually distinct images
+ * with distinct fingerprints, to exercise the distinct-embed path.
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ImageBenchmarkFixtures {
+
+ /** Native pixel width of every generated fixture. */
+ public static final int NATIVE_WIDTH_PX = 360;
+ /** Native pixel height of every generated fixture. */
+ public static final int NATIVE_HEIGHT_PX = 200;
+
+ /**
+ * Draw size (points) that keeps the original-embed path: at 144 DPI this is
+ * a {@code 360x200 px} target, i.e. > 50% of native, so {@code PdfImageCache}
+ * does not build a downscaled variant and the embed count stays at one.
+ */
+ public static final double DRAW_WIDTH_PT = 180.0;
+ /** Companion draw height (points) for {@link #DRAW_WIDTH_PT}. */
+ public static final double DRAW_HEIGHT_PT = 100.0;
+
+ private ImageBenchmarkFixtures() {
+ }
+
+ /**
+ * One fixed gradient placeholder. Returns equal bytes every call, so all
+ * placements share a fingerprint and the cache treats them as one image.
+ *
+ * @return the shared demo image descriptor
+ */
+ public static DocumentImageData demoImage() {
+ return DocumentImageData.fromBytes(pngBytes(0));
+ }
+
+ /**
+ * The {@code index}-th of a family of visually distinct images, each with a
+ * distinct fingerprint so the cache embeds each one separately.
+ *
+ * @param index variant index in {@code 0..598}; the pixels repeat with period 600 and index 599 reproduces {@link #demoImage()}
+ * @return a distinct image descriptor
+ */
+ public static DocumentImageData distinctImage(int index) {
+ return DocumentImageData.fromBytes(pngBytes(index + 1));
+ }
+
+    private static byte[] pngBytes(int seed) { // paints the seed-tinted gradient placeholder and encodes it as PNG
+        BufferedImage image = new BufferedImage(NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX, BufferedImage.TYPE_INT_RGB);
+        Graphics2D g = image.createGraphics();
+        try {
+            int r = 20 + Math.floorMod(seed * 23, 200); // floorMod keeps r in 20..219 even when seed * 23 overflows int
+            int b = 95 + Math.floorMod(seed * 17, 150); // floorMod keeps b in 95..244 even when seed * 17 overflows int
+            g.setPaint(new GradientPaint(0, 0, new Color(r, 45, 80),
+                    NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX, new Color(20, 80, b)));
+            g.fillRect(0, 0, NATIVE_WIDTH_PX, NATIVE_HEIGHT_PX);
+            g.setPaint(new Color(196, 153, 76));
+            g.setStroke(new BasicStroke(6f));
+            g.drawLine(0, 170, NATIVE_WIDTH_PX, 110 - (seed % 40));
+        } finally {
+            g.dispose(); // always release the Graphics2D, even if painting throws
+        }
+        ByteArrayOutputStream png = new ByteArrayOutputStream();
+        try {
+            ImageIO.write(image, "png", png);
+        } catch (IOException e) {
+            throw new UncheckedIOException("Failed to encode synthetic benchmark image", e);
+        }
+        return png.toByteArray();
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
new file mode 100644
index 000000000..6e8d84847
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/ImageCacheOperatorProbe.java
@@ -0,0 +1,119 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.image.DocumentImageData;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.pdfparser.PDFStreamParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Consumer;
+
+/**
+ * Deterministic content-stream probe for the {@code PdfImageCache} dedup path:
+ * the same raster image is placed {@code N} times and counted against {@code N}
+ * distinct images, so the embed structure isolates exactly what the cache saves.
+ *
+ * <p>Placing one logical image {@code N} times must embed a single image XObject
+ * (referenced by {@code N} {@code Do} draws), while {@code N} distinct images must
+ * embed {@code N} XObjects. Counting the distinct image XObjects in the output PDF
+ * proves the cache reuses by fingerprint and catches a regression where embeds
+ * scale with placements (PDF bloat). Byte-deterministic — no A/B build needed.
+ * The image render/scale hot path is also entirely uncovered without this and the
+ * companion {@code ImageJmhBenchmark}.
+ *
+ * @author Artem Demchyshyn
+ */
+public final class ImageCacheOperatorProbe {
+
+ private static final int PLACEMENTS = 30;
+
+ /** Distinct image XObjects embedded in a PDF, and the number of {@code Do} draws. */
+ record EmbedCounts(int embeds, int draws) {
+ }
+
+ public static void main(String[] args) throws Exception {
+ BenchmarkSupport.configureQuietLogging();
+
+ System.out.println("GraphCompose image-cache embed probe (" + PLACEMENTS + " placements each)");
+ System.out.printf("%-22s | %8s | %8s%n", "Mode", "Embeds", "Draws");
+ System.out.println("-".repeat(44));
+ report("same image x N", countPdf(renderSameImage(PLACEMENTS)));
+ report("N distinct images", countPdf(renderDistinctImages(PLACEMENTS)));
+ System.out.println();
+ System.out.println("Embeds = distinct image XObjects in the PDF, Draws = Do operators. "
+ + "PdfImageCache must hold embeds at 1 for the same image regardless of placements; "
+ + "distinct images embed once each.");
+ }
+
+ private static void report(String mode, EmbedCounts counts) {
+ System.out.printf("%-22s | %8d | %8d%n", mode, counts.embeds(), counts.draws());
+ }
+
+ /** Renders {@code count} placements of one shared image (cache should embed it once). */
+ static byte[] renderSameImage(int count) throws Exception {
+ DocumentImageData image = ImageBenchmarkFixtures.demoImage();
+ return render(flow -> {
+ for (int i = 0; i < count; i++) {
+ flow.addImage(spec -> spec.source(image)
+ .size(ImageBenchmarkFixtures.DRAW_WIDTH_PT, ImageBenchmarkFixtures.DRAW_HEIGHT_PT));
+ }
+ });
+ }
+
+ /** Renders {@code count} distinct images (cache embeds each once). */
+ static byte[] renderDistinctImages(int count) throws Exception {
+ return render(flow -> {
+ for (int i = 0; i < count; i++) {
+ DocumentImageData image = ImageBenchmarkFixtures.distinctImage(i);
+ flow.addImage(spec -> spec.source(image)
+ .size(ImageBenchmarkFixtures.DRAW_WIDTH_PT, ImageBenchmarkFixtures.DRAW_HEIGHT_PT));
+ }
+ });
+ }
+
+    private static byte[] render(Consumer<PageFlowBuilder> author) throws Exception { // author adds the images to the flow
+        try (DocumentSession session = GraphCompose.document()
+                .pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
+            session.pageFlow(flow -> {
+                flow.name("ImageCacheProbe").spacing(8);
+                author.accept(flow);
+            });
+            return session.toPdfBytes();
+        }
+    }
+
+    /** Counts distinct embedded image XObjects (by COS identity) and {@code Do} draws. */
+    static EmbedCounts countPdf(byte[] pdf) throws IOException {
+        try (PDDocument document = Loader.loadPDF(pdf)) {
+            Set<COSBase> embeds = Collections.newSetFromMap(new IdentityHashMap<>()); // identity set: one entry per COS object
+            int draws = 0;
+            for (PDPage page : document.getPages()) {
+                for (var name : page.getResources().getXObjectNames()) {
+                    PDXObject xobject = page.getResources().getXObject(name);
+                    if (xobject instanceof PDImageXObject image) {
+                        embeds.add(image.getCOSObject());
+                    }
+                }
+                List<Object> tokens = new PDFStreamParser(page).parse();
+                for (Object token : tokens) {
+                    if (token instanceof Operator operator && "Do".equals(operator.getName())) {
+                        draws++;
+                    }
+                }
+            }
+            return new EmbedCounts(embeds.size(), draws);
+        }
+    }
+}
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
new file mode 100644
index 000000000..2b05b1d09
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ImageJmhBenchmark.java
@@ -0,0 +1,92 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.ImageBenchmarkFixtures;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.dsl.PageFlowBuilder;
+import com.demcha.compose.document.image.DocumentImageData;
+import com.demcha.compose.document.style.DocumentInsets;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.IntStream;
+
+/**
+ * Strict JMH micro-benchmark: end-to-end render of an image-heavy document — a
+ * dozen distinct raster images placed at thumbnail size — to PDF bytes. Drawing
+ * below 50% of native resolution drives {@code PdfImageCache}'s downscale path
+ * ({@code ImageIO} decode + bicubic rescale + re-encode + embed), so this covers
+ * the raster embed/scale hot path that no other bench touches.
+ *
+ *
+ *
+ * @author Artem Demchyshyn
+ */
+@BenchmarkMode({Mode.AverageTime, Mode.Throughput})
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 2)
+@Fork(1)
+public class ImageJmhBenchmark {
+
+ private static final int IMAGES = 12;
+
+ /** Distinct images built once in setup; the bench measures render, not image synthesis. */
+ private List<DocumentImageData> images;
+
+ @Setup
+ public void setUp() { // builds one fixture per index in [0, IMAGES) before measurement starts
+ images = IntStream.range(0, IMAGES)
+ .mapToObj(ImageBenchmarkFixtures::distinctImage)
+ .toList();
+ }
+
+ /**
+ * Renders the image-heavy document to PDF bytes.
+ *
+ * @param blackhole JMH sink
+ * @throws Exception if rendering fails
+ */
+ @Benchmark
+ public void renderImageDocument(Blackhole blackhole) throws Exception {
+ try (DocumentSession document = GraphCompose.document()
+ .pageSize(DocumentPageSize.A4)
+ .margin(DocumentInsets.of(28))
+ .create()) {
+ PageFlowBuilder flow = document.pageFlow().name("ImageBenchmark").spacing(8);
+ for (DocumentImageData image : images) {
+ // 60x33 pt -> ~120x66 px target at 144 DPI, i.e. <50% of the
+ // 360x200 native, so the cache builds a downscaled variant.
+ flow.addImage(spec -> spec.source(image).size(60, 33));
+ }
+ flow.build();
+ blackhole.consume(document.toPdfBytes()); // hand the bytes to the JMH sink so the render result is consumed
+ }
+ }
+
+ /**
+ * Runs the JMH harness over this benchmark.
+ *
+ * @param args JMH CLI arguments
+ * @throws Exception if the JMH runner fails
+ */
+ public static void main(String[] args) throws Exception {
+ org.openjdk.jmh.Main.main(args);
+ }
+}
diff --git a/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java b/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java
new file mode 100644
index 000000000..e28a2d9c9
--- /dev/null
+++ b/benchmarks/src/test/java/com/demcha/compose/ImageCacheGateTest.java
@@ -0,0 +1,49 @@
+package com.demcha.compose;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Deterministic regression gate for {@code PdfImageCache} dedup, driving
+ * {@link ImageCacheOperatorProbe}'s render + count helpers.
+ *
+ * <p>The cache keys embedded image XObjects by content fingerprint, so the same
+ * image placed many times must embed once (referenced by many draws) while
+ * distinct images embed once each. Counting the embedded XObjects in the output
+ * PDF makes that structural invariant a build-failing assertion — a regression
+ * that re-embeds the same image per placement (PDF bloat) breaks this test
+ * rather than silently passing CI.
+ */
+class ImageCacheGateTest {
+
+ @Test
+ void sameImageEmbedsOnceRegardlessOfPlacements() throws Exception {
+ int placements = 30;
+
+ ImageCacheOperatorProbe.EmbedCounts counts =
+ ImageCacheOperatorProbe.countPdf(ImageCacheOperatorProbe.renderSameImage(placements));
+
+ assertThat(counts.embeds())
+ .as("the same image placed %d times must embed exactly one XObject", placements)
+ .isEqualTo(1);
+ assertThat(counts.draws())
+ .as("each placement must still draw the cached image")
+ .isGreaterThanOrEqualTo(placements); // lower bound: countPdf tallies every Do operator, not only image draws
+ }
+
+ @Test
+ void distinctImagesEachEmbedOnce() throws Exception {
+ int distinct = 8;
+
+ ImageCacheOperatorProbe.EmbedCounts counts =
+ ImageCacheOperatorProbe.countPdf(ImageCacheOperatorProbe.renderDistinctImages(distinct));
+
+ assertThat(counts.embeds())
+ .as("%d distinct images must embed %d XObjects (no over-dedup)", distinct, distinct)
+ .isEqualTo(distinct);
+ assertThat(counts.draws())
+ .as("each distinct image must be drawn")
+ .isGreaterThanOrEqualTo(distinct); // lower bound for the same reason: every Do operator is counted
+ }
+}
From 14390d5372eaeafbbc74df578ac5699882e19844 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Mon, 15 Jun 2026 10:01:54 +0100
Subject: [PATCH 13/36] perf(benchmarks): run the deterministic benchmark gates
in CI + add a render-operator gate
The deterministic probes produce machine-independent counts, but nothing asserted on them and the benchmarks module's tests never ran in CI (perf-smoke used -DskipTests; the root verify skips the standalone module), so an operator-count or cache regression passed CI silently.
Add a 'Run deterministic benchmark gates' step to the PR-triggered perf-smoke job (./mvnw -f benchmarks/pom.xml test) so the image-cache reuse gate, the scenario/threshold coverage gate, and the diff-tooling tests now fail the build on a structural regression. Refactor RenderOperatorProbe to expose countOperators(...) and add RenderOperatorGateTest, which pins the F5 coalescing invariant: a long single-style paragraph keeps Tf/colour ops below the per-line text-draw count, so a regression back to per-span font ops breaks the test. Probe console output is unchanged.
---
.github/workflows/ci.yml | 14 +++++++
.../demcha/compose/RenderOperatorProbe.java | 28 +++++++++++--
.../compose/RenderOperatorGateTest.java | 41 +++++++++++++++++++
3 files changed, 79 insertions(+), 4 deletions(-)
create mode 100644 benchmarks/src/test/java/com/demcha/compose/RenderOperatorGateTest.java
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 31ce987b2..c2cf8a7d2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -208,6 +208,12 @@ jobs:
- name: Compile benchmarks module
run: ./mvnw -B -ntp -f benchmarks/pom.xml clean compile
+ - name: Run deterministic benchmark gates
+ # Fast, machine-independent unit/gate tests (image-cache reuse,
+ # render-operator coalescing, scenario/threshold coverage, diff tooling).
+ # Catches structural regressions the timing smoke run cannot.
+ run: ./mvnw -B -ntp -f benchmarks/pom.xml test
+
- name: Run coarse performance smoke benchmark
run: |
./mvnw -B -ntp -f benchmarks/pom.xml -DskipTests \
@@ -223,6 +229,14 @@ jobs:
path: benchmarks/target/benchmarks/current-speed/**
if-no-files-found: ignore
+ - name: Upload benchmark gate reports
+ if: always()
+ uses: actions/upload-artifact@v7
+ with:
+ name: benchmark-gate-reports-${{ github.run_id }}
+ path: benchmarks/target/surefire-reports/**
+ if-no-files-found: ignore
+
benchmark-diff:
name: Weekly Benchmark Diff
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
diff --git a/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java b/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
index 94cafb25e..016f4ea9e 100644
--- a/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
+++ b/benchmarks/src/main/java/com/demcha/compose/RenderOperatorProbe.java
@@ -70,6 +70,29 @@ public static void main(String[] args) throws Exception {
}
private static void report(String scenario, Consumer author) throws Exception {
+ OpCounts counts = countOperators(author); // render once and tally the operators; this method only formats the row
+ int saved = Math.max(0, counts.draws() - counts.tf()) + Math.max(0, counts.draws() - counts.rg()); // each term is clamped at zero separately
+ double reduction = counts.draws() == 0 ? 0 // no text drawn: print 0% instead of a 0/0 NaN
+ : 100.0 * (2.0 * counts.draws() - counts.tf() - counts.rg()) / (2.0 * counts.draws()); // share of a 1:1 baseline (one tf + one rg per draw) that is absent; unlike saved, not clamped
+ System.out.printf("%-22s | %8d | %8d | %8d | %12d | %8.1f%%%n",
+ scenario, counts.draws(), counts.tf(), counts.rg(), saved, reduction);
+ }
+
+ /** Operator tallies for one rendered document: {@code draws} = text-show ({@code Tj}/{@code TJ}), {@code tf} = {@code setFont} ({@code Tf}), {@code rg} = non-stroking-colour ops. */
+ record OpCounts(int draws, int tf, int rg) {
+ }
+
+ /**
+ * Renders {@code author} and counts the text-show, font and colour operators.
+ * Exposed (package-visible) so {@code RenderOperatorGateTest} can pin the F5
+ * coalescing invariant: post-F5 the font/colour ops no longer scale 1:1 with
+ * text draws, so {@code tf} and {@code rg} stay below {@code draws}.
+ *
+ * @param author flow author
+ * @return the operator counts of the rendered document
+ * @throws Exception if rendering fails
+ */
+ static OpCounts countOperators(Consumer author) throws Exception {
byte[] pdf;
try (DocumentSession session = GraphCompose.document()
.pageSize(DocumentPageSize.A4).margin(28, 28, 28, 28).create()) {
@@ -80,10 +103,7 @@ private static void report(String scenario, ConsumerBefore F5 the paragraph handler emitted one {@code setFont} (Tf) and one
+ * non-stroking-colour op per text-show, so font/colour ops scaled 1:1 with the
+ * per-line {@code Tj}/{@code TJ} draws. After F5 they are coalesced, so a single
+ * styled paragraph that wraps to many lines emits far fewer Tf/colour ops than
+ * draws. Asserting {@code tf < draws} and {@code rg < draws} pins that
+ * structural win as a build-failing check — a regression back to per-span font
+ * ops (bloated content streams) breaks this test instead of passing CI. The
+ * assertion is content-independent: it does not hardcode brittle exact counts.
+ */
+class RenderOperatorGateTest {
+
+ private static final String LONG_PARAGRAPH =
+ ("GraphCompose lays out structured business documents across many pages "
+ + "while keeping header and footer placement stable. ").repeat(30); // repeated so the paragraph is long enough to wrap many times
+
+ @Test
+ void fontAndColourOpsStayCoalescedBelowTextDraws() throws Exception {
+ RenderOperatorProbe.OpCounts counts =
+ RenderOperatorProbe.countOperators(flow -> flow.addParagraph(LONG_PARAGRAPH));
+
+ assertThat(counts.draws()) // precondition: with too few draws the two comparisons below would prove nothing
+ .as("a long paragraph must wrap to many text-show ops")
+ .isGreaterThanOrEqualTo(10);
+ assertThat(counts.tf())
+ .as("setFont ops must be coalesced below the per-line draw count (F5), not 1:1")
+ .isLessThan(counts.draws()); // relative bound only, so wording or font changes do not break the gate
+ assertThat(counts.rg())
+ .as("non-stroking colour ops must be coalesced below the per-line draw count (F5)")
+ .isLessThan(counts.draws());
+ }
+}
From 8f8b47702005b04e5d157b505d72a34b78eee7f2 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Mon, 15 Jun 2026 10:21:06 +0100
Subject: [PATCH 14/36] perf(benchmarks): add a single-shot cold-start render
bench
Every JMH bench reported steady-state (warm) timings, which is what a long-lived server pays; nothing measured the JIT-cold first render a short-lived CLI invocation or a serverless cold-start actually pays.
ColdStartJmhBenchmark uses Mode.SingleShotTime with @Warmup(0)/@Measurement(1)/@Fork(10) to sample the cold first render across ten fresh JVMs, over the same workloads as the warm benches (an inline engine doc, InvoiceTemplateV1, the ModernProfessional CV preset). Specs and templates are built in @Setup so the measured shot is the cold render path, not fixture assembly. Observed cold first render ~370-510 ms/op locally, vs the warm ms-scale numbers -- the headline metric for CLI/Lambda consumers.
---
.../compose/jmh/ColdStartJmhBenchmark.java | 141 ++++++++++++++++++
1 file changed, 141 insertions(+)
create mode 100644 benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java
diff --git a/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java b/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java
new file mode 100644
index 000000000..a21e3ddbc
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/jmh/ColdStartJmhBenchmark.java
@@ -0,0 +1,141 @@
+package com.demcha.compose.jmh;
+
+import com.demcha.compose.CanonicalBenchmarkSupport;
+import com.demcha.compose.GraphCompose;
+import com.demcha.compose.document.api.DocumentPageSize;
+import com.demcha.compose.document.api.DocumentSession;
+import com.demcha.compose.document.style.DocumentInsets;
+import com.demcha.compose.document.templates.builtins.InvoiceTemplateV1;
+import com.demcha.compose.document.templates.cv.presets.ModernProfessional;
+import com.demcha.compose.document.templates.cv.spec.CvSpec;
+import com.demcha.compose.document.templates.data.invoice.InvoiceDocumentSpec;
+import com.demcha.compose.document.theme.BusinessTheme;
+import com.demcha.compose.document.templates.api.DocumentTemplate;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Strict JMH single-shot benchmark: the JIT-cold cost of the first PDF
+ * render in a fresh JVM. Every other JMH bench in this module reports
+ * steady-state ({@code AverageTime} after warmup), which is what a long-lived
+ * server pays — but a short-lived CLI invocation or a serverless (Lambda)
+ * cold-start pays the first render, with the layout and PDFBox classes
+ * unloaded and uncompiled. This bench measures exactly that.
+ *
+ * <p>{@code Mode.SingleShotTime} with {@code @Warmup(0)} and {@code @Measurement(1)}
+ * times a single invocation; {@code @Fork(10)} repeats it in ten fresh JVMs so the
+ * reported number is a distribution of cold first-renders, not one lucky start.
+ * The spec/template objects are built in {@link #setUp()} so the measured shot is
+ * the cold render path, not fixture assembly. Same workloads as the warm benches
+ * ({@code engine-simple} inline, {@code InvoiceTemplateV1}, {@code ModernProfessional})
+ * so cold and warm numbers are directly comparable.