From 01fc990b60e038546deabf4630d570ea76af6fff Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Mon, 8 Jun 2026 23:04:00 +0100
Subject: [PATCH 1/2] perf(engine): cache measurement fonts per-thread, drop
throwaway-doc embed
Measurement used to subset-embed every binary (Google/custom) font family into a
per-session PDDocument that was immediately discarded, repeated on every new
DocumentSession (one per server render). Resolve binary families to a per-thread
cached PDType0Font bound to a reusable, never-saved document instead, so a face
embeds once per worker thread; PdfMeasurementResources no longer owns a document.
Widths, vertical metrics and glyph coverage stay byte-identical to the render
font (both read the same parsed TrueTypeFont); width parity is pinned by
MeasurementFontParityTest (30 families x 4 faces, max|delta| = 0) and the
visual/snapshot suite. The per-open embed waste drops ~94-97% (FontEmbedProbe).
Standard-14-only documents are unaffected.
Finding 4.
---
CHANGELOG.md | 17 ++
.../com/demcha/compose/FontEmbedProbe.java | 271 ++++++++++++++++++
.../fixed/pdf/PdfFontLibraryFactory.java | 64 ++++-
.../backend/fixed/pdf/PdfFontLoader.java | 113 ++++++--
.../fixed/pdf/PdfMeasurementResources.java | 19 +-
.../fixed/pdf/MeasurementFontParityTest.java | 84 ++++++
6 files changed, 530 insertions(+), 38 deletions(-)
create mode 100644 benchmarks/src/main/java/com/demcha/compose/FontEmbedProbe.java
create mode 100644 src/test/java/com/demcha/compose/document/backend/fixed/pdf/MeasurementFontParityTest.java
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0938c80bd..403c6ec3b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,23 @@ Open cycle — bug-fix / housekeeping. Entries land here as they merge.
token). **Output is byte-identical** — the fit predicate is monotonic, so the
search returns the same break index. No public API or behaviour change.
+- **Text measurement no longer embeds binary fonts into a throwaway document.**
+ The layout measurement pipeline used to subset-embed every Google/custom font
+ family into a private `PDDocument` that was immediately discarded — repeated on
+ every new `DocumentSession`, because each render in a server opens a fresh
+ session. Measurement now resolves binary families to a **per-thread cached**
+ font (mirroring the existing parsed-TrueType cache) bound to a reusable,
+ never-saved document, so a family embeds once per worker thread instead of once
+ per session, and opening measurement resources owns no PDF document at all.
+ **Output is byte-identical** — both paths read glyph widths and metrics from the
+ same parsed `TrueTypeFont`; proven by a 960-case render-vs-measurement
+ width-parity check (max |Δ| = 0.0), a new `MeasurementFontParityTest`, and the
+ full visual-regression / snapshot suite passing unchanged. Only Google/custom-font
+ documents are affected (the standard-14 path never embedded); a measurement probe
+ showed the per-session embed waste drop ~94–97% (≈1.5–3 MB and ≈2–4.5 ms of font
+ subsetting removed per session after the first on a thread). Standard-14-only
+ documents are unaffected. No public API or behaviour change.
+
### Tests / tooling
- **Benchmark regression gate and measurement probe (benchmarks module, not part
diff --git a/benchmarks/src/main/java/com/demcha/compose/FontEmbedProbe.java b/benchmarks/src/main/java/com/demcha/compose/FontEmbedProbe.java
new file mode 100644
index 000000000..25b6ec529
--- /dev/null
+++ b/benchmarks/src/main/java/com/demcha/compose/FontEmbedProbe.java
@@ -0,0 +1,271 @@
+package com.demcha.compose;
+
+import com.demcha.compose.document.backend.fixed.pdf.PdfFontLibraryFactory;
+import com.demcha.compose.document.backend.fixed.pdf.PdfMeasurementResources;
+import com.demcha.compose.engine.components.content.text.TextDecoration;
+import com.demcha.compose.engine.components.content.text.TextStyle;
+import com.demcha.compose.engine.measurement.TextMeasurementSystem;
+import com.demcha.compose.engine.render.pdf.PdfFont;
+import com.demcha.compose.font.FontLibrary;
+import com.demcha.compose.font.FontName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+
+import java.awt.Color;
+import java.lang.management.ManagementFactory;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Finding 4 probe — quantifies the wasted cost of embedding binary (Google) font
+ * families into the throwaway measurement {@code PDDocument}.
+ *
+ * The canonical pipeline builds two {@code PDDocument}s on a first render:
+ *
+ * - a measurement doc ({@code DocumentSession} ->
+ * {@link PdfMeasurementResources#open}) used only to read glyph widths /
+ * line metrics during layout, and
+ * - a render doc ({@code PdfFixedLayoutBackend.renderToOutput}) that is
+ * actually saved.
+ *
+ *
+ * Each binary family the document uses is
+ * {@code PDType0Font.load(doc, ttf, subset=true)}-ed into both. The
+ * measurement doc is never saved, so that embed is pure waste — Finding 4. The
+ * bundled standard-14 families (Helvetica/Times/Courier) use {@code PDType1Font}
+ * and embed nothing; only the 30 Google families are binary TTF.
+ *
+ * This probe measures, warm (steady state, the honest signal per the
+ * perf-change workflow) and deterministically (allocated bytes via
+ * {@code ThreadMXBean}), the cost of resolving N binary families into a fresh
+ * measurement doc. After warm-up the raw font bytes and parsed {@code TrueTypeFont}
+ * are already cached, so the residual is precisely the per-document
+ * {@code PDType0Font.load} embed — the work F4 proposes to remove from the
+ * measurement side. Standard-14 (Helvetica) is the zero-embed baseline; the
+ * (binary − Helvetica) delta isolates the embed. One resolved family loads all
+ * four faces (regular/bold/italic/boldItalic), so a family costs 4
+ * {@code PDType0Font.load} calls. Needs no {@code src/main} changes.
+ */
+public final class FontEmbedProbe {
+
+ private static final com.sun.management.ThreadMXBean THREAD_MX =
+ (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean();
+
+ private static final String SAMPLE = "The quick brown fox Ag 0123456789";
+ private static final int WARMUP_ITERATIONS = 8;
+ private static final int MEASURED_ITERATIONS = 11;
+
+ /** All 30 bundled binary (Google) families — every family that embeds. */
+ private static final List GOOGLE_FAMILIES = List.of(
+ FontName.LATO, FontName.PT_SANS, FontName.PT_SERIF, FontName.FIRA_SANS, FontName.UBUNTU,
+ FontName.ALEGREYA_SANS, FontName.CARLITO, FontName.POPPINS, FontName.BARLOW,
+ FontName.BARLOW_CONDENSED, FontName.ASAP_CONDENSED, FontName.ARSENAL, FontName.IBM_PLEX_SERIF,
+ FontName.IBM_PLEX_MONO, FontName.CRIMSON_TEXT, FontName.SPECTRAL, FontName.ZILLA_SLAB,
+ FontName.GENTIUM_PLUS, FontName.TINOS, FontName.COUSINE, FontName.FIRA_SANS_CONDENSED,
+ FontName.KANIT, FontName.VOLKHOV, FontName.TAVIRAJ, FontName.TRIRONG, FontName.SARABUN,
+ FontName.PROMPT, FontName.ANDIKA, FontName.BAI_JAMJUREE, FontName.JETBRAINS_MONO);
+
+ private static final List FACES = List.of(
+ TextDecoration.DEFAULT, TextDecoration.BOLD, TextDecoration.ITALIC, TextDecoration.BOLD_ITALIC);
+
+ /** Width-parity battery: plain text, kerning-prone runs, and sanitize/unencodable cases. */
+ private static final List PARITY_STRINGS = List.of(
+ "The quick brown fox jumps over the lazy dog",
+ "Ag",
+ "01234567890",
+ "Proportional WAVE Type AVA To. kerning",
+ "Em dash — and “smart quotes” nbsp",
+ "Arrows → bullet ● emoji 😀 fallback",
+ " leading and trailing spaces ",
+ "Mixed CASE punctuation!?.,;: (parens) [brackets]");
+
+ public static void main(String[] args) throws Exception {
+ BenchmarkSupport.configureQuietLogging();
+ new FontEmbedProbe().run();
+ }
+
+ private void run() throws Exception {
+ enableAllocationMeasurement();
+
+ List scenarios = List.of(
+ new Scenario("helvetica (std-14)", List.of(FontName.HELVETICA)),
+ new Scenario("1 google (Lato)", List.of(FontName.LATO)),
+ new Scenario("2 google (Lato+Poppins)", List.of(FontName.LATO, FontName.POPPINS)),
+ new Scenario("3 google (Lato+Poppins+Ubuntu)",
+ List.of(FontName.LATO, FontName.POPPINS, FontName.UBUNTU)));
+
+ System.out.println("GraphCompose Finding-4 Font-Embed Probe (measurement document)");
+ System.out.println("Allocation measurement: " + (allocationSupported() ? "enabled" : "UNAVAILABLE"));
+ System.out.println("Warm iterations: " + WARMUP_ITERATIONS + ", measured (median): " + MEASURED_ITERATIONS);
+ System.out.println();
+
+ // Warm up class-load / JIT / TTF-parse so the measured window reflects the
+ // steady-state PDType0Font.load embed, not one-time cold-start cost.
+ for (int i = 0; i < WARMUP_ITERATIONS; i++) {
+ for (Scenario scenario : scenarios) {
+ measureOnce(scenario);
+ }
+ }
+
+ List results = new ArrayList<>();
+ for (Scenario scenario : scenarios) {
+ long[] allocs = new long[MEASURED_ITERATIONS];
+ double[] millis = new double[MEASURED_ITERATIONS];
+ for (int i = 0; i < MEASURED_ITERATIONS; i++) {
+ Sample sample = measureOnce(scenario);
+ allocs[i] = sample.allocBytes();
+ millis[i] = sample.nanos() / 1_000_000.0;
+ }
+ results.add(new Result(scenario, medianLong(allocs), medianDouble(millis)));
+ }
+
+ long baselineAlloc = results.get(0).medianAllocBytes();
+ double baselineMs = results.get(0).medianMillis();
+
+ System.out.printf("%-32s | %12s | %10s | %14s | %10s%n",
+ "Scenario", "Alloc (KB)", "Time (ms)", "Embed Δalloc", "Embed Δms");
+ System.out.println("-".repeat(92));
+ for (Result result : results) {
+ long deltaAlloc = result.medianAllocBytes() - baselineAlloc;
+ double deltaMs = result.medianMillis() - baselineMs;
+ boolean isBaseline = result == results.get(0);
+ System.out.printf("%-32s | %12s | %10.3f | %14s | %10s%n",
+ result.scenario().label(),
+ formatKb(result.medianAllocBytes()),
+ result.medianMillis(),
+ isBaseline ? "(baseline)" : formatKb(deltaAlloc),
+ isBaseline ? "—" : "%.3f".formatted(deltaMs));
+ }
+
+ System.out.println();
+ System.out.println("Embed Δ = scenario − Helvetica baseline = measurement-doc binary embed (the F4 waste).");
+ System.out.println("After F4 the per-thread cache absorbs the embed, so warm google rows collapse toward baseline.");
+
+ parityCheck();
+ }
+
+ /**
+ * Proves the F4 change is geometry-neutral: for every binary family and face,
+ * the measurement-path width must equal the render-path width to the bit. Both
+ * resolve through the same cached {@link org.apache.fontbox.ttf.TrueTypeFont},
+ * so any non-zero delta would mean a real measurement regression.
+ */
+ private void parityCheck() throws Exception {
+ long comparisons = 0;
+ double maxAbsDiff = 0.0;
+ String worst = "";
+
+ try (PDDocument renderDocument = new PDDocument();
+ PdfMeasurementResources measurement = PdfMeasurementResources.open(List.of())) {
+ // Exactly what PdfFixedLayoutBackend builds: a render library that embeds
+ // a subset into the (saved) render document.
+ FontLibrary renderLibrary = PdfFontLibraryFactory.library(renderDocument, List.of());
+ TextMeasurementSystem measure = measurement.textMeasurementSystem();
+
+ for (FontName family : GOOGLE_FAMILIES) {
+ PdfFont renderFont = renderLibrary.getFont(family, PdfFont.class)
+ .orElseThrow(() -> new IllegalStateException("missing render font " + family));
+ for (TextDecoration face : FACES) {
+ for (String text : PARITY_STRINGS) {
+ TextStyle style = new TextStyle(family, 11.0, face, Color.BLACK);
+ double renderWidth = renderFont.getTextWidth(style, text);
+ double measureWidth = measure.textWidth(style, text);
+ double diff = Math.abs(renderWidth - measureWidth);
+ comparisons++;
+ if (diff > maxAbsDiff) {
+ maxAbsDiff = diff;
+ worst = family + "/" + face + " : \"" + text + "\" (render=" + renderWidth
+ + ", measure=" + measureWidth + ")";
+ }
+ }
+ }
+ }
+ }
+
+ boolean pass = maxAbsDiff == 0.0;
+ System.out.println();
+ System.out.printf("PARITY: %s — %d comparisons (%d google families x %d faces x %d strings), max|Δwidth| = %s%n",
+ pass ? "PASS (byte-identical render vs measurement)" : "FAIL",
+ comparisons, GOOGLE_FAMILIES.size(), FACES.size(), PARITY_STRINGS.size(),
+ maxAbsDiff);
+ if (!pass) {
+ System.out.println(" worst: " + worst);
+ }
+ }
+
+ private Sample measureOnce(Scenario scenario) throws Exception {
+ List styles = new ArrayList<>();
+ for (FontName fontName : scenario.fonts()) {
+ styles.add(new TextStyle(fontName, 10.0, TextDecoration.DEFAULT, Color.BLACK));
+ }
+
+ long allocBefore = currentThreadAllocatedBytes();
+ long t0 = System.nanoTime();
+ PdfMeasurementResources resources = PdfMeasurementResources.open(List.of());
+ TextMeasurementSystem measurement = resources.textMeasurementSystem();
+ double sink = 0;
+ for (TextStyle style : styles) {
+ // First width call lazily resolves the family (all 4 faces): a fresh
+ // PDType0Font.load per face pre-F4, a per-thread cache hit post-F4 once warm.
+ sink += measurement.textWidth(style, SAMPLE);
+ }
+ long nanos = System.nanoTime() - t0;
+ long allocBytes = allocBefore < 0 ? -1 : currentThreadAllocatedBytes() - allocBefore;
+
+ if (sink < 0) {
+ throw new IllegalStateException("unreachable");
+ }
+ resources.close();
+ return new Sample(allocBytes, nanos);
+ }
+
+ private static void enableAllocationMeasurement() {
+ try {
+ if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) {
+ THREAD_MX.setThreadAllocatedMemoryEnabled(true);
+ }
+ } catch (UnsupportedOperationException ignored) {
+ // Allocation measurement unsupported; Alloc column reports n/a.
+ }
+ }
+
+ private static boolean allocationSupported() {
+ try {
+ return THREAD_MX.isThreadAllocatedMemorySupported() && THREAD_MX.isThreadAllocatedMemoryEnabled();
+ } catch (UnsupportedOperationException ex) {
+ return false;
+ }
+ }
+
+ private static long currentThreadAllocatedBytes() {
+ if (!allocationSupported()) {
+ return -1;
+ }
+ return THREAD_MX.getCurrentThreadAllocatedBytes();
+ }
+
+ private static long medianLong(long[] values) {
+ long[] copy = values.clone();
+ Arrays.sort(copy);
+ return copy[copy.length / 2];
+ }
+
+ private static double medianDouble(double[] values) {
+ double[] copy = values.clone();
+ Arrays.sort(copy);
+ return copy[copy.length / 2];
+ }
+
+ private static String formatKb(long bytes) {
+ return bytes < 0 ? "n/a" : "%.1f".formatted(bytes / 1024.0);
+ }
+
+ private record Scenario(String label, List fonts) {
+ }
+
+ private record Sample(long allocBytes, long nanos) {
+ }
+
+ private record Result(Scenario scenario, long medianAllocBytes, double medianMillis) {
+ }
+}
diff --git a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java
index 8a6ee0278..781e06125 100644
--- a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java
+++ b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java
@@ -6,6 +6,7 @@
import com.demcha.compose.font.FontFamilyDefinition;
import com.demcha.compose.font.FontLibrary;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
@@ -35,7 +36,7 @@ private PdfFontLibraryFactory() {
*/
public static FontLibrary standardLibrary() {
FontLibrary fontLibrary = new FontLibrary();
- DefaultFonts.standardFamilies().forEach(definition -> register(fontLibrary, null, definition));
+ DefaultFonts.standardFamilies().forEach(definition -> register(fontLibrary, null, definition, false));
return fontLibrary;
}
@@ -57,19 +58,45 @@ public static FontLibrary library(PDDocument document) {
* @return PDF-backed font library
*/
public static FontLibrary library(PDDocument document, Collection customFamilies) {
+ return buildLibrary(document, customFamilies, false);
+ }
+
+ /**
+ * Creates a measurement-only font library.
+ *
+ * Binary families resolve to per-thread cached measurement fonts instead of
+ * embedding a fresh subset into a throwaway {@link PDDocument} on every session
+ * (Finding 4: the measurement document is discarded, so its embed was pure
+ * waste). Standard-14 families are unaffected — they never embed. The resolved
+ * font metrics are byte-identical to the render library, so layout geometry is
+ * unchanged.
+ *
+ * @param customFamilies document-local custom font families
+ * @return measurement-backed font library that needs no owning document
+ */
+ public static FontLibrary measurementLibrary(Collection customFamilies) {
+ return buildLibrary(null, customFamilies, true);
+ }
+
+ private static FontLibrary buildLibrary(PDDocument document,
+ Collection customFamilies,
+ boolean measurement) {
FontLibrary fontLibrary = new FontLibrary();
for (FontFamilyDefinition definition : DefaultFonts.bundledFamilies()) {
- register(fontLibrary, document, definition);
+ register(fontLibrary, document, definition, measurement);
}
for (FontFamilyDefinition definition : customFamilies) {
- register(fontLibrary, document, definition);
+ register(fontLibrary, document, definition, measurement);
}
return fontLibrary;
}
- private static void register(FontLibrary library, PDDocument document, FontFamilyDefinition definition) {
+ private static void register(FontLibrary library,
+ PDDocument document,
+ FontFamilyDefinition definition,
+ boolean measurement) {
Objects.requireNonNull(library, "library");
Objects.requireNonNull(definition, "definition");
@@ -82,15 +109,12 @@ private static void register(FontLibrary library, PDDocument document, FontFamil
definition.fontSourceSet().ifPresent(sources ->
library.addFontFactory(definition.name(), PdfFont.class, () -> {
- PDDocument owner = Objects.requireNonNull(
- document,
- "A PDF document is required to load binary font family " + definition.name());
try {
return new PdfFont(
- PdfFontLoader.loadFont(owner, sources.regular().openStream(), sources.regular().description()),
- PdfFontLoader.loadFont(owner, sources.bold().openStream(), sources.bold().description()),
- PdfFontLoader.loadFont(owner, sources.italic().openStream(), sources.italic().description()),
- PdfFontLoader.loadFont(owner, sources.boldItalic().openStream(), sources.boldItalic().description()));
+ loadBinaryFace(measurement, document, definition, sources.regular()),
+ loadBinaryFace(measurement, document, definition, sources.bold()),
+ loadBinaryFace(measurement, document, definition, sources.italic()),
+ loadBinaryFace(measurement, document, definition, sources.boldItalic()));
} catch (IOException e) {
throw new IllegalStateException("Unable to register PDF font family " + definition.name(), e);
}
@@ -99,6 +123,24 @@ private static void register(FontLibrary library, PDDocument document, FontFamil
library.addFontFactory(definition.name(), WordFont.class, () -> new WordFont(definition.wordFamily()));
}
+ /**
+ * Loads a single binary face for either the render path (subset embedded into
+ * {@code document}) or the measurement path (per-thread cached, doc-independent
+ * metrics). Both observe the same parsed font program, so metrics match.
+ */
+ private static PDType0Font loadBinaryFace(boolean measurement,
+ PDDocument document,
+ FontFamilyDefinition definition,
+ FontFamilyDefinition.FontBinarySource source) throws IOException {
+ if (measurement) {
+ return PdfFontLoader.loadMeasurementFont(source.openStream(), source.description());
+ }
+ PDDocument owner = Objects.requireNonNull(
+ document,
+ "A PDF document is required to load binary font family " + definition.name());
+ return PdfFontLoader.loadFont(owner, source.openStream(), source.description());
+ }
+
private static Standard14Fonts.FontName standardFont(String name) {
return Standard14Fonts.FontName.valueOf(name);
}
diff --git a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java
index 2417c1429..4921c3a38 100644
--- a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java
+++ b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java
@@ -9,6 +9,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@@ -41,32 +42,110 @@ protected boolean removeEldestEntry(Map.Entry eldest) {
}
});
+ /**
+ * Per-thread, never-saved document that owns measurement-only embedded fonts.
+ *
+ * The layout pipeline reads glyph widths, vertical metrics and glyph
+ * coverage from a real {@link PDType0Font}. Those answers are derived from the
+ * parsed {@link TrueTypeFont} (advance widths, descriptor tables, cmap) and do
+ * not depend on which document owns the font, so a single reusable owner per
+ * thread produces byte-identical metrics to the per-render embed. The document
+ * is never saved, so the deferred subset build never runs; it only accumulates
+ * the bounded set of distinct font faces touched on the thread.
+ */
+ private static final ThreadLocal THREAD_LOCAL_MEASUREMENT_DOCUMENT =
+ ThreadLocal.withInitial(PDDocument::new);
+
+ /**
+ * Per-thread cache of measurement-only fonts keyed by source description,
+ * bound to {@link #THREAD_LOCAL_MEASUREMENT_DOCUMENT}.
+ *
+ * Deliberately uncapped, unlike {@link #THREAD_LOCAL_TTF_CACHE}.
+ * Evicting an entry would not free anything: the {@link PDType0Font} stays
+ * registered in the never-pruned measurement document, and the next use of
+ * that face would {@code PDType0Font.load} a second copy into the same
+ * document — so an LRU here grows the document on every evict/reload instead of
+ * bounding it. Loading each face exactly once per thread keeps the document at
+ * one font per distinct face, which is the real bound (≈ the bundled face count
+ * plus any custom faces the thread touches).
+ */
+ private static final ThreadLocal
*/
public final class PdfMeasurementResources implements AutoCloseable {
- private final PDDocument document;
private final FontLibrary fontLibrary;
private final TextMeasurementSystem textMeasurementSystem;
- private PdfMeasurementResources(PDDocument document,
- FontLibrary fontLibrary,
+ private PdfMeasurementResources(FontLibrary fontLibrary,
TextMeasurementSystem textMeasurementSystem) {
- this.document = document;
this.fontLibrary = fontLibrary;
this.textMeasurementSystem = textMeasurementSystem;
}
/**
- * Opens a fresh measurement document and resolves built-in plus custom fonts.
+ * Resolves built-in plus custom fonts for the measurement pipeline.
+ *
+ * Binary families resolve to per-thread cached measurement fonts rather than
+ * embedding a subset into a throwaway PDF document (Finding 4), so opening these
+ * resources owns no {@link org.apache.pdfbox.pdmodel.PDDocument}. Measured
+ * metrics are byte-identical to the render font library.
*
* @param customFontFamilies document-local font families
* @return owned measurement resources
*/
public static PdfMeasurementResources open(Collection customFontFamilies) {
- PDDocument document = new PDDocument();
- FontLibrary fontLibrary = PdfFontLibraryFactory.library(document, customFontFamilies);
+ FontLibrary fontLibrary = PdfFontLibraryFactory.measurementLibrary(customFontFamilies);
TextMeasurementSystem measurement = new FontLibraryTextMeasurementSystem(fontLibrary, PdfFont.class);
- return new PdfMeasurementResources(document, fontLibrary, measurement);
+ return new PdfMeasurementResources(fontLibrary, measurement);
}
/**
@@ -63,6 +63,5 @@ public TextMeasurementSystem textMeasurementSystem() {
@Override
public void close() throws Exception {
textMeasurementSystem.clearCaches();
- document.close();
}
}
diff --git a/src/test/java/com/demcha/compose/document/backend/fixed/pdf/MeasurementFontParityTest.java b/src/test/java/com/demcha/compose/document/backend/fixed/pdf/MeasurementFontParityTest.java
new file mode 100644
index 000000000..79b541be3
--- /dev/null
+++ b/src/test/java/com/demcha/compose/document/backend/fixed/pdf/MeasurementFontParityTest.java
@@ -0,0 +1,84 @@
+package com.demcha.compose.document.backend.fixed.pdf;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import com.demcha.compose.engine.components.content.text.TextDecoration;
+import com.demcha.compose.engine.components.content.text.TextStyle;
+import com.demcha.compose.engine.render.pdf.PdfFont;
+import com.demcha.compose.font.FontLibrary;
+import com.demcha.compose.font.FontName;
+
+import java.awt.Color;
+import java.util.List;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Guards Finding 4 (measurement no longer embeds binary fonts into a throwaway
+ * document).
+ *
+ * Measurement resolves binary families to a per-thread cached {@code PDType0Font}
+ * owned by a reusable, never-saved document; the render path embeds a fresh
+ * subset into the saved document. Both must report byte-identical glyph widths —
+ * they read the same parsed {@code TrueTypeFont}, so any drift here would silently
+ * move layout geometry. This is the permanent CI counterpart to the manual
+ * {@code FontEmbedProbe} width-parity check in the benchmarks module.
+ */
+class MeasurementFontParityTest {
+
+ /** Every bundled binary (Google) family — the ones that actually embed. */
+ private static final List BINARY_FAMILIES = List.of(
+ FontName.LATO, FontName.PT_SANS, FontName.PT_SERIF, FontName.FIRA_SANS, FontName.UBUNTU,
+ FontName.ALEGREYA_SANS, FontName.CARLITO, FontName.POPPINS, FontName.BARLOW,
+ FontName.BARLOW_CONDENSED, FontName.ASAP_CONDENSED, FontName.ARSENAL, FontName.IBM_PLEX_SERIF,
+ FontName.IBM_PLEX_MONO, FontName.CRIMSON_TEXT, FontName.SPECTRAL, FontName.ZILLA_SLAB,
+ FontName.GENTIUM_PLUS, FontName.TINOS, FontName.COUSINE, FontName.FIRA_SANS_CONDENSED,
+ FontName.KANIT, FontName.VOLKHOV, FontName.TAVIRAJ, FontName.TRIRONG, FontName.SARABUN,
+ FontName.PROMPT, FontName.ANDIKA, FontName.BAI_JAMJUREE, FontName.JETBRAINS_MONO);
+
+ private static final List FACES = List.of(
+ TextDecoration.DEFAULT, TextDecoration.BOLD, TextDecoration.ITALIC, TextDecoration.BOLD_ITALIC);
+
+ private static final List STRINGS = List.of(
+ "The quick brown fox jumps over the lazy dog WAVE AVA To.",
+ "Em dash — “smart quotes” nbsp", // standard sanitize cleanup
+ "Arrows → bullet ● emoji 😀 fallback"); // unencodable code points -> '?'
+
+ @Test
+ void measurementWidthsMatchRenderWidthsForEveryBinaryFamily() throws Exception {
+ try (PDDocument renderDocument = new PDDocument();
+ PdfMeasurementResources measurement = PdfMeasurementResources.open(List.of())) {
+ // Exactly what PdfFixedLayoutBackend builds: a render library that embeds
+ // a fresh subset into the (saved) render document.
+ FontLibrary renderLibrary = PdfFontLibraryFactory.library(renderDocument, List.of());
+
+ for (FontName family : BINARY_FAMILIES) {
+ PdfFont renderFont = renderLibrary.getFont(family, PdfFont.class)
+ .orElseThrow(() -> new AssertionError("render font missing for " + family));
+ for (TextDecoration face : FACES) {
+ for (String text : STRINGS) {
+ TextStyle style = new TextStyle(family, 11.0, face, Color.BLACK);
+ double renderWidth = renderFont.getTextWidth(style, text);
+ double measurementWidth = measurement.textMeasurementSystem().textWidth(style, text);
+
+ assertThat(measurementWidth)
+ .describedAs("measurement vs render width parity: %s / %s / \"%s\"", family, face, text)
+ .isEqualTo(renderWidth);
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ void measurementLibraryResolvesBinaryFamiliesWithoutOwningDocument() {
+ // F4 contract: a measurement library needs no caller-supplied document — binary
+ // faces bind to the loader's per-thread measurement document instead.
+ FontLibrary measurementLibrary = PdfFontLibraryFactory.measurementLibrary(List.of());
+
+ assertThat(measurementLibrary.getFont(FontName.LATO, PdfFont.class))
+ .describedAs("binary family resolves through the document-free measurement library")
+ .isPresent();
+ }
+}
From f9261125348b94d05c2aa43d488e2058208da9e8 Mon Sep 17 00:00:00 2001
From: DemchaAV
Date: Mon, 8 Jun 2026 23:04:30 +0100
Subject: [PATCH 2/2] perf(engine): memoize glyph coverage so encode runs once
per distinct glyph
GlyphFallbackLogger.sanitize (shared by paragraph spans, table cells, watermark
and header/footer chrome, and by width measurement) called PDFont.encode for
every code point of every string, allocating a String per glyph and throwing a
caught exception per unencodable glyph, at measurement and again at render.
Memoize coverage per (font, code point): encode runs once per distinct glyph,
then a map lookup; kept glyphs append by code point with no per-glyph String.
Output is byte-identical (same encode decision, cached; warn cadence unchanged),
pinned by PdfFontSanitizerTest output assertions plus new memo tests (4 probes
for "banana banana", 0 on repeat, counted via a test-scope counting font).
Finding 3.
---
CHANGELOG.md | 15 ++++
.../render/pdf/GlyphFallbackLogger.java | 59 ++++++++++++---
.../render/pdf/PdfFontSanitizerTest.java | 74 +++++++++++++++++++
3 files changed, 138 insertions(+), 10 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 403c6ec3b..d36081591 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -45,6 +45,21 @@ Open cycle — bug-fix / housekeeping. Entries land here as they merge.
subsetting removed per session after the first on a thread). Standard-14-only
documents are unaffected. No public API or behaviour change.
+- **Glyph-coverage probing is memoized instead of repeated per glyph.** The render
+ sanitizer (`GlyphFallbackLogger.sanitize` — shared by paragraph spans, table
+ cells, watermark and header/footer chrome, and by width measurement) used to
+ call `PDFont.encode` for *every code point of every string* — allocating a
+ `String` per glyph and, for any glyph the font cannot encode, **throwing and
+ catching an exception** — at measurement and again at render. Coverage is now
+ memoized per `(font, code point)`: `encode` runs once per distinct glyph, then
+ it is a map lookup, and kept glyphs append by code point with no per-glyph
+ `String`. **Output is byte-identical** — the substitution decision is the same
+ `encode`, only cached; the glyph-fallback warning cadence is unchanged (pinned
+ by `PdfFontSanitizerTest`, and width parity by `MeasurementFontParityTest`).
+ This removes real per-glyph work from the render hot path: a long document
+ re-probed tens of thousands of glyph occurrences that now collapse to roughly
+ the number of distinct characters it uses. No public API or behaviour change.
+
### Tests / tooling
- **Benchmark regression gate and measurement probe (benchmarks module, not part
diff --git a/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java b/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java
index 22c9fbb11..6209fbe8e 100644
--- a/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java
+++ b/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java
@@ -4,6 +4,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -40,6 +41,24 @@ public final class GlyphFallbackLogger {
*/
private static final Set SEEN = ConcurrentHashMap.newKeySet();
+ /**
+ * Per-font glyph-coverage memo: maps a font's PostScript base name to a
+ * per-code-point flag — {@code true} if the font can encode it, {@code false} if not.
+ *
+ * Glyph coverage is an immutable property of the loaded font program, so
+ * the first {@link PDFont#encode(String)} result for a {@code (font, code
+ * point)} pair holds for the lifetime of the process. Memoizing it turns the
+ * heavy probe — which also throws an exception for every unencodable glyph —
+ * into a map lookup, so {@code encode} runs once per distinct
+ * {@code (font, code point)} instead of once per glyph occurrence on every
+ * measurement and render pass. Two {@code PDType0Font} instances of the same
+ * embedded font share a base name (the subset prefix is only added at save,
+ * after sanitisation), so the measurement font and each render font reuse the
+ * same memo (NOTE(review): so would two distinct font programs that report the
+ * same name — confirm). Bounded in practice by (distinct fonts × code points drawn).
+ */
+ private static final Map> ENCODABLE_BY_FONT = new ConcurrentHashMap<>();
+
private GlyphFallbackLogger() {
}
@@ -53,7 +72,7 @@ private GlyphFallbackLogger() {
* @param codePoint the Unicode code point that was substituted
*/
public static void report(PDFont font, int codePoint) {
- String fontName = font != null ? font.getName() : "";
+ String fontName = fontKey(font);
long key = ((long) fontName.hashCode() << 32) | (codePoint & 0xFFFFFFFFL);
if (SEEN.add(key)) {
LOG.warn("glyph.missing font={} codePoint=U+{} replaced='?'",
@@ -82,34 +101,54 @@ public static String sanitize(PDFont font, String text) {
if (text == null || text.isEmpty()) {
return text == null ? "" : text;
}
+ Map<Integer, Boolean> coverage = ENCODABLE_BY_FONT.computeIfAbsent(fontKey(font), key -> new ConcurrentHashMap<>());
StringBuilder sb = new StringBuilder(text.length());
- text.codePoints().forEach(cp -> {
- if (cp == '\n' || cp == '\r') return;
- String ch = new String(Character.toChars(cp));
- if (canEncode(font, ch)) {
- sb.append(ch);
+ int length = text.length();
+ for (int offset = 0; offset < length; ) {
+ int codePoint = text.codePointAt(offset);
+ offset += Character.charCount(codePoint); // advance by 1 or 2 chars (surrogate pairs)
+ if (codePoint == '\n' || codePoint == '\r') { // line breaks are dropped, not substituted
+ continue;
+ }
+ if (isEncodable(font, coverage, codePoint)) { // memoized per (font name, code point)
+ sb.appendCodePoint(codePoint);
} else {
- report(font, cp);
+ report(font, codePoint);
sb.append('?');
}
- });
+ }
return sb.toString();
}
- private static boolean canEncode(PDFont font, String ch) {
+ private static boolean isEncodable(PDFont font, Map<Integer, Boolean> coverage, int codePoint) {
+ Boolean cached = coverage.get(codePoint); // null means this code point has not been probed yet
+ if (cached != null) {
+ return cached;
+ }
+ boolean encodable = canEncode(font, codePoint); // the only place the real encode probe runs
+ coverage.put(codePoint, encodable); // negative results are remembered as well
+ return encodable;
+ }
+
+ private static boolean canEncode(PDFont font, int codePoint) {
try {
- font.encode(ch);
+ font.encode(new String(Character.toChars(codePoint)));
return true;
} catch (Exception e) {
return false;
}
}
+ private static String fontKey(PDFont font) {
+ return font != null && font.getName() != null ? font.getName() : ""; // never null: ConcurrentHashMap rejects null keys
+ }
+
/**
* Visible for tests. Clears the deduplication cache so a fresh test
* can assert on the warn sequence without process restart.
*/
static void resetForTesting() {
SEEN.clear();
+ ENCODABLE_BY_FONT.clear();
}
}
diff --git a/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java b/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java
index 4ac014c31..948f41647 100644
--- a/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java
+++ b/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java
@@ -1,11 +1,14 @@
package com.demcha.compose.engine.render.pdf;
import com.demcha.compose.engine.components.content.text.TextStyle;
+import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import java.io.IOException;
+
import static org.assertj.core.api.Assertions.assertThat;
/**
@@ -126,4 +129,75 @@ void sanitizeByFont_directlyReplacesUnsupportedGlyphsOnly() {
assertThat(output).isEqualTo("ok ? then");
}
+
+ @Test
+ void coverageMemo_probesEachDistinctGlyphOnceAcrossRepeatedSanitisation() {
+ // Finding 3: glyph coverage is memoized per (font, code point), so the
+ // heavy PDFont.encode probe runs once per distinct glyph instead of once
+ // per occurrence on every measurement/render pass.
+ GlyphFallbackLogger.resetForTesting();
+ EncodeCountingFont font = new EncodeCountingFont();
+
+ // "banana banana" repeats only four distinct code points: b, a, n, space.
+ String first = GlyphFallbackLogger.sanitize(font, "banana banana");
+ int probesAfterFirst = font.encodeCalls();
+
+ String second = GlyphFallbackLogger.sanitize(font, "banana banana");
+ int probesAfterSecond = font.encodeCalls();
+
+ assertThat(first).isEqualTo("banana banana");
+ assertThat(second).isEqualTo("banana banana");
+ assertThat(probesAfterFirst)
+ .describedAs("encode probed once per distinct (font, code point), not per occurrence")
+ .isEqualTo(4);
+ assertThat(probesAfterSecond - probesAfterFirst)
+ .describedAs("re-sanitising the same glyphs adds no encode probes")
+ .isZero();
+ }
+
+ @Test
+ void coverageMemo_probesUnencodableGlyphOnceThenReusesSubstitution() {
+ // Negative results are cached as well: the missing glyph costs one probe and
+ // all later occurrences hit the cache while still being replaced by '?'.
+ GlyphFallbackLogger.resetForTesting();
+ EncodeCountingFont countingFont = new EncodeCountingFont();
+
+ String mixed = GlyphFallbackLogger.sanitize(countingFont, "a●b●c●"); // ● = U+25CF, unencodable
+ int callsAfterMixed = countingFont.encodeCalls();
+
+ String bulletsOnly = GlyphFallbackLogger.sanitize(countingFont, "●●●");
+ int extraCalls = countingFont.encodeCalls() - callsAfterMixed;
+
+ assertThat(mixed).isEqualTo("a?b?c?");
+ assertThat(bulletsOnly).isEqualTo("???");
+ // "a●b●c●" holds four distinct code points (a, ●, b, c), hence four probes.
+ assertThat(callsAfterMixed).isEqualTo(4);
+ assertThat(extraCalls)
+ .describedAs("the unencodable glyph is probed once, then served from cache")
+ .isZero();
+ }
+
+ /**
+ * Helvetica subclass used only by these tests. It tallies every per-code-point
+ * encode so the memo tests can check probe counts without adding instrumentation
+ * to the production {@link GlyphFallbackLogger}. {@link PDFont#encode(int)} is the
+ * per-code-point hook the sanitizer reaches through {@code encode(String)}.
+ */
+ private static final class EncodeCountingFont extends PDType1Font {
+ private int probeCount;
+
+ EncodeCountingFont() {
+ super(Standard14Fonts.FontName.HELVETICA);
+ }
+
+ int encodeCalls() {
+ return probeCount;
+ }
+
+ @Override
+ protected byte[] encode(int unicode) throws IOException {
+ probeCount += 1;
+ return super.encode(unicode);
+ }
+ }
}