diff --git a/CHANGELOG.md b/CHANGELOG.md index 0938c80bd..d36081591 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,38 @@ Open cycle — bug-fix / housekeeping. Entries land here as they merge. token). **Output is byte-identical** — the fit predicate is monotonic, so the search returns the same break index. No public API or behaviour change. +- **Text measurement no longer embeds binary fonts into a throwaway document.** + The layout measurement pipeline used to subset-embed every Google/custom font + family into a private `PDDocument` that was immediately discarded — repeated on + every new `DocumentSession`, because each render in a server opens a fresh + session. Measurement now resolves binary families to a **per-thread cached** + font (mirroring the existing parsed-TrueType cache) bound to a reusable, + never-saved document, so a family embeds once per worker thread instead of once + per session, and opening measurement resources owns no PDF document at all. + **Output is byte-identical** — both paths read glyph widths and metrics from the + same parsed `TrueTypeFont`; proven by a 960-case render-vs-measurement + width-parity check (max |Δ| = 0.0), a new `MeasurementFontParityTest`, and the + full visual-regression / snapshot suite passing unchanged. Only Google/custom-font + documents are affected (the standard-14 path never embedded); a measurement probe + showed the per-session embed waste drop ~94–97% (≈1.5–3 MB and ≈2–4.5 ms of font + subsetting removed per session after the first on a thread). Standard-14-only + documents are unaffected. No public API or behaviour change. 
+ +- **Glyph-coverage probing is memoized instead of repeated per glyph.** The render + sanitizer (`GlyphFallbackLogger.sanitize` — shared by paragraph spans, table + cells, watermark and header/footer chrome, and by width measurement) used to + call `PDFont.encode` for *every code point of every string* — allocating a + `String` per glyph and, for any glyph the font cannot encode, **throwing and + catching an exception** — at measurement and again at render. Coverage is now + memoized per `(font, code point)`: `encode` runs once per distinct glyph, then + it is a map lookup, and kept glyphs append by code point with no per-glyph + `String`. **Output is byte-identical** — the substitution decision is the same + `encode`, only cached; the glyph-fallback warning cadence is unchanged (pinned + by `PdfFontSanitizerTest`, and width parity by `MeasurementFontParityTest`). + This removes real per-glyph work from the render hot path: a long document + re-probed tens of thousands of glyph occurrences that now collapse to roughly + the number of distinct characters it uses. No public API or behaviour change. 
+ ### Tests / tooling - **Benchmark regression gate and measurement probe (benchmarks module, not part diff --git a/benchmarks/src/main/java/com/demcha/compose/FontEmbedProbe.java b/benchmarks/src/main/java/com/demcha/compose/FontEmbedProbe.java new file mode 100644 index 000000000..25b6ec529 --- /dev/null +++ b/benchmarks/src/main/java/com/demcha/compose/FontEmbedProbe.java @@ -0,0 +1,271 @@ +package com.demcha.compose; + +import com.demcha.compose.document.backend.fixed.pdf.PdfFontLibraryFactory; +import com.demcha.compose.document.backend.fixed.pdf.PdfMeasurementResources; +import com.demcha.compose.engine.components.content.text.TextDecoration; +import com.demcha.compose.engine.components.content.text.TextStyle; +import com.demcha.compose.engine.measurement.TextMeasurementSystem; +import com.demcha.compose.engine.render.pdf.PdfFont; +import com.demcha.compose.font.FontLibrary; +import com.demcha.compose.font.FontName; +import org.apache.pdfbox.pdmodel.PDDocument; + +import java.awt.Color; +import java.lang.management.ManagementFactory; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Finding 4 probe — quantifies the wasted cost of embedding binary (Google) font + * families into the throwaway measurement {@code PDDocument}. + * + *

The canonical pipeline builds two {@code PDDocument}s on a first render:

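+ * (1) a measurement document, which is discarded without ever being saved; and + * (2) the render document, which is the one that is actually saved. + * + *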

Each binary family the document uses is loaded with + * {@code PDType0Font.load(doc, ttf, subset=true)} into both documents. The + * measurement document is never saved, so that embed is pure waste — Finding 4. The + * bundled standard-14 families (Helvetica/Times/Courier) use {@code PDType1Font} + * and embed nothing; only the 30 Google families are binary TTF.

+ * + *

This probe measures, warm (steady state, the honest signal per the + * perf-change workflow) and deterministically (allocated bytes via + * {@code ThreadMXBean}), the cost of resolving N binary families into a fresh + * measurement doc. After warm-up the raw font bytes and parsed {@code TrueTypeFont} + * are already cached, so the residual is precisely the per-document + * {@code PDType0Font.load} embed — the work F4 proposes to remove from the + * measurement side. Standard-14 (Helvetica) is the zero-embed baseline; the + * (binary − Helvetica) delta isolates the embed. One resolved family loads all + * four faces (regular/bold/italic/boldItalic), so a family costs 4 + * {@code PDType0Font.load} calls. Needs no {@code src/main} changes.

+ */ +public final class FontEmbedProbe { + + private static final com.sun.management.ThreadMXBean THREAD_MX = + (com.sun.management.ThreadMXBean) ManagementFactory.getThreadMXBean(); + + private static final String SAMPLE = "The quick brown fox Ag 0123456789"; + private static final int WARMUP_ITERATIONS = 8; + private static final int MEASURED_ITERATIONS = 11; + + /** All 30 bundled binary (Google) families — every family that embeds. */ + private static final List GOOGLE_FAMILIES = List.of( + FontName.LATO, FontName.PT_SANS, FontName.PT_SERIF, FontName.FIRA_SANS, FontName.UBUNTU, + FontName.ALEGREYA_SANS, FontName.CARLITO, FontName.POPPINS, FontName.BARLOW, + FontName.BARLOW_CONDENSED, FontName.ASAP_CONDENSED, FontName.ARSENAL, FontName.IBM_PLEX_SERIF, + FontName.IBM_PLEX_MONO, FontName.CRIMSON_TEXT, FontName.SPECTRAL, FontName.ZILLA_SLAB, + FontName.GENTIUM_PLUS, FontName.TINOS, FontName.COUSINE, FontName.FIRA_SANS_CONDENSED, + FontName.KANIT, FontName.VOLKHOV, FontName.TAVIRAJ, FontName.TRIRONG, FontName.SARABUN, + FontName.PROMPT, FontName.ANDIKA, FontName.BAI_JAMJUREE, FontName.JETBRAINS_MONO); + + private static final List FACES = List.of( + TextDecoration.DEFAULT, TextDecoration.BOLD, TextDecoration.ITALIC, TextDecoration.BOLD_ITALIC); + + /** Width-parity battery: plain text, kerning-prone runs, and sanitize/unencodable cases. */ + private static final List PARITY_STRINGS = List.of( + "The quick brown fox jumps over the lazy dog", + "Ag", + "01234567890", + "Proportional WAVE Type AVA To. 
kerning", + "Em dash — and “smart quotes”  nbsp", + "Arrows → bullet ● emoji 😀 fallback", + " leading and trailing spaces ", + "Mixed CASE punctuation!?.,;: (parens) [brackets]"); + + public static void main(String[] args) throws Exception { + BenchmarkSupport.configureQuietLogging(); + new FontEmbedProbe().run(); + } + + private void run() throws Exception { + enableAllocationMeasurement(); + + List scenarios = List.of( + new Scenario("helvetica (std-14)", List.of(FontName.HELVETICA)), + new Scenario("1 google (Lato)", List.of(FontName.LATO)), + new Scenario("2 google (Lato+Poppins)", List.of(FontName.LATO, FontName.POPPINS)), + new Scenario("3 google (Lato+Poppins+Ubuntu)", + List.of(FontName.LATO, FontName.POPPINS, FontName.UBUNTU))); + + System.out.println("GraphCompose Finding-4 Font-Embed Probe (measurement document)"); + System.out.println("Allocation measurement: " + (allocationSupported() ? "enabled" : "UNAVAILABLE")); + System.out.println("Warm iterations: " + WARMUP_ITERATIONS + ", measured (median): " + MEASURED_ITERATIONS); + System.out.println(); + + // Warm up class-load / JIT / TTF-parse so the measured window reflects the + // steady-state PDType0Font.load embed, not one-time cold-start cost. 
+ for (int i = 0; i < WARMUP_ITERATIONS; i++) { + for (Scenario scenario : scenarios) { + measureOnce(scenario); + } + } + + List results = new ArrayList<>(); + for (Scenario scenario : scenarios) { + long[] allocs = new long[MEASURED_ITERATIONS]; + double[] millis = new double[MEASURED_ITERATIONS]; + for (int i = 0; i < MEASURED_ITERATIONS; i++) { + Sample sample = measureOnce(scenario); + allocs[i] = sample.allocBytes(); + millis[i] = sample.nanos() / 1_000_000.0; + } + results.add(new Result(scenario, medianLong(allocs), medianDouble(millis))); + } + + long baselineAlloc = results.get(0).medianAllocBytes(); + double baselineMs = results.get(0).medianMillis(); + + System.out.printf("%-32s | %12s | %10s | %14s | %10s%n", + "Scenario", "Alloc (KB)", "Time (ms)", "Embed Δalloc", "Embed Δms"); + System.out.println("-".repeat(92)); + for (Result result : results) { + long deltaAlloc = result.medianAllocBytes() - baselineAlloc; + double deltaMs = result.medianMillis() - baselineMs; + boolean isBaseline = result == results.get(0); + System.out.printf("%-32s | %12s | %10.3f | %14s | %10s%n", + result.scenario().label(), + formatKb(result.medianAllocBytes()), + result.medianMillis(), + isBaseline ? "(baseline)" : formatKb(deltaAlloc), + isBaseline ? "—" : "%.3f".formatted(deltaMs)); + } + + System.out.println(); + System.out.println("Embed Δ = scenario − Helvetica baseline = measurement-doc binary embed (the F4 waste)."); + System.out.println("After F4 the per-thread cache absorbs the embed, so warm google rows collapse toward baseline."); + + parityCheck(); + } + + /** + * Proves the F4 change is geometry-neutral: for every binary family and face, + * the measurement-path width must equal the render-path width to the bit. Both + * resolve through the same cached {@link org.apache.fontbox.ttf.TrueTypeFont}, + * so any non-zero delta would mean a real measurement regression. 
+ */ + private void parityCheck() throws Exception { + long comparisons = 0; + double maxAbsDiff = 0.0; + String worst = ""; + + try (PDDocument renderDocument = new PDDocument(); + PdfMeasurementResources measurement = PdfMeasurementResources.open(List.of())) { + // Exactly what PdfFixedLayoutBackend builds: a render library that embeds + // a subset into the (saved) render document. + FontLibrary renderLibrary = PdfFontLibraryFactory.library(renderDocument, List.of()); + TextMeasurementSystem measure = measurement.textMeasurementSystem(); + + for (FontName family : GOOGLE_FAMILIES) { + PdfFont renderFont = renderLibrary.getFont(family, PdfFont.class) + .orElseThrow(() -> new IllegalStateException("missing render font " + family)); + for (TextDecoration face : FACES) { + for (String text : PARITY_STRINGS) { + TextStyle style = new TextStyle(family, 11.0, face, Color.BLACK); + double renderWidth = renderFont.getTextWidth(style, text); + double measureWidth = measure.textWidth(style, text); + double diff = Math.abs(renderWidth - measureWidth); + comparisons++; + if (diff > maxAbsDiff) { + maxAbsDiff = diff; + worst = family + "/" + face + " : \"" + text + "\" (render=" + renderWidth + + ", measure=" + measureWidth + ")"; + } + } + } + } + } + + boolean pass = maxAbsDiff == 0.0; + System.out.println(); + System.out.printf("PARITY: %s — %d comparisons (%d google families x %d faces x %d strings), max|Δwidth| = %s%n", + pass ? 
"PASS (byte-identical render vs measurement)" : "FAIL", + comparisons, GOOGLE_FAMILIES.size(), FACES.size(), PARITY_STRINGS.size(), + maxAbsDiff); + if (!pass) { + System.out.println(" worst: " + worst); + } + } + + private Sample measureOnce(Scenario scenario) throws Exception { + List styles = new ArrayList<>(); + for (FontName fontName : scenario.fonts()) { + styles.add(new TextStyle(fontName, 10.0, TextDecoration.DEFAULT, Color.BLACK)); + } + + long allocBefore = currentThreadAllocatedBytes(); + long t0 = System.nanoTime(); + PdfMeasurementResources resources = PdfMeasurementResources.open(List.of()); + TextMeasurementSystem measurement = resources.textMeasurementSystem(); + double sink = 0; + for (TextStyle style : styles) { + // First width call lazily resolves the family -> loads all 4 faces + // via PDType0Font.load into this throwaway measurement document. + sink += measurement.textWidth(style, SAMPLE); + } + long nanos = System.nanoTime() - t0; + long allocBytes = allocBefore < 0 ? -1 : currentThreadAllocatedBytes() - allocBefore; + + if (sink < 0) { + throw new IllegalStateException("unreachable"); + } + resources.close(); + return new Sample(allocBytes, nanos); + } + + private static void enableAllocationMeasurement() { + try { + if (THREAD_MX.isThreadAllocatedMemorySupported() && !THREAD_MX.isThreadAllocatedMemoryEnabled()) { + THREAD_MX.setThreadAllocatedMemoryEnabled(true); + } + } catch (UnsupportedOperationException ignored) { + // Allocation measurement unsupported; Alloc column reports n/a. 
+ } + } + + private static boolean allocationSupported() { + try { + return THREAD_MX.isThreadAllocatedMemorySupported() && THREAD_MX.isThreadAllocatedMemoryEnabled(); + } catch (UnsupportedOperationException ex) { + return false; + } + } + + private static long currentThreadAllocatedBytes() { + if (!allocationSupported()) { + return -1; + } + return THREAD_MX.getCurrentThreadAllocatedBytes(); + } + + private static long medianLong(long[] values) { + long[] copy = values.clone(); + Arrays.sort(copy); + return copy[copy.length / 2]; + } + + private static double medianDouble(double[] values) { + double[] copy = values.clone(); + Arrays.sort(copy); + return copy[copy.length / 2]; + } + + private static String formatKb(long bytes) { + return bytes < 0 ? "n/a" : "%.1f".formatted(bytes / 1024.0); + } + + private record Scenario(String label, List fonts) { + } + + private record Sample(long allocBytes, long nanos) { + } + + private record Result(Scenario scenario, long medianAllocBytes, double medianMillis) { + } +} diff --git a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java index 8a6ee0278..781e06125 100644 --- a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java +++ b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLibraryFactory.java @@ -6,6 +6,7 @@ import com.demcha.compose.font.FontFamilyDefinition; import com.demcha.compose.font.FontLibrary; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.font.PDType0Font; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; @@ -35,7 +36,7 @@ private PdfFontLibraryFactory() { */ public static FontLibrary standardLibrary() { FontLibrary fontLibrary = new FontLibrary(); - DefaultFonts.standardFamilies().forEach(definition -> register(fontLibrary, null, definition)); + 
DefaultFonts.standardFamilies().forEach(definition -> register(fontLibrary, null, definition, false)); return fontLibrary; } @@ -57,19 +58,45 @@ public static FontLibrary library(PDDocument document) { * @return PDF-backed font library */ public static FontLibrary library(PDDocument document, Collection customFamilies) { + return buildLibrary(document, customFamilies, false); + } + + /** + * Creates a measurement-only font library. + * + *

Binary families resolve to per-thread cached measurement fonts instead of + * embedding a fresh subset into a throwaway {@link PDDocument} on every session + * (Finding 4: the measurement document is discarded, so its embed was pure + * waste). Standard-14 families are unaffected — they never embed. The resolved + * font metrics are byte-identical to the render library, so layout geometry is + * unchanged.

+ * + * @param customFamilies document-local custom font families + * @return measurement-backed font library that needs no owning document + */ + public static FontLibrary measurementLibrary(Collection customFamilies) { + return buildLibrary(null, customFamilies, true); + } + + private static FontLibrary buildLibrary(PDDocument document, + Collection customFamilies, + boolean measurement) { FontLibrary fontLibrary = new FontLibrary(); for (FontFamilyDefinition definition : DefaultFonts.bundledFamilies()) { - register(fontLibrary, document, definition); + register(fontLibrary, document, definition, measurement); } for (FontFamilyDefinition definition : customFamilies) { - register(fontLibrary, document, definition); + register(fontLibrary, document, definition, measurement); } return fontLibrary; } - private static void register(FontLibrary library, PDDocument document, FontFamilyDefinition definition) { + private static void register(FontLibrary library, + PDDocument document, + FontFamilyDefinition definition, + boolean measurement) { Objects.requireNonNull(library, "library"); Objects.requireNonNull(definition, "definition"); @@ -82,15 +109,12 @@ private static void register(FontLibrary library, PDDocument document, FontFamil definition.fontSourceSet().ifPresent(sources -> library.addFontFactory(definition.name(), PdfFont.class, () -> { - PDDocument owner = Objects.requireNonNull( - document, - "A PDF document is required to load binary font family " + definition.name()); try { return new PdfFont( - PdfFontLoader.loadFont(owner, sources.regular().openStream(), sources.regular().description()), - PdfFontLoader.loadFont(owner, sources.bold().openStream(), sources.bold().description()), - PdfFontLoader.loadFont(owner, sources.italic().openStream(), sources.italic().description()), - PdfFontLoader.loadFont(owner, sources.boldItalic().openStream(), sources.boldItalic().description())); + loadBinaryFace(measurement, document, definition, sources.regular()), + 
loadBinaryFace(measurement, document, definition, sources.bold()), + loadBinaryFace(measurement, document, definition, sources.italic()), + loadBinaryFace(measurement, document, definition, sources.boldItalic())); } catch (IOException e) { throw new IllegalStateException("Unable to register PDF font family " + definition.name(), e); } @@ -99,6 +123,24 @@ private static void register(FontLibrary library, PDDocument document, FontFamil library.addFontFactory(definition.name(), WordFont.class, () -> new WordFont(definition.wordFamily())); } + /** + * Loads a single binary face for either the render path (subset embedded into + * {@code document}) or the measurement path (per-thread cached, doc-independent + * metrics). Both observe the same parsed font program, so metrics match. + */ + private static PDType0Font loadBinaryFace(boolean measurement, + PDDocument document, + FontFamilyDefinition definition, + FontFamilyDefinition.FontBinarySource source) throws IOException { + if (measurement) { + return PdfFontLoader.loadMeasurementFont(source.openStream(), source.description()); + } + PDDocument owner = Objects.requireNonNull( + document, + "A PDF document is required to load binary font family " + definition.name()); + return PdfFontLoader.loadFont(owner, source.openStream(), source.description()); + } + private static Standard14Fonts.FontName standardFont(String name) { return Standard14Fonts.FontName.valueOf(name); } diff --git a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java index 2417c1429..4921c3a38 100644 --- a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java +++ b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfFontLoader.java @@ -9,6 +9,7 @@ import java.io.IOException; import java.io.InputStream; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; import 
java.util.concurrent.ConcurrentHashMap; @@ -41,32 +42,110 @@ protected boolean removeEldestEntry(Map.Entry eldest) { } }); + /** + * Per-thread, never-saved document that owns measurement-only embedded fonts. + * + *

The layout pipeline reads glyph widths, vertical metrics and glyph + * coverage from a real {@link PDType0Font}. Those answers are derived from the + * parsed {@link TrueTypeFont} (advance widths, descriptor tables, cmap) and do + * not depend on which document owns the font, so a single reusable owner per + * thread produces byte-identical metrics to the per-render embed. The document + * is never saved, so the deferred subset build never runs; it only accumulates + * the bounded set of distinct font faces touched on the thread.

+ */ + private static final ThreadLocal THREAD_LOCAL_MEASUREMENT_DOCUMENT = + ThreadLocal.withInitial(PDDocument::new); + + /** + * Per-thread cache of measurement-only fonts keyed by source description, + * bound to {@link #THREAD_LOCAL_MEASUREMENT_DOCUMENT}. + * + *

Deliberately uncapped, unlike {@link #THREAD_LOCAL_TTF_CACHE}. + * Evicting an entry would not free anything: the {@link PDType0Font} stays + * registered in the never-pruned measurement document, and the next use of + * that face would {@code PDType0Font.load} a second copy into the same + * document — so an LRU here grows the document on every evict/reload instead of + * bounding it. Loading each face exactly once per thread keeps the document at + * one font per distinct face, which is the real bound (≈ the bundled face count + * plus any custom faces the thread touches).

+ */ + private static final ThreadLocal> THREAD_LOCAL_MEASUREMENT_FONT_CACHE = + ThreadLocal.withInitial(HashMap::new); + private PdfFontLoader() { } + /** + * Loads a binary font and embeds a fresh subset into {@code document}. Used by + * the render path, where the font program is written when the document is + * saved. + */ static PDType0Font loadFont(PDDocument document, InputStream inputStream, String sourceDescription) { try (InputStream streamToClose = inputStream) { - byte[] fontBytes = RAW_FONT_CACHE.computeIfAbsent(sourceDescription, key -> { - try { - return streamToClose.readAllBytes(); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - - TrueTypeFont ttf = THREAD_LOCAL_TTF_CACHE.get().computeIfAbsent(sourceDescription, key -> { - try { - RandomAccessReadBuffer buffer = new RandomAccessReadBuffer(fontBytes); - return new TTFParser().parse(buffer); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - + TrueTypeFont ttf = resolveTrueTypeFont(streamToClose, sourceDescription); return PDType0Font.load(document, ttf, true); } catch (IOException e) { log.error("Unable to load font from {}", sourceDescription, e); throw new RuntimeException(e); } } + + /** + * Loads a binary font for the measurement pipeline. + * + *

Unlike {@link #loadFont(PDDocument, InputStream, String)} — which embeds a + * fresh subset into the saved render document on every render — this returns a + * per-thread cached {@link PDType0Font} bound to a reusable, never-saved + * measurement document. Width, vertical-metric and glyph-coverage answers are + * derived from the parsed {@link TrueTypeFont} and are therefore byte-identical + * to the render font, so layout geometry is unchanged; the only difference is + * that the embed cost is paid once per thread instead of once per + * {@code DocumentSession} (Finding 4: the measurement document was discarded, + * so its embed was pure waste).

+ * + * @param inputStream font data stream (closed by this method) + * @param sourceDescription stable identity used as the cache key + * @return a reusable measurement font for the current thread + */ + static PDType0Font loadMeasurementFont(InputStream inputStream, String sourceDescription) { + try (InputStream streamToClose = inputStream) { + Map measurementFonts = THREAD_LOCAL_MEASUREMENT_FONT_CACHE.get(); + PDType0Font cached = measurementFonts.get(sourceDescription); + if (cached != null) { + return cached; + } + + TrueTypeFont ttf = resolveTrueTypeFont(streamToClose, sourceDescription); + PDType0Font measurementFont = PDType0Font.load(THREAD_LOCAL_MEASUREMENT_DOCUMENT.get(), ttf, true); + measurementFonts.put(sourceDescription, measurementFont); + return measurementFont; + } catch (IOException e) { + log.error("Unable to load measurement font from {}", sourceDescription, e); + throw new RuntimeException(e); + } + } + + /** + * Resolves the parsed {@link TrueTypeFont} for a source, reusing the shared raw + * byte cache and the per-thread parsed-font cache. Shared by the render and + * measurement load paths so both observe identical font programs. 
+ */ + private static TrueTypeFont resolveTrueTypeFont(InputStream streamToClose, String sourceDescription) { + byte[] fontBytes = RAW_FONT_CACHE.computeIfAbsent(sourceDescription, key -> { + try { + return streamToClose.readAllBytes(); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + return THREAD_LOCAL_TTF_CACHE.get().computeIfAbsent(sourceDescription, key -> { + try { + RandomAccessReadBuffer buffer = new RandomAccessReadBuffer(fontBytes); + return new TTFParser().parse(buffer); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } } diff --git a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfMeasurementResources.java b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfMeasurementResources.java index 780c9b383..44797bbce 100644 --- a/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfMeasurementResources.java +++ b/src/main/java/com/demcha/compose/document/backend/fixed/pdf/PdfMeasurementResources.java @@ -5,7 +5,6 @@ import com.demcha.compose.engine.render.pdf.PdfFont; import com.demcha.compose.font.FontFamilyDefinition; import com.demcha.compose.font.FontLibrary; -import org.apache.pdfbox.pdmodel.PDDocument; import java.util.Collection; @@ -17,29 +16,30 @@ * until GraphCompose has a backend-neutral font measurement implementation.

*/ public final class PdfMeasurementResources implements AutoCloseable { - private final PDDocument document; private final FontLibrary fontLibrary; private final TextMeasurementSystem textMeasurementSystem; - private PdfMeasurementResources(PDDocument document, - FontLibrary fontLibrary, + private PdfMeasurementResources(FontLibrary fontLibrary, TextMeasurementSystem textMeasurementSystem) { - this.document = document; this.fontLibrary = fontLibrary; this.textMeasurementSystem = textMeasurementSystem; } /** - * Opens a fresh measurement document and resolves built-in plus custom fonts. + * Resolves built-in plus custom fonts for the measurement pipeline. + * + *

Binary families resolve to per-thread cached measurement fonts rather than + * embedding a subset into a throwaway PDF document (Finding 4), so these + * resources own no {@link org.apache.pdfbox.pdmodel.PDDocument}. Measured + * metrics are byte-identical to those of the render font library.

* * @param customFontFamilies document-local font families * @return owned measurement resources */ public static PdfMeasurementResources open(Collection customFontFamilies) { - PDDocument document = new PDDocument(); - FontLibrary fontLibrary = PdfFontLibraryFactory.library(document, customFontFamilies); + FontLibrary fontLibrary = PdfFontLibraryFactory.measurementLibrary(customFontFamilies); TextMeasurementSystem measurement = new FontLibraryTextMeasurementSystem(fontLibrary, PdfFont.class); - return new PdfMeasurementResources(document, fontLibrary, measurement); + return new PdfMeasurementResources(fontLibrary, measurement); } /** @@ -63,6 +63,5 @@ public TextMeasurementSystem textMeasurementSystem() { @Override public void close() throws Exception { textMeasurementSystem.clearCaches(); - document.close(); } } diff --git a/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java b/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java index 22c9fbb11..6209fbe8e 100644 --- a/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java +++ b/src/main/java/com/demcha/compose/engine/render/pdf/GlyphFallbackLogger.java @@ -4,6 +4,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -40,6 +41,24 @@ public final class GlyphFallbackLogger { */ private static final Set SEEN = ConcurrentHashMap.newKeySet(); + /** + * Per-font glyph-coverage memo: a font's PostScript base name to the set of + * code points it can ({@code true}) or cannot ({@code false}) encode. + * + *

Glyph coverage is an immutable property of the loaded font program, so + * the first {@link PDFont#encode(String)} result for a {@code (font, code + * point)} pair holds for the lifetime of the process. Memoizing it turns the + * heavy probe — which also throws an exception for every unencodable glyph — + * into a map lookup, so {@code encode} runs once per distinct + * {@code (font, code point)} instead of once per glyph occurrence on every + * measurement and render pass. Two {@code PDType0Font} instances of the same + * embedded font share a base name (the subset prefix is only added at save, + * after sanitisation), so the measurement font and each render font reuse the + * same memo. Because the memo is keyed by that name rather than by the font + * instance, two different font programs that report the same name would also + * share each other's entries; the memo therefore relies on font names being + * unique per font program within the process. Its size is bounded in practice by + * (distinct fonts × distinct code points actually drawn).

+ */ + private static final Map> ENCODABLE_BY_FONT = new ConcurrentHashMap<>(); + private GlyphFallbackLogger() { } @@ -53,7 +72,7 @@ private GlyphFallbackLogger() { * @param codePoint the Unicode code point that was substituted */ public static void report(PDFont font, int codePoint) { - String fontName = font != null ? font.getName() : ""; + String fontName = fontKey(font); long key = ((long) fontName.hashCode() << 32) | (codePoint & 0xFFFFFFFFL); if (SEEN.add(key)) { LOG.warn("glyph.missing font={} codePoint=U+{} replaced='?'", @@ -82,34 +101,54 @@ public static String sanitize(PDFont font, String text) { if (text == null || text.isEmpty()) { return text == null ? "" : text; } + Map coverage = ENCODABLE_BY_FONT.computeIfAbsent(fontKey(font), key -> new ConcurrentHashMap<>()); StringBuilder sb = new StringBuilder(text.length()); - text.codePoints().forEach(cp -> { - if (cp == '\n' || cp == '\r') return; - String ch = new String(Character.toChars(cp)); - if (canEncode(font, ch)) { - sb.append(ch); + int length = text.length(); + for (int offset = 0; offset < length; ) { + int codePoint = text.codePointAt(offset); + offset += Character.charCount(codePoint); + if (codePoint == '\n' || codePoint == '\r') { + continue; + } + if (isEncodable(font, coverage, codePoint)) { + sb.appendCodePoint(codePoint); } else { - report(font, cp); + report(font, codePoint); sb.append('?'); } - }); + } return sb.toString(); } - private static boolean canEncode(PDFont font, String ch) { + private static boolean isEncodable(PDFont font, Map coverage, int codePoint) { + Boolean cached = coverage.get(codePoint); + if (cached != null) { + return cached; + } + boolean encodable = canEncode(font, codePoint); + coverage.put(codePoint, encodable); + return encodable; + } + + private static boolean canEncode(PDFont font, int codePoint) { try { - font.encode(ch); + font.encode(new String(Character.toChars(codePoint))); return true; } catch (Exception e) { return false; } } + private static String 
fontKey(PDFont font) { + return font != null ? font.getName() : ""; + } + /** * Visible for tests. Clears the deduplication cache so a fresh test * can assert on the warn sequence without process restart. */ static void resetForTesting() { SEEN.clear(); + ENCODABLE_BY_FONT.clear(); } } diff --git a/src/test/java/com/demcha/compose/document/backend/fixed/pdf/MeasurementFontParityTest.java b/src/test/java/com/demcha/compose/document/backend/fixed/pdf/MeasurementFontParityTest.java new file mode 100644 index 000000000..79b541be3 --- /dev/null +++ b/src/test/java/com/demcha/compose/document/backend/fixed/pdf/MeasurementFontParityTest.java @@ -0,0 +1,84 @@ +package com.demcha.compose.document.backend.fixed.pdf; + +import static org.assertj.core.api.Assertions.assertThat; + +import com.demcha.compose.engine.components.content.text.TextDecoration; +import com.demcha.compose.engine.components.content.text.TextStyle; +import com.demcha.compose.engine.render.pdf.PdfFont; +import com.demcha.compose.font.FontLibrary; +import com.demcha.compose.font.FontName; + +import java.awt.Color; +import java.util.List; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Test; + +/** + * Guards Finding 4 (measurement no longer embeds binary fonts into a throwaway + * document). + * + *

<p>Measurement resolves binary families to a per-thread cached, document-free + * {@code PDType0Font}; the render path embeds a fresh subset into the saved + * document. Both must report byte-identical glyph widths — they read the + * same parsed {@code TrueTypeFont}, so any drift here would silently move layout + * geometry. This is the permanent CI counterpart to the manual + * {@code FontEmbedProbe} width-parity check in the benchmarks module.</p>

+ */ +class MeasurementFontParityTest { + + /** Every bundled binary (Google) family — the ones that actually embed. */ + private static final List BINARY_FAMILIES = List.of( + FontName.LATO, FontName.PT_SANS, FontName.PT_SERIF, FontName.FIRA_SANS, FontName.UBUNTU, + FontName.ALEGREYA_SANS, FontName.CARLITO, FontName.POPPINS, FontName.BARLOW, + FontName.BARLOW_CONDENSED, FontName.ASAP_CONDENSED, FontName.ARSENAL, FontName.IBM_PLEX_SERIF, + FontName.IBM_PLEX_MONO, FontName.CRIMSON_TEXT, FontName.SPECTRAL, FontName.ZILLA_SLAB, + FontName.GENTIUM_PLUS, FontName.TINOS, FontName.COUSINE, FontName.FIRA_SANS_CONDENSED, + FontName.KANIT, FontName.VOLKHOV, FontName.TAVIRAJ, FontName.TRIRONG, FontName.SARABUN, + FontName.PROMPT, FontName.ANDIKA, FontName.BAI_JAMJUREE, FontName.JETBRAINS_MONO); + + private static final List FACES = List.of( + TextDecoration.DEFAULT, TextDecoration.BOLD, TextDecoration.ITALIC, TextDecoration.BOLD_ITALIC); + + private static final List STRINGS = List.of( + "The quick brown fox jumps over the lazy dog WAVE AVA To.", + "Em dash — “smart quotes”  nbsp", // standard sanitize cleanup + "Arrows → bullet ● emoji 😀 fallback"); // unencodable code points -> '?' + + @Test + void measurementWidthsMatchRenderWidthsForEveryBinaryFamily() throws Exception { + try (PDDocument renderDocument = new PDDocument(); + PdfMeasurementResources measurement = PdfMeasurementResources.open(List.of())) { + // Exactly what PdfFixedLayoutBackend builds: a render library that embeds + // a fresh subset into the (saved) render document. 
+ FontLibrary renderLibrary = PdfFontLibraryFactory.library(renderDocument, List.of()); + + for (FontName family : BINARY_FAMILIES) { + PdfFont renderFont = renderLibrary.getFont(family, PdfFont.class) + .orElseThrow(() -> new AssertionError("render font missing for " + family)); + for (TextDecoration face : FACES) { + for (String text : STRINGS) { + TextStyle style = new TextStyle(family, 11.0, face, Color.BLACK); + double renderWidth = renderFont.getTextWidth(style, text); + double measurementWidth = measurement.textMeasurementSystem().textWidth(style, text); + + assertThat(measurementWidth) + .describedAs("measurement vs render width parity: %s / %s / \"%s\"", family, face, text) + .isEqualTo(renderWidth); + } + } + } + } + } + + @Test + void measurementLibraryResolvesBinaryFamiliesWithoutOwningDocument() { + // F4 contract: a measurement library embeds nothing into a document and so + // needs none to resolve a binary family. + FontLibrary measurementLibrary = PdfFontLibraryFactory.measurementLibrary(List.of()); + + assertThat(measurementLibrary.getFont(FontName.LATO, PdfFont.class)) + .describedAs("binary family resolves through the document-free measurement library") + .isPresent(); + } +} diff --git a/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java b/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java index 4ac014c31..948f41647 100644 --- a/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java +++ b/src/test/java/com/demcha/compose/engine/render/pdf/PdfFontSanitizerTest.java @@ -1,11 +1,14 @@ package com.demcha.compose.engine.render.pdf; import com.demcha.compose.engine.components.content.text.TextStyle; +import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.IOException; + import static 
org.assertj.core.api.Assertions.assertThat; /** @@ -126,4 +129,75 @@ void sanitizeByFont_directlyReplacesUnsupportedGlyphsOnly() { assertThat(output).isEqualTo("ok ? then"); } + + @Test + void coverageMemo_probesEachDistinctGlyphOnceAcrossRepeatedSanitisation() { + // Finding 3: glyph coverage is memoized per (font, code point), so the + // heavy PDFont.encode probe runs once per distinct glyph instead of once + // per occurrence on every measurement/render pass. + GlyphFallbackLogger.resetForTesting(); + EncodeCountingFont font = new EncodeCountingFont(); + + // "banana banana" repeats only four distinct code points: b, a, n, space. + String first = GlyphFallbackLogger.sanitize(font, "banana banana"); + int probesAfterFirst = font.encodeCalls(); + + String second = GlyphFallbackLogger.sanitize(font, "banana banana"); + int probesAfterSecond = font.encodeCalls(); + + assertThat(first).isEqualTo("banana banana"); + assertThat(second).isEqualTo("banana banana"); + assertThat(probesAfterFirst) + .describedAs("encode probed once per distinct (font, code point), not per occurrence") + .isEqualTo(4); + assertThat(probesAfterSecond - probesAfterFirst) + .describedAs("re-sanitising the same glyphs adds no encode probes") + .isZero(); + } + + @Test + void coverageMemo_probesUnencodableGlyphOnceThenReusesSubstitution() { + // The cache remembers negatives too: a missing glyph is probed once, then + // every later occurrence is a cache hit that still substitutes '?'. + GlyphFallbackLogger.resetForTesting(); + EncodeCountingFont font = new EncodeCountingFont(); + + String first = GlyphFallbackLogger.sanitize(font, "a●b●c●"); // ● = U+25CF, unencodable + int probesAfterFirst = font.encodeCalls(); + + String second = GlyphFallbackLogger.sanitize(font, "●●●"); + int probesAfterSecond = font.encodeCalls(); + + assertThat(first).isEqualTo("a?b?c?"); + assertThat(second).isEqualTo("???"); + // Distinct code points in "a●b●c●": a, ●, b, c = four probes. 
+ assertThat(probesAfterFirst).isEqualTo(4); + assertThat(probesAfterSecond - probesAfterFirst) + .describedAs("the unencodable glyph is probed once, then served from cache") + .isZero(); + } + + /** + * Test-only Helvetica that counts how often the glyph sanitizer probes the + * font, so the memo tests can assert probe counts with no instrumentation in + * the production {@link GlyphFallbackLogger}. {@link PDFont#encode(int)} is the + * per-code-point hook the sanitizer reaches through {@code encode(String)}. + */ + private static final class EncodeCountingFont extends PDType1Font { + private int encodeCalls; + + EncodeCountingFont() { + super(Standard14Fonts.FontName.HELVETICA); + } + + @Override + protected byte[] encode(int code) throws IOException { + encodeCalls++; + return super.encode(code); + } + + int encodeCalls() { + return encodeCalls; + } + } }