From 8ef69955d7c8478e277f3ee1e16b0f0c8fa1fae1 Mon Sep 17 00:00:00 2001 From: Vasyl Vdovychenko Date: Sat, 23 May 2026 13:37:07 -0400 Subject: [PATCH 1/2] =?UTF-8?q?fix(pdf):=20bullet=20=E2=86=92=20new=20para?= =?UTF-8?q?graph,=20drop=20TOC=20chapter,=20allow=20re-extract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two visible bugs reported on a fresh PDF upload: - Bullet lists in the Preface got concatenated into one wall of text — "• You're building... • You want to... • Tool developers..." all in a single run. PdfPageTextExtractor.GroupLinesIntoParagraphs only split on vertical y-gap, and tightly-spaced lists don't have one. - The "Table of Contents" chapter showed up as a single dense run of leader-dotted entries. It isn't rendered readably anywhere, and the in-app TOC is already built from the chapter list itself. Changes: - Add BulletGlyphs set + StartsWithBulletGlyph check in GroupLinesIntoParagraphs — a line whose first word is •, ●, ▪, ◦, ○, ‣, ⁃ (etc.) forces a new paragraph regardless of y-gap. - New FrontMatterFilter.IsTableOfContents (anchored regex, en + ru/uk + a few EU languages, tolerates trailing page-number bookmarks). - PdfTextExtractor skips the chapter when the bookmark title matches. - UserBookService.RetryAsync now also accepts Ready (not just Failed) so existing books can pick up extraction improvements without a delete+reupload roundtrip. UserIngestionService already wipes old chapters before re-extracting, so this is safe. - UserBookDetailPage gets a small "Re-extract" icon button next to delete (single click, no confirm — reversible). Tests: 36/36 in TextStack.Extraction.Tests pass — 14 new across FrontMatterFilterTests + PdfPageTextExtractorTests covering bullet glyph detection and TOC title matching. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/web/src/pages/UserBookDetailPage.tsx | 33 ++++++++++++++- apps/web/src/styles/books.css | 32 +++++++++++++++ .../Application/UserBooks/UserBookService.cs | 8 +++- .../Extractors/FrontMatterFilter.cs | 29 +++++++++++++ .../Extractors/Pdf/PdfPageTextExtractor.cs | 32 ++++++++++++++- .../Extractors/PdfTextExtractor.cs | 12 ++++++ .../FrontMatterFilterTests.cs | 41 +++++++++++++++++++ .../PdfPageTextExtractorTests.cs | 32 +++++++++++++++ 8 files changed, 215 insertions(+), 4 deletions(-) create mode 100644 backend/src/Extraction/TextStack.Extraction/Extractors/FrontMatterFilter.cs create mode 100644 tests/TextStack.Extraction.Tests/FrontMatterFilterTests.cs create mode 100644 tests/TextStack.Extraction.Tests/PdfPageTextExtractorTests.cs diff --git a/apps/web/src/pages/UserBookDetailPage.tsx b/apps/web/src/pages/UserBookDetailPage.tsx index 97799768..d5ea0473 100644 --- a/apps/web/src/pages/UserBookDetailPage.tsx +++ b/apps/web/src/pages/UserBookDetailPage.tsx @@ -2,7 +2,7 @@ import { useState, useEffect, useMemo, useRef } from 'react' import { useParams, Link, useNavigate } from 'react-router-dom' import { useAuth } from '../context/AuthContext' import { useLanguage } from '../context/LanguageContext' -import { getUserBook, deleteUserBook, markUserBookComplete, unmarkUserBookComplete, getUserBookCoverUrl, type UserBookDetail } from '../api/userBooks' +import { getUserBook, deleteUserBook, retryUserBook, markUserBookComplete, unmarkUserBookComplete, getUserBookCoverUrl, type UserBookDetail } from '../api/userBooks' import { SeoHead } from '../components/SeoHead' import { Footer } from '../components/Footer' import { stringToColor } from '../utils/colors' @@ -27,6 +27,7 @@ export function UserBookDetailPage() { const [loading, setLoading] = useState(true) const [error, setError] = useState(null) const [deleting, setDeleting] = useState(false) + const [reprocessing, setReprocessing] = useState(false) // Inline 
two-stage confirm — mirrors VocabularyPage's delete pattern. // First click flips the icon button into a red "Confirm?" pill; a second // click within 3 s actually deletes. Avoids the browser-native confirm() @@ -297,6 +298,36 @@ export function UserBookDetailPage() { /> )} + {isReady && ( + + )} + {isReady && (