mrviduus · mrviduus · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/apps/web/src/pages/UserBookDetailPage.tsx b/apps/web/src/pages/UserBookDetailPage.tsx
@@ -2,7 +2,7 @@ import { useState, useEffect, useMemo, useRef } from 'react'
 import { useParams, Link, useNavigate } from 'react-router-dom'
 import { useAuth } from '../context/AuthContext'
 import { useLanguage } from '../context/LanguageContext'
-import { getUserBook, deleteUserBook, markUserBookComplete, unmarkUserBookComplete, getUserBookCoverUrl, type UserBookDetail } from '../api/userBooks'
+import { getUserBook, deleteUserBook, retryUserBook, markUserBookComplete, unmarkUserBookComplete, getUserBookCoverUrl, type UserBookDetail } from '../api/userBooks'
 import { SeoHead } from '../components/SeoHead'
 import { Footer } from '../components/Footer'
 import { stringToColor } from '../utils/colors'
@@ -27,6 +27,7 @@ export function UserBookDetailPage() {
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState<string | null>(null)
   const [deleting, setDeleting] = useState(false)
+  const [reprocessing, setReprocessing] = useState(false)
   // Inline two-stage confirm — mirrors VocabularyPage's delete pattern.
   // First click flips the icon button into a red "Confirm?" pill; a second
   // click within 3 s actually deletes. Avoids the browser-native confirm()
@@ -297,6 +298,36 @@ export function UserBookDetailPage() {
               />
             )}
 
+            {isReady && (
+              <button
+                type="button"
+                onClick={async () => {
+                  if (!id || reprocessing) return
+                  setReprocessing(true)
+                  try {
+                    await retryUserBook(id)
+                    // Flip to Processing so auto-refresh kicks in (spinner, not stale Ready).
+                    // Updater form: the `book` captured here may be stale after the await.
+                    setBook((prev) => (prev ? { ...prev, status: 'Processing' } : prev))
+                  } catch (err) {
+                    setError(err instanceof Error ? err.message : 'Failed to reprocess')
+                  } finally {
+                    setReprocessing(false)
+                  }
+                }}
+                disabled={reprocessing}
+                className="user-book-detail__reextract-icon"
+                aria-label={reprocessing ? 'Reprocessing…' : 'Re-extract this book (pick up extraction improvements)'}
+              >
+                <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" aria-hidden="true">
+                  <polyline points="23 4 23 10 17 10" />
+                  <polyline points="1 20 1 14 7 14" />
+                  <path d="M3.51 9a9 9 0 0 1 14.85-3.36L23 10" />
+                  <path d="M20.49 15a9 9 0 0 1-14.85 3.36L1 14" />
+                </svg>
+              </button>
+            )}
+
             {isReady && (
               <button
                 type="button"

diff --git a/apps/web/src/styles/books.css b/apps/web/src/styles/books.css
@@ -4662,6 +4662,38 @@ html.dark .add-to-collection-button--icon[aria-label]::after {
   background: rgba(0, 0, 0, 0.04);
 }
 
+/* Re-extract icon: same neutral shape as delete-idle. Single click, no confirm — the worst
+   case is a few minutes of processing, fully reversible. Spin honours reduced-motion. */
+.user-book-detail__reextract-icon {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  width: 36px;
+  height: 36px;
+  padding: 0;
+  border-radius: 18px;
+  background: transparent;
+  border: 1px solid var(--color-border, #E8E2D9);
+  color: var(--color-text-muted, #6b6b6b);
+  cursor: pointer;
+  transition: background 0.15s ease, color 0.15s ease, border-color 0.15s ease;
+}
+.user-book-detail__reextract-icon:hover:not(:disabled) {
+  color: var(--color-accent, #2563eb);
+  border-color: var(--color-accent, #2563eb);
+  background: rgba(37, 99, 235, 0.06);
+}
+.user-book-detail__reextract-icon:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+@media (prefers-reduced-motion: no-preference) {
+  .user-book-detail__reextract-icon:disabled svg {
+    animation: user-book-detail__spin 1s linear infinite;
+  }
+}
+@keyframes user-book-detail__spin { to { transform: rotate(360deg); } }
+
 /* Trash icon in the primary action row. Idle styling matches the neighbour
    icon buttons (Add-to-collection, Link, Share) — neutral border so the
    destructive action doesn't shout. Two-stage confirm (see *--confirming

diff --git a/backend/src/Application/UserBooks/UserBookService.cs b/backend/src/Application/UserBooks/UserBookService.cs
@@ -327,8 +327,12 @@ public async Task<IReadOnlyList<UserBookListDto>> GetBooksAsync(Guid userId, Can
         if (book is null)
             return (false, "Book not found");
 
-        if (book.Status != UserBookStatus.Failed)
-            return (false, "Only failed books can be retried");
+        // Allow retrying Failed (original behaviour) AND re-extracting Ready
+        // books — extractor improvements (e.g. bullet paragraph split, TOC
+        // drop) should be reachable without a delete+reupload roundtrip.
+        // Processing is excluded so we don't queue duplicate jobs.
+        if (book.Status != UserBookStatus.Failed && book.Status != UserBookStatus.Ready)
+            return (false, $"Cannot reprocess book in status {book.Status}");
 
         var bookFile = book.BookFiles.FirstOrDefault();
         if (bookFile is null)

diff --git a/backend/src/Extraction/TextStack.Extraction/Extractors/FrontMatterFilter.cs b/backend/src/Extraction/TextStack.Extraction/Extractors/FrontMatterFilter.cs
@@ -0,0 +1,29 @@
+using System.Text.RegularExpressions;
+
+namespace TextStack.Extraction.Extractors;
+
+/// <summary>
+/// Identifies low-value front-matter chapter titles that should be dropped at
+/// extraction time. Right now: only "Table of Contents" (and translated
+/// variants) — TOC chapters from PDFs come out as a flat run of leader-dotted
+/// entries with no usable line breaks; they add nothing to the in-app TOC
+/// (which we generate from the chapter list itself) and just waste a click.
+/// Kept narrow on purpose — adding e.g. "Copyright" or "Index" here would
+/// silently drop content some users want to keep.
+/// </summary>
+public static class FrontMatterFilter
+{
+    // CultureInvariant: casing must not follow the host culture (tr-TR breaks 'I'/'i').
+    private static readonly Regex TocTitle = new(
+        @"^\s*(table\s+of\s+contents?|contents|toc|" +
+        @"оглавление|содержание|зміст|" +
+        @"sommaire|inhaltsverzeichnis|índice|indice|sumário)\s*$",
+        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);
+
+    /// <summary>True when <paramref name="title"/> is a TOC heading, ignoring a trailing page number.</summary>
+    public static bool IsTableOfContents(string? title)
+    {
+        if (string.IsNullOrWhiteSpace(title)) return false;
+        return TocTitle.IsMatch(Regex.Replace(title, @"\s*\d+\s*$", "").Trim());
+    }
+}
diff --git a/backend/src/Extraction/TextStack.Extraction/Extractors/Pdf/PdfPageTextExtractor.cs b/backend/src/Extraction/TextStack.Extraction/Extractors/Pdf/PdfPageTextExtractor.cs
@@ -14,7 +14,18 @@ namespace TextStack.Extraction.Extractors.Pdf;
 public static class PdfPageTextExtractor
 {
     private const double LineYTolerance = 3.0;
-    private const double ParagraphGapMultiplier = 1.5;
+    // Paragraph break = median line gap × multiplier. Median (not mean) so
+    // the actual paragraph gaps in the data don't pull the threshold above
+    // themselves. 1.2 covers O'Reilly-style 1.25× para-spacing and similar;
+    // 1.5 (the old value) missed almost every paragraph break because the
+    // modal spacing is line-spacing and the mean got dragged up by the rare
+    // gaps the algorithm was supposed to find.
+    private const double ParagraphGapMultiplier = 1.2;
+    // First-line indent: a line whose left edge is at least this many points
+    // greater than the page's modal left margin counts as a new paragraph,
+    // independent of y-gap. Most book typography uses an indent rather than
+    // vertical space — y-gap-only detection misses every one of those.
+    private const double IndentMinPoints = 6.0;
     private const double HeadingFontRatio = 0.9;
     private const double MinHeadingFontDifference = 1.5;
     private const int MaxHeadingTextLength = 200;
@@ -26,6 +37,15 @@ public static class PdfPageTextExtractor
         "|", "•", "·", "—", "-", "–", "*", "■", "□", "○", "●", "▪", "▫"
     };
 
+    // Glyphs that, when they're the first word of a line, mean the line is a
+    // bullet-list item — force a new paragraph regardless of vertical gap.
+    // Without this, tightly-spaced lists in PDFs get glued into one giant
+    // paragraph (the "•" loses its line-break role when only y-gap is used).
+    private static readonly HashSet<string> BulletGlyphs = new(StringComparer.Ordinal)
+    {
+        "•", "●", "▪", "■", "◦", "○", "▫", "◆", "‣", "⁃", "►", "❖"
+    };
+
     private static readonly Regex PageNumberPattern = new(@"^\d{1,4}$", RegexOptions.Compiled);
 
     // Running headers from O'Reilly-style tech books take the shape
@@ -199,18 +219,32 @@ private static List<List<List<Word>>> GroupLinesIntoParagraphs(List<List<Word>>
         if (lines.Count == 0)
             return [];
 
-        var lineHeights = new List<double>();
+        var lineGaps = new List<double>();
         for (var i = 1; i < lines.Count; i++)
         {
             var prevY = lines[i - 1].Average(w => w.BoundingBox.Bottom);
             var currY = lines[i].Average(w => w.BoundingBox.Bottom);
             var gap = Math.Abs(prevY - currY);
             if (gap > 0)
-                lineHeights.Add(gap);
+                lineGaps.Add(gap);
         }
 
-        var avgLineHeight = lineHeights.Count > 0 ? lineHeights.Average() : 12.0;
-        var paragraphGapThreshold = avgLineHeight * ParagraphGapMultiplier;
+        // Median, not mean — paragraph gaps in the data must NOT pull the
+        // threshold up to (or above) themselves. The modal gap on a typical
+        // body page is one line-height; that's exactly what we want as the
+        // baseline.
+        var baselineGap = Median(lineGaps, fallback: 12.0);
+        var paragraphGapThreshold = baselineGap * ParagraphGapMultiplier;
+
+        // Modal left margin: rounded so micro-jitter (sub-point alignment
+        // differences) doesn't disqualify a margin from being modal.
+        var leftEdges = lines
+            .Where(l => l.Count > 0)
+            .Select(l => Math.Round(l.Min(w => w.BoundingBox.Left)))
+            .ToList();
+        var baseLeft = leftEdges.Count > 0
+            ? leftEdges.GroupBy(e => e).OrderByDescending(g => g.Count()).First().Key
+            : 0.0;
 
         var paragraphs = new List<List<List<Word>>>();
         var currentParagraph = new List<List<Word>> { lines[0] };
@@ -221,7 +255,11 @@ private static List<List<List<Word>>> GroupLinesIntoParagraphs(List<List<Word>>
             var currY = lines[i].Average(w => w.BoundingBox.Bottom);
             var gap = Math.Abs(prevY - currY);
 
-            if (gap > paragraphGapThreshold)
+            var isYGapBreak = gap > paragraphGapThreshold;
+            var isBulletBreak = StartsWithBulletGlyph(lines[i]);
+            var isIndentBreak = StartsWithIndent(lines[i], baseLeft);
+
+            if (isYGapBreak || isBulletBreak || isIndentBreak)
             {
                 paragraphs.Add(currentParagraph);
                 currentParagraph = [lines[i]];
@@ -236,6 +274,49 @@ private static List<List<List<Word>>> GroupLinesIntoParagraphs(List<List<Word>>
         return paragraphs;
     }
 
+    /// <summary>
+    /// Detects a first-line indent: the leftmost word edge of <paramref name="line"/> lies
+    /// <see cref="IndentMinPoints"/> or more to the right of <paramref name="baseLeft"/>.
+    /// An empty line never counts as indented.
+    /// </summary>
+    /// <remarks>NOTE(review): wrapped lines of a uniformly indented block (hanging list
+    /// items, block quotes) also satisfy this test — confirm that is intended.</remarks>
+    internal static bool StartsWithIndent(List<Word> line, double baseLeft)
+    {
+        return line.Count > 0 && line.Min(w => w.BoundingBox.Left) - baseLeft >= IndentMinPoints;
+    }
+
+    // Median of values (mean of the two middle items when even); fallback if empty.
+    private static double Median(List<double> values, double fallback)
+    {
+        if (values.Count < 1) return fallback;
+        var ordered = new List<double>(values);
+        ordered.Sort();
+        var half = ordered.Count / 2;
+        return (ordered.Count & 1) == 1 ? ordered[half] : (ordered[half - 1] + ordered[half]) / 2.0;
+    }
+
+    /// <summary>
+    /// Reports whether a line opens with a bullet marker (•, ●, ▪, ◦, …), either as
+    /// its own word or fused onto the first word. Lines like "• You're building..."
+    /// should always begin a new paragraph, however small the vertical gap to the
+    /// previous line is.
+    /// </summary>
+    /// <param name="line">Words of one text line; only <c>line[0]</c> is inspected.</param>
+    /// <returns><c>true</c> for a non-empty line whose first word has a bullet prefix.</returns>
+    internal static bool StartsWithBulletGlyph(List<Word> line)
+        => line.Count > 0 && IsBulletPrefix(line[0].Text);
+
+    /// <summary>String-level bullet check backing <see cref="StartsWithBulletGlyph"/>; exposed for tests.</summary>
+    internal static bool IsBulletPrefix(string? text)
+    {
+        // Null and empty tokens can never carry a bullet.
+        if (text is null || text.Length == 0) return false;
+        // Whole-token hit ("•"), or a bullet glued onto the first word ("•You're"):
+        // in the glued case only the leading character is looked up.
+        return BulletGlyphs.Contains(text) || BulletGlyphs.Contains(text.Substring(0, 1));
+    }
+
     private static string GetDominantFontName(List<Word> words)
     {
         var fontCounts = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

diff --git a/backend/src/Extraction/TextStack.Extraction/Extractors/PdfTextExtractor.cs b/backend/src/Extraction/TextStack.Extraction/Extractors/PdfTextExtractor.cs
@@ -142,6 +142,20 @@ private static ExtractionResult ExtractFromDocument(
             var chapter = chapters[chapterIdx];
             var chapterNumber = chapterIdx + 1;
 
+            // Drop TOC chapters at extraction time. PDF TOCs come out as one
+            // dense run of leader-dotted entries and we already build the
+            // reader-side TOC from the chapter list itself. Guard: never drop
+            // the only chapter — a single-chapter book literally titled
+            // "Contents" would otherwise vanish entirely (paranoid edge case
+            // raised in PR #244 bug report).
+            if (chapters.Count > 1 && FrontMatterFilter.IsTableOfContents(chapter.Title))
+            {
+                warnings.Add(new ExtractionWarning(
+                    ExtractionWarningCode.ContentFiltered,
+                    $"Skipped Table of Contents chapter: {chapter.Title}"));
+                continue;
+            }
+
             try
             {
                 // Extract pages for this chapter

diff --git a/tests/TextStack.Extraction.Tests/FrontMatterFilterTests.cs b/tests/TextStack.Extraction.Tests/FrontMatterFilterTests.cs
@@ -0,0 +1,41 @@
+using TextStack.Extraction.Extractors;
+
+namespace TextStack.Extraction.Tests;
+
+public class FrontMatterFilterTests
+{
+    // Positive cases cover every alternative of the TOC pattern (English, Cyrillic
+    // and Western European variants) plus casing, padding and trailing
+    // page-reference forms.
+    [Theory]
+    [InlineData("Table of Contents"), InlineData("table of contents"), InlineData("TABLE OF CONTENTS")]
+    [InlineData("Table of Content")]   // the pattern also accepts the singular form
+    [InlineData("Contents"), InlineData("CONTENTS"), InlineData("TOC")]
+    [InlineData("Оглавление"), InlineData("Содержание"), InlineData("Зміст")]
+    [InlineData("Sommaire"), InlineData("Inhaltsverzeichnis")]
+    [InlineData("Índice"), InlineData("Indice"), InlineData("Sumário")]
+    [InlineData("  Contents  ")]
+    [InlineData("Contents 5")]   // bookmark sometimes carries a page ref
+    [InlineData("Table of Contents 7")]
+    public void IsTableOfContents_Matches_KnownTocTitles(string title)
+    {
+        Assert.True(FrontMatterFilter.IsTableOfContents(title));
+    }
+
+    [Theory]
+    [InlineData("")]
+    [InlineData(null)]
+    [InlineData("Chapter 1")]
+    [InlineData("Introduction")]
+    [InlineData("Acknowledgments")]
+    [InlineData("Index")]
+    [InlineData("Bibliography")]
+    [InlineData("Glossary")]
+    // "Discontent" contains "content" as substring — anchors must reject this.
+    [InlineData("Discontent")]
+    [InlineData("Table Setting")]
+    public void IsTableOfContents_DoesNotMatch_OtherTitles(string? title)
+    {
+        Assert.False(FrontMatterFilter.IsTableOfContents(title));
+    }
+}