Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion apps/web/src/pages/UserBookDetailPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { useState, useEffect, useMemo, useRef } from 'react'
import { useParams, Link, useNavigate } from 'react-router-dom'
import { useAuth } from '../context/AuthContext'
import { useLanguage } from '../context/LanguageContext'
import { getUserBook, deleteUserBook, markUserBookComplete, unmarkUserBookComplete, getUserBookCoverUrl, type UserBookDetail } from '../api/userBooks'
import { getUserBook, deleteUserBook, retryUserBook, markUserBookComplete, unmarkUserBookComplete, getUserBookCoverUrl, type UserBookDetail } from '../api/userBooks'
import { SeoHead } from '../components/SeoHead'
import { Footer } from '../components/Footer'
import { stringToColor } from '../utils/colors'
Expand All @@ -27,6 +27,7 @@ export function UserBookDetailPage() {
const [loading, setLoading] = useState(true)
const [error, setError] = useState<string | null>(null)
const [deleting, setDeleting] = useState(false)
const [reprocessing, setReprocessing] = useState(false)
// Inline two-stage confirm — mirrors VocabularyPage's delete pattern.
// First click flips the icon button into a red "Confirm?" pill; a second
// click within 3 s actually deletes. Avoids the browser-native confirm()
Expand Down Expand Up @@ -297,6 +298,36 @@ export function UserBookDetailPage() {
/>
)}

{isReady && (
<button
type="button"
onClick={async () => {
if (!id || reprocessing) return
setReprocessing(true)
try {
await retryUserBook(id)
// Flip to Processing so the auto-refresh effect kicks in
// and the user sees the spinner instead of stale Ready state.
setBook({ ...book, status: 'Processing' })
} catch (err) {
setError(err instanceof Error ? err.message : 'Failed to reprocess')
} finally {
setReprocessing(false)
}
}}
disabled={reprocessing}
className="user-book-detail__reextract-icon"
aria-label={reprocessing ? 'Reprocessing…' : 'Re-extract this book (pick up extraction improvements)'}
>
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" aria-hidden="true">
<polyline points="23 4 23 10 17 10" />
<polyline points="1 20 1 14 7 14" />
<path d="M3.51 9a9 9 0 0 1 14.85-3.36L23 10" />
<path d="M20.49 15a9 9 0 0 1-14.85 3.36L1 14" />
</svg>
</button>
)}

{isReady && (
<button
type="button"
Expand Down
32 changes: 32 additions & 0 deletions apps/web/src/styles/books.css
Original file line number Diff line number Diff line change
Expand Up @@ -4662,6 +4662,38 @@ html.dark .add-to-collection-button--icon[aria-label]::after {
background: rgba(0, 0, 0, 0.04);
}

/* Re-extract icon: same neutral shape as delete-idle. Single click, no
   confirm — the worst case is a few minutes of processing, fully reversible. */
.user-book-detail__reextract-icon {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  width: 36px;
  height: 36px;
  padding: 0;
  border-radius: 18px;
  background: transparent;
  border: 1px solid var(--color-border, #E8E2D9);
  color: var(--color-text-muted, #6b6b6b);
  cursor: pointer;
  transition: background 0.15s ease, color 0.15s ease, border-color 0.15s ease;
}

/* Accent highlight on hover, but only while the button is actionable. */
.user-book-detail__reextract-icon:hover:not(:disabled) {
  color: var(--color-accent, #2563eb);
  border-color: var(--color-accent, #2563eb);
  background: rgba(37, 99, 235, 0.06);
}

/* Disabled doubles as the "request in flight" state: dim the button and
   spin the arrows so the click visibly registered. */
.user-book-detail__reextract-icon:disabled {
  opacity: 0.5;
  cursor: not-allowed;
}
.user-book-detail__reextract-icon:disabled svg {
  animation: user-book-detail__spin 1s linear infinite;
}
@keyframes user-book-detail__spin {
  to { transform: rotate(360deg); }
}

/* Respect the OS-level "reduce motion" preference: the dimmed, disabled
   state is enough feedback on its own, so drop the endless rotation and
   the colour transitions for users who asked for less animation. */
@media (prefers-reduced-motion: reduce) {
  .user-book-detail__reextract-icon {
    transition: none;
  }
  .user-book-detail__reextract-icon:disabled svg {
    animation: none;
  }
}

/* Trash icon in the primary action row. Idle styling matches the neighbour
icon buttons (Add-to-collection, Link, Share) — neutral border so the
destructive action doesn't shout. Two-stage confirm (see *--confirming
Expand Down
8 changes: 6 additions & 2 deletions backend/src/Application/UserBooks/UserBookService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,12 @@ public async Task<IReadOnlyList<UserBookListDto>> GetBooksAsync(Guid userId, Can
if (book is null)
return (false, "Book not found");

if (book.Status != UserBookStatus.Failed)
return (false, "Only failed books can be retried");
// Allow retrying Failed (original behaviour) AND re-extracting Ready
// books — extractor improvements (e.g. bullet paragraph split, TOC
// drop) should be reachable without a delete+reupload roundtrip.
// Processing is excluded so we don't queue duplicate jobs.
if (book.Status != UserBookStatus.Failed && book.Status != UserBookStatus.Ready)
return (false, $"Cannot reprocess book in status {book.Status}");

var bookFile = book.BookFiles.FirstOrDefault();
if (bookFile is null)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
using System.Text.RegularExpressions;

namespace TextStack.Extraction.Extractors;

/// <summary>
/// Identifies low-value front-matter chapter titles that should be dropped at
/// extraction time. Right now: only "Table of Contents" (and translated
/// variants) — TOC chapters from PDFs come out as a flat run of leader-dotted
/// entries with no usable line breaks; they add nothing to the in-app TOC
/// (which we generate from the chapter list itself) and just waste a click.
/// Kept narrow on purpose — adding e.g. "Copyright" or "Index" here would
/// silently drop content some users want to keep.
/// </summary>
public static class FrontMatterFilter
{
    // Whole-title match (anchored at both ends) so titles that merely contain
    // one of the words, e.g. "Discontent", are never treated as a TOC.
    // CultureInvariant matters: with IgnoreCase alone the casing rules of the
    // current culture are used, and under Turkish rules 'I' and 'i' are not
    // case-equivalent, so "INDICE" or "SOMMAIRE" would stop matching.
    private static readonly Regex TocTitle = new(
        @"^\s*(table\s+of\s+contents?|contents|toc|" +
        @"оглавление|содержание|зміст|" +
        @"sommaire|inhaltsverzeichnis|índice|indice|sumário)\s*$",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // Trailing page reference ("Contents 5"). Held as a compiled static so the
    // pattern is not handed to the regex engine as a string on every call.
    private static readonly Regex TrailingPageRef = new(
        @"\s*\d+\s*$",
        RegexOptions.Compiled);

    /// <summary>
    /// Returns <c>true</c> when <paramref name="title"/> is a table-of-contents
    /// heading in one of the recognised languages.
    /// </summary>
    /// <param name="title">
    /// Chapter title to test. <c>null</c>, empty and whitespace-only titles are
    /// never a match.
    /// </param>
    /// <returns>
    /// <c>true</c> if the title, after removing a trailing run of digits and
    /// surrounding whitespace, consists solely of a known TOC heading
    /// (compared case-insensitively); otherwise <c>false</c>.
    /// </returns>
    public static bool IsTableOfContents(string? title)
    {
        if (string.IsNullOrWhiteSpace(title)) return false;

        // Tolerate trailing numbers/page refs the bookmark sometimes carries.
        var normalized = TrailingPageRef.Replace(title, string.Empty).Trim();
        return TocTitle.IsMatch(normalized);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,18 @@ namespace TextStack.Extraction.Extractors.Pdf;
public static class PdfPageTextExtractor
{
private const double LineYTolerance = 3.0;
private const double ParagraphGapMultiplier = 1.5;
// Paragraph break = median line gap × multiplier. Median (not mean) so
// the actual paragraph gaps in the data don't pull the threshold above
// themselves. 1.2 covers O'Reilly-style 1.25× para-spacing and similar;
// 1.5 (the old value) missed almost every paragraph break because the
// modal spacing is line-spacing and the mean got dragged up by the rare
// gaps the algorithm was supposed to find.
private const double ParagraphGapMultiplier = 1.2;
// First-line indent: a line whose left edge is at least this many points
// greater than the page's modal left margin counts as a new paragraph,
// independent of y-gap. Most book typography uses an indent rather than
// vertical space — y-gap-only detection misses every one of those.
private const double IndentMinPoints = 6.0;
private const double HeadingFontRatio = 0.9;
private const double MinHeadingFontDifference = 1.5;
private const int MaxHeadingTextLength = 200;
Expand All @@ -26,6 +37,15 @@ public static class PdfPageTextExtractor
"|", "•", "·", "—", "-", "–", "*", "■", "□", "○", "●", "▪", "▫"
};

// Glyphs that, when they're the first word of a line, mean the line is a
// bullet-list item — force a new paragraph regardless of vertical gap.
// Without this, tightly-spaced lists in PDFs get glued into one giant
// paragraph (the "•" loses its line-break role when only y-gap is used).
private static readonly HashSet<string> BulletGlyphs = new(StringComparer.Ordinal)
{
"•", "●", "▪", "■", "◦", "○", "▫", "◆", "‣", "⁃", "►", "❖"
};

private static readonly Regex PageNumberPattern = new(@"^\d{1,4}$", RegexOptions.Compiled);

// Running headers from O'Reilly-style tech books take the shape
Expand Down Expand Up @@ -199,18 +219,32 @@ private static List<List<List<Word>>> GroupLinesIntoParagraphs(List<List<Word>>
if (lines.Count == 0)
return [];

var lineHeights = new List<double>();
var lineGaps = new List<double>();
for (var i = 1; i < lines.Count; i++)
{
var prevY = lines[i - 1].Average(w => w.BoundingBox.Bottom);
var currY = lines[i].Average(w => w.BoundingBox.Bottom);
var gap = Math.Abs(prevY - currY);
if (gap > 0)
lineHeights.Add(gap);
lineGaps.Add(gap);
}

var avgLineHeight = lineHeights.Count > 0 ? lineHeights.Average() : 12.0;
var paragraphGapThreshold = avgLineHeight * ParagraphGapMultiplier;
// Median, not mean — paragraph gaps in the data must NOT pull the
// threshold up to (or above) themselves. The modal gap on a typical
// body page is one line-height; that's exactly what we want as the
// baseline.
var baselineGap = Median(lineGaps, fallback: 12.0);
var paragraphGapThreshold = baselineGap * ParagraphGapMultiplier;

// Modal left margin: rounded so micro-jitter (sub-point alignment
// differences) doesn't disqualify a margin from being modal.
var leftEdges = lines
.Where(l => l.Count > 0)
.Select(l => Math.Round(l.Min(w => w.BoundingBox.Left)))
.ToList();
var baseLeft = leftEdges.Count > 0
? leftEdges.GroupBy(e => e).OrderByDescending(g => g.Count()).First().Key
: 0.0;

var paragraphs = new List<List<List<Word>>>();
var currentParagraph = new List<List<Word>> { lines[0] };
Expand All @@ -221,7 +255,11 @@ private static List<List<List<Word>>> GroupLinesIntoParagraphs(List<List<Word>>
var currY = lines[i].Average(w => w.BoundingBox.Bottom);
var gap = Math.Abs(prevY - currY);

if (gap > paragraphGapThreshold)
var isYGapBreak = gap > paragraphGapThreshold;
var isBulletBreak = StartsWithBulletGlyph(lines[i]);
var isIndentBreak = StartsWithIndent(lines[i], baseLeft);

if (isYGapBreak || isBulletBreak || isIndentBreak)
{
paragraphs.Add(currentParagraph);
currentParagraph = [lines[i]];
Expand All @@ -236,6 +274,49 @@ private static List<List<List<Word>>> GroupLinesIntoParagraphs(List<List<Word>>
return paragraphs;
}

/// <summary>
/// Detects a first-line indent. Takes the smallest left coordinate among the
/// words of <paramref name="line"/> and reports whether it lies at least
/// <see cref="IndentMinPoints"/> to the right of <paramref name="baseLeft"/>.
/// An empty line is never considered indented.
/// </summary>
internal static bool StartsWithIndent(List<Word> line, double baseLeft)
{
    if (line.Count == 0)
    {
        return false;
    }

    var leftmostEdge = line.Select(word => word.BoundingBox.Left).Min();
    var offsetFromMargin = leftmostEdge - baseLeft;
    return offsetFromMargin >= IndentMinPoints;
}

// Median of <values>, or <fallback> when the list is empty. For an even
// count the two middle elements are averaged. Works on a sorted copy, so
// the caller's list is left in its original order.
private static double Median(List<double> values, double fallback)
{
    var count = values.Count;
    if (count == 0)
    {
        return fallback;
    }

    var ordered = new List<double>(values);
    ordered.Sort();

    var upperMiddle = count / 2;
    if (count % 2 != 0)
    {
        return ordered[upperMiddle];
    }

    return (ordered[upperMiddle - 1] + ordered[upperMiddle]) / 2.0;
}

/// <summary>
/// Reports whether a line opens with a list bullet: the text of its first
/// word either is a bullet glyph (•, ●, ▪, ◦, …) or begins with one. Such a
/// line ("• You're building...") is meant to open a new paragraph even when
/// the vertical gap to the previous line is ordinary. Empty lines yield
/// <c>false</c>.
/// </summary>
internal static bool StartsWithBulletGlyph(List<Word> line)
    => line.Count > 0 && IsBulletPrefix(line[0].Text);

/// <summary>
/// String-level counterpart of <see cref="StartsWithBulletGlyph"/>, exposed so
/// tests can exercise the check with plain text. Returns <c>true</c> when
/// <paramref name="text"/> is itself a bullet glyph or its first character is
/// one; <c>null</c> and empty strings return <c>false</c>.
/// </summary>
internal static bool IsBulletPrefix(string? text)
{
    if (string.IsNullOrEmpty(text))
    {
        return false;
    }

    // Exact-token match first; otherwise fall back to the leading character,
    // because some PDFs glue the bullet to the first word ("•You're").
    return BulletGlyphs.Contains(text)
        || BulletGlyphs.Contains(text.Substring(0, 1));
}

private static string GetDominantFontName(List<Word> words)
{
var fontCounts = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,20 @@ private static ExtractionResult ExtractFromDocument(
var chapter = chapters[chapterIdx];
var chapterNumber = chapterIdx + 1;

// Drop TOC chapters at extraction time. PDF TOCs come out as one
// dense run of leader-dotted entries and we already build the
// reader-side TOC from the chapter list itself. Guard: never drop
// the only chapter — a single-chapter book literally titled
// "Contents" would otherwise vanish entirely (paranoid edge case
// raised in PR #244 bug report).
if (chapters.Count > 1 && FrontMatterFilter.IsTableOfContents(chapter.Title))
{
warnings.Add(new ExtractionWarning(
ExtractionWarningCode.ContentFiltered,
$"Skipped Table of Contents chapter: {chapter.Title}"));
continue;
}

try
{
// Extract pages for this chapter
Expand Down
41 changes: 41 additions & 0 deletions tests/TextStack.Extraction.Tests/FrontMatterFilterTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
using TextStack.Extraction.Extractors;

namespace TextStack.Extraction.Tests;

/// <summary>
/// Pins which chapter titles <see cref="FrontMatterFilter.IsTableOfContents"/>
/// treats as a table of contents, and which it must leave alone.
/// </summary>
public class FrontMatterFilterTests
{
    /// <summary>
    /// Every heading variant the filter declares, in mixed casing, with
    /// surrounding whitespace, and with a trailing page reference.
    /// </summary>
    [Theory]
    [InlineData("Table of Contents")]
    [InlineData("table of contents")]
    [InlineData("TABLE OF CONTENTS")]
    [InlineData("Table of Content")] // singular form is accepted too
    [InlineData("Contents")]
    [InlineData("CONTENTS")]
    [InlineData("TOC")]
    [InlineData("toc")]
    [InlineData("Оглавление")]
    [InlineData("ОГЛАВЛЕНИЕ")]
    [InlineData("Содержание")]
    [InlineData("Зміст")]
    [InlineData("Sommaire")]
    [InlineData("Inhaltsverzeichnis")]
    [InlineData("Índice")]
    [InlineData("Indice")]
    [InlineData("Sumário")]
    [InlineData(" Contents ")]
    [InlineData("Contents 5")] // bookmark sometimes carries a page ref
    [InlineData("Table of Contents 7")]
    public void IsTableOfContents_Matches_KnownTocTitles(string title)
    {
        Assert.True(FrontMatterFilter.IsTableOfContents(title));
    }

    /// <summary>
    /// Blank input, ordinary chapters, other front/back matter, and titles
    /// that only contain or start with a TOC word must not be filtered.
    /// </summary>
    [Theory]
    [InlineData("")]
    [InlineData("   ")]
    [InlineData(null)]
    [InlineData("Chapter 1")]
    [InlineData("12")] // nothing is left once the trailing digits are stripped
    [InlineData("Introduction")]
    [InlineData("Acknowledgments")]
    [InlineData("Index")]
    [InlineData("Bibliography")]
    [InlineData("Glossary")]
    // "Discontent" contains "content" as substring — anchors must reject this.
    [InlineData("Discontent")]
    // A TOC word followed by more text is a real title, not a TOC heading.
    [InlineData("Contents of the Box")]
    [InlineData("Table Setting")]
    public void IsTableOfContents_DoesNotMatch_OtherTitles(string? title)
    {
        Assert.False(FrontMatterFilter.IsTableOfContents(title));
    }
}
Loading
Loading