Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,9 @@ RESEND_API_KEY=
RESEND_FROM_EMAIL=noreply@textstack.app
# Where SEO backfill / ops failure alerts go. Empty = alerts disabled.
ADMIN_ALERT_EMAIL=

# PDF content cleanup (quality-poll.sh Phase 3 — feat-0007).
# When true, chapters scoring below the threshold get an LLM cleanup pass
# via Claude CLI. Off by default — leaves the poller at structure-only.
CONTENT_CLEANUP_ENABLED=false
CONTENT_QUALITY_THRESHOLD=60
26 changes: 26 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,32 @@

## [Unreleased]

### PDF content quality — Claude cleanup pipeline (2026-05-22)

Slices 1-4 of feat-0007 (`docs/05-features/feat-0007-pdf-content-quality.md`).
Makes PDF-extracted books readable: heuristics get ~70-75%, the gap to ~90% is
semantic (running headers in body, fragmented paragraphs, hyphenation, inlined
footnotes). Closes it with a gated Claude cleanup pass, and logs every fix so
the deterministic heuristics can ratchet up over time. Marker (ML PDF pipeline)
was evaluated and shelved — the prod GPU's 4 GB VRAM can't hold its model set.

- **`ChapterContentQualityAnalyzer`** — deterministic 0-100 content-quality
score + issue codes (fragmented paragraphs, running headers in body,
unmerged hyphenation, orphan page numbers, inlined footnotes) for extracted
chapter HTML. Pure C#, 12 unit tests. The gate that decides which chapters
warrant an LLM pass.
- **Score persisted at ingest** — `ContentQualityScore` column on `Chapter` +
`UserChapter`, set in both ingestion paths; `BookQualityJob` carries Phase 3
tracking counters. Worker logs a per-book score distribution.
- **`quality-poll.sh` Phase 3** — for each chapter below the quality threshold,
Claude CLI fixes structure (preserving content verbatim); a stdlib-only
preservation gate (`pdf-cleanup-gate.py`) rejects hallucination or
over-deletion via word-multiset diff before the cleaned HTML is written back.
Every (messy → clean) pair is logged to `data/pdf-cleanup-dataset/` as fuel
for the future heuristic ratchet. Off by default — `CONTENT_CLEANUP_ENABLED`.
- **Admin observability** — the Book Quality job detail panel shows Phase 3
results (chapters cleaned / rejected / skipped).

### Mobile reader — autosave restore (2026-05-13)

- **WordCard parity with web WordPopup** — single-word tap on mobile
Expand Down
3 changes: 3 additions & 0 deletions apps/admin/src/api/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,9 @@ export interface BookQualityJobListItem {
/**
 * Detail view of a single Book Quality job: every field of
 * BookQualityJobListItem plus the raw issues payload, the captured log
 * output, and the content-cleanup chapter counters.
 */
export interface BookQualityJobDetail extends BookQualityJobListItem {
// Raw issues payload as a string; null when absent.
issuesJson: string | null
// Captured job log text; null when absent.
logOutput: string | null
// Content-cleanup chapter counters. Each is null until a value has been
// recorded for the job — presumably by the backend's Phase 3 cleanup pass;
// confirm against the API. The detail panel treats null as "not reported".
contentChaptersCleaned: number | null
contentChaptersRejected: number | null
contentChaptersSkipped: number | null
}

export interface BookQualitySettings {
Expand Down
10 changes: 10 additions & 0 deletions apps/admin/src/pages/BookQualityPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,16 @@ export function BookQualityPage() {
<p>Issues found: <strong>{selectedJob.issuesFound}</strong> | Fixed: <strong>{selectedJob.issuesFixed ?? 0}</strong></p>
)}

{(selectedJob.contentChaptersCleaned != null
|| selectedJob.contentChaptersRejected != null
|| selectedJob.contentChaptersSkipped != null) && (
<p>
Content cleanup — cleaned: <strong>{selectedJob.contentChaptersCleaned ?? 0}</strong>
{' | '}rejected: <strong>{selectedJob.contentChaptersRejected ?? 0}</strong>
{' | '}skipped: <strong>{selectedJob.contentChaptersSkipped ?? 0}</strong>
</p>
)}

{selectedJob.issuesJson && (
<div>
<strong>Issues:</strong>
Expand Down
5 changes: 5 additions & 0 deletions backend/src/Api/Endpoints/AdminBookQualityEndpoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ private static async Task<IResult> GetJob(Guid id, IAppDbContext db, Cancellatio
return Results.Ok(new QualityJobDetailDto(
job.Id, job.EditionId, job.UserBookId,
job.Status.ToString(), job.IssuesJson, job.IssuesFound, job.IssuesFixed,
job.ContentChaptersCleaned, job.ContentChaptersRejected, job.ContentChaptersSkipped,
job.Error, job.LogOutput,
job.CreatedAt, job.StartedAt, job.FinishedAt,
job.Edition?.Title, job.UserBook?.Title
Expand Down Expand Up @@ -134,6 +135,9 @@ private static async Task<IResult> RetryJob(Guid id, IAppDbContext db, Cancellat
job.IssuesJson = null;
job.IssuesFound = null;
job.IssuesFixed = null;
job.ContentChaptersCleaned = null;
job.ContentChaptersRejected = null;
job.ContentChaptersSkipped = null;
job.StartedAt = null;
job.FinishedAt = null;
await db.SaveChangesAsync(ct);
Expand Down Expand Up @@ -177,6 +181,7 @@ public record QualityJobListDto(
/// <summary>
/// Detail payload for a single book-quality job: identity and status, the
/// issue counters, the content-cleanup chapter counters
/// (cleaned / rejected / skipped), error and log text, lifecycle timestamps,
/// and the titles of the linked edition / user book when available.
/// Nullable members are null when the corresponding value has not been set.
/// </summary>
public record QualityJobDetailDto(
Guid Id, Guid? EditionId, Guid? UserBookId,
string Status, string? IssuesJson, int? IssuesFound, int? IssuesFixed,
int? ContentChaptersCleaned, int? ContentChaptersRejected, int? ContentChaptersSkipped,
string? Error, string? LogOutput,
DateTimeOffset CreatedAt, DateTimeOffset? StartedAt, DateTimeOffset? FinishedAt,
string? EditionTitle, string? UserBookTitle);
9 changes: 9 additions & 0 deletions backend/src/Api/Endpoints/InternalEndpoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,9 @@ private static async Task<IResult> GetQualityJob(
job.IssuesJson,
job.IssuesFound,
job.IssuesFixed,
job.ContentChaptersCleaned,
job.ContentChaptersRejected,
job.ContentChaptersSkipped,
job.Error,
job.LogOutput,
job.CreatedAt,
Expand All @@ -458,6 +461,9 @@ private static async Task<IResult> UpdateQualityJob(
if (req.IssuesFixed.HasValue) job.IssuesFixed = req.IssuesFixed;
if (req.Error is not null) job.Error = req.Error;
if (req.LogOutput is not null) job.LogOutput = req.LogOutput;
if (req.ContentChaptersCleaned.HasValue) job.ContentChaptersCleaned = req.ContentChaptersCleaned;
if (req.ContentChaptersRejected.HasValue) job.ContentChaptersRejected = req.ContentChaptersRejected;
if (req.ContentChaptersSkipped.HasValue) job.ContentChaptersSkipped = req.ContentChaptersSkipped;
if (req.SetStartedAt) job.StartedAt = DateTimeOffset.UtcNow;
if (req.SetFinishedAt) job.FinishedAt = DateTimeOffset.UtcNow;

Expand Down Expand Up @@ -506,5 +512,8 @@ public record UpdateQualityJobRequest(
int? IssuesFixed = null,
string? Error = null,
string? LogOutput = null,
int? ContentChaptersCleaned = null,
int? ContentChaptersRejected = null,
int? ContentChaptersSkipped = null,
bool SetStartedAt = false,
bool SetFinishedAt = false);
20 changes: 18 additions & 2 deletions backend/src/Application/Ingestion/IngestionService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
using Domain.Enums;
using Domain.Utilities;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using TextStack.Extraction.Quality;

namespace Application.Ingestion;

Expand All @@ -29,7 +31,8 @@ List<ExtractionWarningDto> Warnings

public record ExtractionWarningDto(int Code, string Message);

public class IngestionService(IAppDbContext db, IFileStorageService storage)
public class IngestionService(
IAppDbContext db, IFileStorageService storage, ILogger<IngestionService> logger)
{
private static readonly TimeSpan StuckJobTimeout = TimeSpan.FromMinutes(10);

Expand Down Expand Up @@ -86,19 +89,24 @@ public async Task ProcessParsedBookAsync(
db.Chapters.RemoveRange(existingChapters);

// Create new chapters
var qualityScores = new List<int>();
foreach (var ch in parsed.Chapters)
{
var chapterSlug = SlugGenerator.GenerateChapterSlug(ch.Title, ch.Order);
var chapterHtml = SanitizeText(ch.Html);
var score = ChapterContentQualityAnalyzer.Analyze(chapterHtml).Score;
qualityScores.Add(score);
var chapter = new Chapter
{
Id = Guid.NewGuid(),
EditionId = job.EditionId,
ChapterNumber = ch.Order,
Slug = chapterSlug,
Title = SanitizeText(ch.Title),
Html = SanitizeText(ch.Html),
Html = chapterHtml,
PlainText = SanitizeText(ch.PlainText),
WordCount = ch.WordCount,
ContentQualityScore = score,
OriginalChapterNumber = ch.OriginalChapterNumber,
PartNumber = ch.PartNumber,
TotalParts = ch.TotalParts,
Expand All @@ -108,6 +116,14 @@ public async Task ProcessParsedBookAsync(
db.Chapters.Add(chapter);
}

if (qualityScores.Count > 0)
{
logger.LogInformation(
"Content quality for edition {EditionId}: {Count} chapters, avg score {Avg}, {Below} below 60",
job.EditionId, qualityScores.Count, (int)qualityScores.Average(),
qualityScores.Count(s => s < 60));
}

// Publish the edition
job.Edition.Status = EditionStatus.Published;
job.Edition.PublishedAt = DateTimeOffset.UtcNow;
Expand Down
8 changes: 8 additions & 0 deletions backend/src/Domain/Entities/BookQualityJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ public class BookQualityJob
public int? IssuesFound { get; set; }
public int? IssuesFixed { get; set; }

// ── Content-cleanup phase (Phase 3) — populated by quality-poll.sh ──
/// <summary>Chapters whose HTML the LLM cleanup pass rewrote and the gate accepted.</summary>
public int? ContentChaptersCleaned { get; set; }
/// <summary>Chapters where the LLM output was rejected by the preservation gate.</summary>
public int? ContentChaptersRejected { get; set; }
/// <summary>Flagged chapters skipped (non-PDF, or cleanup disabled).</summary>
public int? ContentChaptersSkipped { get; set; }

public string? Error { get; set; }
public string? LogOutput { get; set; }

Expand Down
6 changes: 6 additions & 0 deletions backend/src/Domain/Entities/Chapter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ public class Chapter
/// <summary>Total parts the original chapter was split into (for "Part 2 of 5" display)</summary>
public int? TotalParts { get; set; }

/// <summary>
/// Deterministic extraction-quality score 0-100 (see ChapterContentQualityAnalyzer).
/// Null = not yet analyzed. Below the flag threshold → candidate for LLM cleanup.
/// </summary>
public int? ContentQualityScore { get; set; }

public NpgsqlTsVector SearchVector { get; set; } = null!;
public DateTimeOffset CreatedAt { get; set; }
public DateTimeOffset UpdatedAt { get; set; }
Expand Down
7 changes: 7 additions & 0 deletions backend/src/Domain/Entities/UserChapter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ public class UserChapter
public required string Html { get; set; }
public required string PlainText { get; set; }
public int? WordCount { get; set; }

/// <summary>
/// Deterministic extraction-quality score 0-100 (see ChapterContentQualityAnalyzer).
/// Null = not yet analyzed. Below the flag threshold → candidate for LLM cleanup.
/// </summary>
public int? ContentQualityScore { get; set; }

public DateTimeOffset CreatedAt { get; set; }

public UserBook UserBook { get; set; } = null!;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
using System.Text.RegularExpressions;
using HtmlAgilityPack;

namespace TextStack.Extraction.Quality;

/// <summary>
/// Scores extracted chapter HTML for the structural defects typical of PDF
/// extraction (see <see cref="ContentQualityIssue"/>). Pure, deterministic,
/// no I/O — the gate that decides which chapters are worth an LLM cleanup pass.
///
/// Score starts at 100; each detected defect subtracts a frequency-scaled
/// penalty. A defect is only reported once it crosses a floor, so trivial
/// one-off noise doesn't flag an otherwise-clean chapter.
/// </summary>
public static class ChapterContentQualityAnalyzer
{
    // Fragment-fraction is only meaningful once a chapter has enough paragraphs.
    private const int MinParagraphsForFragmentCheck = 4;

    // Compiled, not [GeneratedRegex] — ARM64 SIGILL bug (see Extraction/RULES.md).

    // A paragraph that is nothing but a 1-4 digit number.
    private static readonly Regex PageNumberOnly =
        new(@"^\s*\d{1,4}\s*$", RegexOptions.Compiled);

    // A paragraph that starts with "<number> |" or ends with "| <number>".
    private static readonly Regex RunningHeaderPipe =
        new(@"(^\s*\d{1,4}\s*\|)|(\|\s*\d{1,4}\s*$)", RegexOptions.Compiled);

    // letter + hyphen + space + lowercase letter, i.e. a word split across a line break.
    // NOTE(review): the character class holds only non-ASCII hyphen code points
    // (at least one is invisible in most editors); ASCII '-' is not matched.
    // Confirm that is intended before editing this literal.
    private static readonly Regex HyphenArtifact =
        new(@"\p{L}[‐­‑] \p{Ll}", RegexOptions.Compiled);

    // A paragraph that opens with a 1-3 digit number, whitespace, then an uppercase letter.
    private static readonly Regex FootnoteStart =
        new(@"^\s*\d{1,3}\s+\p{Lu}", RegexOptions.Compiled);

    private static readonly Regex Whitespace =
        new(@"\s+", RegexOptions.Compiled);

    // Single-glyph paragraphs treated as layout noise rather than content.
    private static readonly HashSet<string> NoiseGlyphs =
        new(StringComparer.Ordinal) { "|", "•", "·", "*", "■", "□", "—", "–" };

    /// <summary>
    /// Analyzes the text of every <c>&lt;p&gt;</c> element in <paramref name="html"/>
    /// and returns a 0-100 score plus the issue codes that crossed their floor.
    /// </summary>
    /// <param name="html">Chapter HTML; may be null, empty or whitespace.</param>
    /// <returns>
    /// <see cref="ContentQualityReport.Clean"/> when the input is blank or has no
    /// non-empty paragraphs; otherwise a report whose score is 100 minus the sum
    /// of all detector penalties, clamped to the 0-100 range.
    /// </returns>
    public static ContentQualityReport Analyze(string? html)
    {
        if (string.IsNullOrWhiteSpace(html))
            return ContentQualityReport.Clean;

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // SelectNodes returns null (not an empty list) when nothing matches.
        var paragraphs = (doc.DocumentNode.SelectNodes("//p") ?? Enumerable.Empty<HtmlNode>())
            .Select(p => NormalizeText(p.InnerText))
            .Where(t => t.Length > 0)
            .ToList();

        if (paragraphs.Count == 0)
            return ContentQualityReport.Clean;

        var issues = new List<ContentQualityIssue>();
        var penalty = 0;

        penalty += ScoreFragments(paragraphs, issues);
        penalty += ScoreRunningHeaders(paragraphs, issues);
        penalty += ScoreHyphenation(paragraphs, issues);
        penalty += ScoreOrphanNumbers(paragraphs, issues);
        penalty += ScoreFootnotes(paragraphs, issues);

        return new ContentQualityReport(Math.Clamp(100 - penalty, 0, 100), issues);
    }

    // ── Detectors ──────────────────────────────────────────────────────────
    // Each returns a penalty (0 = nothing wrong) and appends its issue code
    // when the defect is real, not incidental.

    /// <summary>
    /// Penalizes chapters dominated by fragment paragraphs (see <see cref="IsFragment"/>).
    /// Skipped below <see cref="MinParagraphsForFragmentCheck"/> paragraphs.
    /// Penalty is 150 × the fragment fraction, capped at 60.
    /// </summary>
    private static int ScoreFragments(List<string> paragraphs, List<ContentQualityIssue> issues)
    {
        if (paragraphs.Count < MinParagraphsForFragmentCheck)
            return 0;

        var fragments = paragraphs.Count(IsFragment);
        var fraction = (double)fragments / paragraphs.Count;

        // Real signal: ≥12% of paragraphs are fragments, or ≥8 of them outright.
        if (fraction < 0.12 && fragments < 8)
            return 0;

        issues.Add(ContentQualityIssue.FragmentedParagraphs);
        return (int)Math.Min(60, fraction * 150);
    }

    /// <summary>
    /// Penalizes leaked page chrome: paragraphs matching the "number | …" pipe
    /// pattern, plus every repeat of an identical paragraph of at most 100
    /// characters. Needs at least 2 hits; penalty is 7 per hit, capped at 25.
    /// </summary>
    private static int ScoreRunningHeaders(List<string> paragraphs, List<ContentQualityIssue> issues)
    {
        var pipeHeaders = paragraphs.Count(p => RunningHeaderPipe.IsMatch(p));

        // Identical short paragraphs repeating within one chapter = leaked chrome.
        // The first occurrence is not counted, only the repeats after it.
        var repeats = paragraphs
            .Where(p => p.Length <= 100)
            .GroupBy(p => p, StringComparer.Ordinal)
            .Select(g => g.Count())
            .Where(occurrences => occurrences >= 2)
            .Sum(occurrences => occurrences - 1);

        var count = pipeHeaders + repeats;
        if (count < 2)
            return 0;

        issues.Add(ContentQualityIssue.RunningHeaderInBody);
        return Math.Min(25, count * 7);
    }

    /// <summary>
    /// Penalizes unmerged line-break hyphenation. Needs at least 3 matches across
    /// the chapter; penalty is 2 per match, capped at 20.
    /// </summary>
    private static int ScoreHyphenation(List<string> paragraphs, List<ContentQualityIssue> issues)
    {
        var count = paragraphs.Sum(p => HyphenArtifact.Matches(p).Count);
        if (count < 3)
            return 0;

        issues.Add(ContentQualityIssue.HyphenationArtifacts);
        return Math.Min(20, count * 2);
    }

    /// <summary>
    /// Penalizes paragraphs that are only a page number or a single noise glyph.
    /// Needs at least 2; penalty is 5 each, capped at 15.
    /// </summary>
    private static int ScoreOrphanNumbers(List<string> paragraphs, List<ContentQualityIssue> issues)
    {
        var count = paragraphs.Count(IsOrphanNumberOrGlyph);
        if (count < 2)
            return 0;

        issues.Add(ContentQualityIssue.OrphanPageNumbers);
        return Math.Min(15, count * 5);
    }

    /// <summary>
    /// Penalizes paragraphs that look like inlined footnotes (leading number then
    /// an uppercase letter). Needs at least 3; penalty is 2 each, capped at 10.
    /// </summary>
    private static int ScoreFootnotes(List<string> paragraphs, List<ContentQualityIssue> issues)
    {
        var count = paragraphs.Count(p => FootnoteStart.IsMatch(p));
        if (count < 3)
            return 0;

        issues.Add(ContentQualityIssue.SuspectedFootnotes);
        return Math.Min(10, count * 2);
    }

    // ── Helpers ────────────────────────────────────────────────────────────

    /// <summary>
    /// A stray ≤2-word paragraph that doesn't end a sentence. Trailing closing
    /// quotes and brackets are skipped before the terminal-punctuation check, so
    /// short complete lines such as <c>“Why?”</c> or <c>(Yes.)</c> are not
    /// mistaken for fragments.
    /// </summary>
    private static bool IsFragment(string text)
    {
        // Callers pass normalized, non-empty text; an empty string is simply not a fragment.
        if (text.Length == 0)
            return false;

        var words = text.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        if (words.Length > 2)
            return false;

        // Walk back over closing marks. Stopping at index 0 means a paragraph made
        // only of closing marks is still judged by a real character (and is a fragment).
        var end = text.Length - 1;
        while (end > 0 && IsClosingMark(text[end]))
            end--;

        return text[end] is not ('.' or '!' or '?' or '…' or ':' or ';');
    }

    /// <summary>Closing quote or bracket that may legitimately follow sentence-ending punctuation.</summary>
    private static bool IsClosingMark(char c)
        => c is '"' or '\'' or '”' or '’' or '»' or ')' or ']';

    /// <summary>True when the paragraph is only a 1-4 digit number or a single noise glyph.</summary>
    private static bool IsOrphanNumberOrGlyph(string text)
        => PageNumberOnly.IsMatch(text)
            || (text.Length <= 2 && NoiseGlyphs.Contains(text));

    /// <summary>De-entitize, collapse whitespace, trim.</summary>
    private static string NormalizeText(string raw)
        => Whitespace.Replace(HtmlEntity.DeEntitize(raw) ?? string.Empty, " ").Trim();
}
Loading
Loading