diff --git a/backend/src/Extraction/TextStack.Extraction/Extractors/Pdf/PdfPageTextExtractor.cs b/backend/src/Extraction/TextStack.Extraction/Extractors/Pdf/PdfPageTextExtractor.cs index 5c87ddba..9ccb4de5 100644 --- a/backend/src/Extraction/TextStack.Extraction/Extractors/Pdf/PdfPageTextExtractor.cs +++ b/backend/src/Extraction/TextStack.Extraction/Extractors/Pdf/PdfPageTextExtractor.cs @@ -28,6 +28,18 @@ public static class PdfPageTextExtractor private static readonly Regex PageNumberPattern = new(@"^\d{1,4}$", RegexOptions.Compiled); + // Running headers from O'Reilly-style tech books take the shape + // "4 | Chapter 1: Introduction to Building AI Applications…" + // "The Rise of AI Engineering | 3" + // The page number varies per page, so the cross-page (identical-text) + // filter in PdfTextExtractor can't catch them — but the structural + // signature (1-4 digit number and text joined by " | ", in either order, on a short paragraph) is + // distinctive. Encoded here from a Claude cleanup pair (slice 5 r1). + private const int RunningHeaderMaxLength = 200; + private static readonly Regex RunningHeaderLike = new( + @"^(?:\d{1,4}\s*\|\s*\S.+|\S.+\s*\|\s*\d{1,4})$", + RegexOptions.Compiled); + public static List ExtractPage(Page page) { var words = page.GetWords(NearestNeighbourWordExtractor.Instance).ToList(); @@ -138,15 +150,17 @@ private static bool EndsWithSoftHyphen(string text) } /// - /// True for short fragments that are page numbers, single dividers, or pure - /// punctuation noise that belong to header/footer chrome, not body. + /// True for short fragments that are page numbers, single dividers, pure + /// punctuation noise, or O'Reilly-style running headers — all chrome that + /// belongs at the page margin, not in the body. 
/// - private static bool IsArtifactNoise(string text) + internal static bool IsArtifactNoise(string text) { var trimmed = text.Trim(); if (trimmed.Length == 0) return true; if (trimmed.Length <= 2 && NoisePunctuation.Contains(trimmed)) return true; if (PageNumberPattern.IsMatch(trimmed)) return true; + if (trimmed.Length <= RunningHeaderMaxLength && RunningHeaderLike.IsMatch(trimmed)) return true; return false; } diff --git a/backend/src/Extraction/TextStack.Extraction/TextStack.Extraction.csproj b/backend/src/Extraction/TextStack.Extraction/TextStack.Extraction.csproj index 351c58c6..41db529f 100644 --- a/backend/src/Extraction/TextStack.Extraction/TextStack.Extraction.csproj +++ b/backend/src/Extraction/TextStack.Extraction/TextStack.Extraction.csproj @@ -10,6 +10,10 @@ + + + + diff --git a/tests/TextStack.Extraction.Tests/RunningHeaderFilterTests.cs b/tests/TextStack.Extraction.Tests/RunningHeaderFilterTests.cs new file mode 100644 index 00000000..b9b7209c --- /dev/null +++ b/tests/TextStack.Extraction.Tests/RunningHeaderFilterTests.cs @@ -0,0 +1,53 @@ +using TextStack.Extraction.Extractors.Pdf; + +namespace TextStack.Extraction.Tests; + +/// +/// Ratchet round 1 (feat-0007 slice 5). Encodes recurring fix patterns +/// observed in Claude cleanup pairs into deterministic filters. +/// +public class RunningHeaderFilterTests +{ + [Theory] + // O'Reilly running headers — page number on either side of " | ". + [InlineData("4 | Chapter 1: Introduction to Building AI Applications with Foundation Models")] + [InlineData("2 | Chapter 1: Introduction to Building AI Applications")] + [InlineData("The Rise of AI Engineering | 3")] + [InlineData("The Rise of AI Engineering | 5")] + [InlineData("Foundation Model Use Cases | 17")] + public void IsArtifactNoise_RunningHeaderWithPipeAndPageNumber_Filtered(string text) + { + Assert.True(PdfPageTextExtractor.IsArtifactNoise(text)); + } + + [Theory] + // Earlier defects the filter already caught — confirm regression-free. 
+ [InlineData("4")] // bare page number + [InlineData("|")] // divider glyph + [InlineData("")] // empty + public void IsArtifactNoise_LegacyArtifacts_StillFiltered(string text) + { + Assert.True(PdfPageTextExtractor.IsArtifactNoise(text)); + } + + [Theory] + // Real body content that happens to contain digits or pipes — must NOT match. + [InlineData("Foundation models emerged from large language models, which in turn originated as language models.")] + [InlineData("The Mixtral 8x7B model has a vocabulary size of 32,000.")] + [InlineData("Section 1.1 covers the basics — see also chapter 4 for details.")] + [InlineData("GPT-4 was released in March 2023.")] + [InlineData("Combine flags with the | operator, for example Read | Write.")] + [InlineData("Run ls | wc -l to count the entries in the directory.")] + public void IsArtifactNoise_BodyProse_NotFiltered(string text) + { + Assert.False(PdfPageTextExtractor.IsArtifactNoise(text)); + } + + [Fact] + public void IsArtifactNoise_LongRunningHeaderLike_NotFiltered() + { + // > 200 chars — even with the running-header signature, too long to be chrome. + var text = "9 | " + new string('a', 250); + Assert.False(PdfPageTextExtractor.IsArtifactNoise(text)); + } +}