Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ public static class PdfPageTextExtractor

// Matches a fragment that consists solely of a 1–4 digit number (a bare page number).
private static readonly Regex PageNumberPattern = new(@"^\d{1,4}$", RegexOptions.Compiled);

// Running headers from O'Reilly-style tech books take the shape
// "4 | Chapter 1: Introduction to Building AI Applications…"
// "The Rise of AI Engineering | 3"
// The page number varies per page, so the cross-page (identical-text)
// filter in PdfTextExtractor can't catch them — but the structural
// signature (small int + " | " + text, on a short paragraph) is
// distinctive. Encoded here from a Claude cleanup pair (slice 5 r1).
//
// Fragments longer than this many characters (measured after trimming) are
// never treated as running headers, even if they match the pattern below.
private const int RunningHeaderMaxLength = 200;
// Two anchored alternatives covering the whole fragment:
//   "<1–4 digits> | <text>"  or  "<text> | <1–4 digits>"
// Whitespace around the pipe is optional; <text> must start with a
// non-whitespace character and be followed by at least one more character.
private static readonly Regex RunningHeaderLike = new(
@"^(?:\d{1,4}\s*\|\s*\S.+|\S.+\s*\|\s*\d{1,4})$",
RegexOptions.Compiled);

public static List<PdfTextElement> ExtractPage(Page page)
{
var words = page.GetWords(NearestNeighbourWordExtractor.Instance).ToList();
Expand Down Expand Up @@ -138,15 +150,17 @@ private static bool EndsWithSoftHyphen(string text)
}

/// <summary>
/// True for short fragments that are page numbers, single dividers, or pure
/// punctuation noise that belong to header/footer chrome, not body.
/// True for short fragments that are page numbers, single dividers, pure
/// punctuation noise, or O'Reilly-style running headers — all chrome that
/// belongs at the page margin, not in the body.
/// </summary>
private static bool IsArtifactNoise(string text)
internal static bool IsArtifactNoise(string text)
{
// All checks run against the trimmed text, so surrounding whitespace never matters.
var trimmed = text.Trim();
// Empty or whitespace-only input is treated as noise.
if (trimmed.Length == 0) return true;
// One- or two-character fragments found in NoisePunctuation (declared outside
// this view — presumably the set of divider/punctuation glyphs; confirm there).
if (trimmed.Length <= 2 && NoisePunctuation.Contains(trimmed)) return true;
// The whole fragment is a bare 1–4 digit number.
if (PageNumberPattern.IsMatch(trimmed)) return true;
// Running-header shape ("<n> | <text>" or "<text> | <n>"). The length cap is
// tested first, so fragments over the limit short-circuit without a regex match.
if (trimmed.Length <= RunningHeaderMaxLength && RunningHeaderLike.IsMatch(trimmed)) return true;
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
<PackageReference Include="PDFtoImage" />
</ItemGroup>

<ItemGroup>
<InternalsVisibleTo Include="TextStack.Extraction.Tests" />
</ItemGroup>

<ItemGroup>
<EmbeddedResource Include="TextProcessing\Data\words.txt" />
<EmbeddedResource Include="TextProcessing\Data\spellings.json" />
Expand Down
51 changes: 51 additions & 0 deletions tests/TextStack.Extraction.Tests/RunningHeaderFilterTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using TextStack.Extraction.Extractors.Pdf;

namespace TextStack.Extraction.Tests;

/// <summary>
/// Ratchet round 1 (feat-0007 slice 5). Encodes recurring fix patterns
/// observed in Claude cleanup pairs into deterministic filters.
/// Exercises <c>PdfPageTextExtractor.IsArtifactNoise</c>: running headers of the
/// form "page | title" / "title | page" must be filtered, previously handled
/// artifacts must stay filtered, and ordinary body prose must pass through.
/// </summary>
public class RunningHeaderFilterTests
{
    // Mirrors the private RunningHeaderMaxLength constant in PdfPageTextExtractor.
    // That constant is private, so the value is repeated here; if the production
    // limit changes, the boundary theory below is expected to fail and flag it.
    private const int MaxHeaderLength = 200;

    [Theory]
    // O'Reilly running headers — page number on either side of " | ".
    [InlineData("4 | Chapter 1: Introduction to Building AI Applications with Foundation Models")]
    [InlineData("2 | Chapter 1: Introduction to Building AI Applications")]
    [InlineData("The Rise of AI Engineering | 3")]
    [InlineData("The Rise of AI Engineering | 5")]
    [InlineData("Foundation Model Use Cases | 17")]
    public void IsArtifactNoise_RunningHeaderWithPipeAndPageNumber_Filtered(string text)
    {
        Assert.True(PdfPageTextExtractor.IsArtifactNoise(text));
    }

    [Theory]
    // The filter trims its input before matching, so padding must not hide a header.
    [InlineData("  4 | Chapter 1: Introduction to Building AI Applications  ")]
    [InlineData("\tThe Rise of AI Engineering | 3\n")]
    public void IsArtifactNoise_RunningHeaderWithSurroundingWhitespace_Filtered(string text)
    {
        Assert.True(PdfPageTextExtractor.IsArtifactNoise(text));
    }

    [Theory]
    // Earlier defects the filter already caught — confirm regression-free.
    [InlineData("4")] // bare page number
    [InlineData("|")] // divider glyph
    [InlineData("")] // empty
    public void IsArtifactNoise_LegacyArtifacts_StillFiltered(string text)
    {
        Assert.True(PdfPageTextExtractor.IsArtifactNoise(text));
    }

    [Theory]
    // Real body content that happens to contain digits or pipes — must NOT match.
    [InlineData("Foundation models emerged from large language models, which in turn originated as language models.")]
    [InlineData("The Mixtral 8x7B model has a vocabulary size of 32,000.")]
    [InlineData("Section 1.1 covers the basics — see also chapter 4 for details.")]
    [InlineData("GPT-4 was released in March 2023.")]
    public void IsArtifactNoise_BodyProse_NotFiltered(string text)
    {
        Assert.False(PdfPageTextExtractor.IsArtifactNoise(text));
    }

    [Fact]
    public void IsArtifactNoise_LongRunningHeaderLike_NotFiltered()
    {
        // > 200 chars — even with the running-header signature, too long to be chrome.
        var text = "9 | " + new string('a', 250);
        Assert.False(PdfPageTextExtractor.IsArtifactNoise(text));
    }

    [Theory]
    // Pins the inclusive length cap: a header-shaped fragment is filtered up to and
    // including the limit, and passes through one character beyond it.
    [InlineData(MaxHeaderLength - 1, true)]
    [InlineData(MaxHeaderLength, true)]
    [InlineData(MaxHeaderLength + 1, false)]
    public void IsArtifactNoise_RunningHeaderAtLengthBoundary_RespectsLimit(int totalLength, bool expected)
    {
        const string prefix = "9 | ";
        var text = prefix + new string('a', totalLength - prefix.Length);

        Assert.Equal(expected, PdfPageTextExtractor.IsArtifactNoise(text));
    }
}
Loading