Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Tests

on:
  push:
    branches: [ main, dev ]
  pull_request:
    branches: [ main, dev ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          version: "latest"

      - name: Cache uv dependencies
        uses: actions/cache@v3
        with:
          # NOTE(review): `.uv/cache` is only populated if uv is configured to
          # use it (e.g. via UV_CACHE_DIR) - confirm this path is correct.
          path: |
            .venv
            .uv/cache
          # The interpreter version is part of the key so that each matrix job
          # gets its own cache entry; a .venv built for one Python version must
          # not be restored into a job running a different one.
          key: ${{ runner.os }}-py${{ matrix.python-version }}-uv-${{ hashFiles('**/uv.lock') }}
          restore-keys: |
            ${{ runner.os }}-py${{ matrix.python-version }}-uv-

      - name: Install dependencies
        run: |
          uv sync --group dev

      - name: Run tests
        run: |
          uv run pytest test/
12 changes: 7 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
__pycache__
legacy
/legacy
dist
.vscode
.pytest_cache
.idea
.venv*
test/outputs/histogram*
test/refs/attn/*
test/holdout
test/outputs/nmr
test/outputs/ditr*
plans
docs/build
notebooks
support_arena/*
test/outputs/actual
data/test/outputs
experiments
TODO.md
.coverage
coverage.xml

Binary file added data/test/references/img/pdf2_t2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 0 additions & 18 deletions dev.md

This file was deleted.

23 changes: 1 addition & 22 deletions gmft/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,7 @@ def _deprecation_warning(name):
has_warned = True


# These are fine, but discouraged.
# they are relatively light classes; also,
# Needed out of fear that isinstance() calls will fail
# These small classes are fine, but still discouraged.
# Rect
# BasePDFDocument
# BasePage
Expand Down Expand Up @@ -140,22 +138,3 @@ class AutoTableDetector(AutoTableDetectorOrig):
def __init__(self, *args, **kwargs):
_deprecation_warning("AutoTableDetector")
super().__init__(*args, **kwargs)


# Rect = LazyHouse.Rect
# BasePDFDocument = LazyHouse.BasePDFDocument
# BasePage = LazyHouse.BasePage
# CroppedTable = LazyHouse.CroppedTable
# RotatedCroppedTable = LazyHouse.RotatedCroppedTable
# TATRTableDetector = LazyHouse.TATRTableDetector
# TableDetectorConfig = LazyHouse.TableDetectorConfig
# TableDetector = LazyHouse.TableDetector
# FormattedTable = LazyHouse.FormattedTable
# TATRFormatConfig = LazyHouse.TATRFormatConfig
# TATRFormattedTable = LazyHouse.TATRFormattedTable
# TATRTableFormatter = LazyHouse.TATRTableFormatter


# AutoTableFormatter = AccessTracker(lambda x: gmft_aliases.AutoTableFormatter)
# AutoFormatConfig = AccessTracker(lambda x: gmft_aliases.AutoFormatConfig)
# AutoTableDetector = AccessTracker(lambda x: gmft_aliases.AutoTableDetector)
134 changes: 0 additions & 134 deletions gmft/_dataclasses.py

This file was deleted.

4 changes: 4 additions & 0 deletions gmft/_rich_text/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
"""
Note: this module was experimental and has been removed; importing it always raises ImportError.
"""

raise ImportError(
"gmft._rich_text was experimental. Functionality has been moved to gmft.formatters.page.auto.AutoPageFormatter."
)
1 change: 0 additions & 1 deletion gmft/_rich_text/common.py

This file was deleted.

62 changes: 0 additions & 62 deletions gmft/algorithm/captions.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,8 @@ def _find_captions(
:param stop_y_factor_above: if the top caption gets taller than stop_y_factor_above * caption_word_height, we stop. This is intended to eliminate paragraphs.
:param stop_y_factor_below: if the bottom caption gets taller than stop_y_factor_below * caption_word_height, we stop. This is intended to eliminate paragraphs.
"""
# if max_gap_space is None:
# word_height = ct.predicted_word_height()
# max_gap_space = ct.predicted_word_height() * 2.5
# maximum_supported_rows=5
if margin is None:
margin = (50, 50, 0, 50) # d_xmin, d_ymin, d_xmax, d_ymax to look for captions
# search_rect = Rect(ct.rect.xmin - margin[0], ct.rect.ymin - margin[1], ct.rect.xmax + margin[2], ct.rect.ymax + margin[3])

midpoint = (ct.rect.ymax + ct.rect.ymin) / 2

Expand Down Expand Up @@ -287,60 +282,3 @@ def _find_captions(
captions.append(caption)

return (captions[0], captions[1]) # [caption_above, caption_below]


def _detect_caption_with_mu(
table_bbox: Tuple[float, float, float, float],
block: Tuple[float, float, float, float, str],
max_abs_dist: float = 2.5,
) -> Tuple[str, str]:
x1, y1, x2, y2 = block[:4]
text = block[4]

# Block in PyMupdf can consist of multiple lines of text
n_lines = text.count("\n") + 1

normalized_dist = 1000
top_caption, bottom_caption = "", ""

# Take care of captions above the table
if y2 < table_bbox[1]: # block in question is above the table
# Normalized distance = how many word "lines" this current sentence is from the table
normalized_dist = (y2 - table_bbox[1]) / ((y2 - y1) / n_lines)
if abs(normalized_dist) < max_abs_dist:
top_caption = block[4]

# Take care of captions below the table
elif y1 > table_bbox[3]: # block in question is below the table
normalized_dist = (y1 - table_bbox[3]) / ((y2 - y1) / n_lines)
if abs(normalized_dist) < max_abs_dist:
bottom_caption = block[4]
return top_caption, bottom_caption


# Extract captions using PyMuPDF, assumes we have table bbox


def _find_caption_with_mu(ct: CroppedTable, **kwargs):
    """
    Collect caption text above and below a table from the page's text blocks.

    Every block returned by ``ct.page.page.get_text_blocks()`` is passed to
    ``_detect_caption_with_mu`` together with ``ct.bbox``. The non-empty
    results for each side are joined, special Unicode spaces (with any
    surrounding whitespace) are collapsed to a single space, and newlines are
    removed.

    :param ct: the table whose page is searched for captions.
    :param kwargs: accepted but not used.
    :return: ``(top_captions, bottom_captions)`` as two strings.
    """
    # import gmft_pymupdf
    import pymupdf

    page = ct.page.page

    above, below = [], []
    for block in page.get_text_blocks():
        top_cap, bottom_cap = _detect_caption_with_mu(table_bbox=ct.bbox, block=block)
        # Keep only the side(s) that actually produced caption text.
        if top_cap:
            above.append(top_cap)
        if bottom_cap:
            below.append(bottom_cap)

    whitespace_re = re.compile(r"\s*[\u202f\u2002\u2009\u00A0]\s*")  # \u2002 \u2009

    def _normalize(parts):
        joined = "\n".join(parts)
        return whitespace_re.sub(" ", joined).replace("\n", "")

    return _normalize(above), _normalize(below)
6 changes: 4 additions & 2 deletions gmft/algorithm/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,17 @@ def get_index_before_or_equal(self, point):
Return the index of the last change point that is <= the query point.
If no such point exists, return -1.
"""
idx = bisect_right(self.sorted_points, point, key=lambda x: x[0]) - 1
points = [x[0] for x in self.sorted_points]
idx = bisect_right(points, point) - 1
return idx

def get_index_after(self, point):
    """
    Return the index of the first change point that is strictly > the query point.
    If no such point exists, return len(self.sorted_points).
    """
    # bisect_right() only accepts key= on Python 3.10+, so the comparison keys
    # (first item of each entry) are extracted up front to stay compatible
    # with Python 3.9.
    points = [x[0] for x in self.sorted_points]
    return bisect_right(points, point)

def frequency(self, point):
"""
Expand Down
Loading