conjuncts · conjuncts · Jun 30, 2025 · May 24, 2025 · May 24, 2025 · May 24, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,46 @@
+name: Tests
+
+on:
+  push:
+    branches: [ main, dev ]
+  pull_request:
+    branches: [ main, dev ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install uv
+      uses: astral-sh/setup-uv@v3
+      with:
+        version: "latest"
+
+    - name: Cache uv dependencies
+      uses: actions/cache@v3
+      with:
+        path: |
+          .venv
+          .uv/cache
+        # Key on the Python version too: .venv is interpreter-specific and must not be shared across matrix jobs.
+        key: ${{ runner.os }}-py${{ matrix.python-version }}-uv-${{ hashFiles('**/uv.lock') }}
+        restore-keys: |
+          ${{ runner.os }}-py${{ matrix.python-version }}-uv-
+
+    - name: Install dependencies
+      run: |
+        uv sync --group dev
+
+    - name: Run tests
+      run: |
+        uv run pytest test/
diff --git a/.gitignore b/.gitignore
@@ -1,17 +1,20 @@
 __pycache__
-legacy
+# Leading slash anchors the pattern to the repository root ("./" prefixes never match in gitignore).
+/legacy
 dist
 .vscode
 .pytest_cache
 .idea
 .venv*
-test/outputs/histogram*
 test/refs/attn/*
 test/holdout
-test/outputs/nmr
-test/outputs/ditr*
 plans
 docs/build
 notebooks
 support_arena/*
-test/outputs/actual
+data/test/outputs
+experiments
+TODO.md
+.coverage
+coverage.xml
+
diff --git a/data/test/references/img/pdf2_t2.png b/data/test/references/img/pdf2_t2.png
diff --git a/test/outputs/tiny_cropped_positions.tsv b/data/test/references/tiny_cropped_positions.tsv
diff --git a/dev.md b/dev.md
diff --git a/gmft/__init__.py b/gmft/__init__.py
@@ -42,9 +42,7 @@ def _deprecation_warning(name):
     has_warned = True
 
 
-# These are fine, but discouraged.
-# they are relatively light classes; also,
-# Needed out of fear that isinstance() calls will fail
+# These small classes are fine, but still discouraged.
 # Rect
 # BasePDFDocument
 # BasePage
@@ -140,22 +138,3 @@ class AutoTableDetector(AutoTableDetectorOrig):
     def __init__(self, *args, **kwargs):
         _deprecation_warning("AutoTableDetector")
         super().__init__(*args, **kwargs)
-
-
-# Rect = LazyHouse.Rect
-# BasePDFDocument = LazyHouse.BasePDFDocument
-# BasePage = LazyHouse.BasePage
-# CroppedTable = LazyHouse.CroppedTable
-# RotatedCroppedTable = LazyHouse.RotatedCroppedTable
-# TATRTableDetector = LazyHouse.TATRTableDetector
-# TableDetectorConfig = LazyHouse.TableDetectorConfig
-# TableDetector = LazyHouse.TableDetector
-# FormattedTable = LazyHouse.FormattedTable
-# TATRFormatConfig = LazyHouse.TATRFormatConfig
-# TATRFormattedTable = LazyHouse.TATRFormattedTable
-# TATRTableFormatter = LazyHouse.TATRTableFormatter
-
-
-# AutoTableFormatter = AccessTracker(lambda x: gmft_aliases.AutoTableFormatter)
-# AutoFormatConfig = AccessTracker(lambda x: gmft_aliases.AutoFormatConfig)
-# AutoTableDetector = AccessTracker(lambda x: gmft_aliases.AutoTableDetector)
diff --git a/gmft/_dataclasses.py b/gmft/_dataclasses.py
diff --git a/gmft/_rich_text/__init__.py b/gmft/_rich_text/__init__.py
@@ -1,3 +1,7 @@
 """
 Note: this module is experimental and may be subject to refactors and changes.
 """
+
+raise ImportError(
+    "gmft._rich_text was experimental. Functionality has been moved to gmft.formatters.page.auto.AutoPageFormatter."
+)
diff --git a/gmft/_rich_text/common.py b/gmft/_rich_text/common.py
diff --git a/gmft/algorithm/captions.py b/gmft/algorithm/captions.py
@@ -98,13 +98,8 @@ def _find_captions(
     :param stop_y_factor_above: if the top caption gets taller than stop_y_factor_above * caption_word_height, we stop. This is intended to eliminate paragraphs.
     :param stop_y_factor_below: if the bottom caption gets taller than stop_y_factor_below * caption_word_height, we stop. This is intended to eliminate paragraphs.
     """
-    # if max_gap_space is None:
-    # word_height = ct.predicted_word_height()
-    # max_gap_space = ct.predicted_word_height() * 2.5
-    # maximum_supported_rows=5
     if margin is None:
         margin = (50, 50, 0, 50)  # d_xmin, d_ymin, d_xmax, d_ymax to look for captions
-    # search_rect = Rect(ct.rect.xmin - margin[0], ct.rect.ymin - margin[1], ct.rect.xmax + margin[2], ct.rect.ymax + margin[3])
 
     midpoint = (ct.rect.ymax + ct.rect.ymin) / 2
 
@@ -287,60 +282,3 @@ def _find_captions(
         captions.append(caption)
 
     return (captions[0], captions[1])  # [caption_above, caption_below]
-
-
-def _detect_caption_with_mu(
-    table_bbox: Tuple[float, float, float, float],
-    block: Tuple[float, float, float, float, str],
-    max_abs_dist: float = 2.5,
-) -> Tuple[str, str]:
-    x1, y1, x2, y2 = block[:4]
-    text = block[4]
-
-    # Block in PyMupdf can consist of multiple lines of text
-    n_lines = text.count("\n") + 1
-
-    normalized_dist = 1000
-    top_caption, bottom_caption = "", ""
-
-    # Take care of captions above the table
-    if y2 < table_bbox[1]:  # block in question is above the table
-        # Normalized distance = how many word "lines" this current sentence is from the table
-        normalized_dist = (y2 - table_bbox[1]) / ((y2 - y1) / n_lines)
-        if abs(normalized_dist) < max_abs_dist:
-            top_caption = block[4]
-
-    # Take care of captions below the table
-    elif y1 > table_bbox[3]:  # block in question is below the table
-        normalized_dist = (y1 - table_bbox[3]) / ((y2 - y1) / n_lines)
-        if abs(normalized_dist) < max_abs_dist:
-            bottom_caption = block[4]
-    return top_caption, bottom_caption
-
-
-# Extract captions using PyMuPDF, assumes we have table bbox
-
-
-def _find_caption_with_mu(ct: CroppedTable, **kwargs):
-    # import gmft_pymupdf
-    import pymupdf
-
-    page = ct.page.page  # type: pymupdf.TextPage
-    blocks = page.get_text_blocks()
-
-    top_captions, bottom_captions = [], []
-    for block in blocks:
-        top_cap, bottom_cap = _detect_caption_with_mu(table_bbox=ct.bbox, block=block)
-        top_captions.append(top_cap)
-        bottom_captions.append(bottom_cap)
-
-    top_captions = "\n".join([c for c in top_captions if c])  # clear out empty captions
-    bottom_captions = "\n".join([c for c in bottom_captions if c])
-
-    whitespace_re = re.compile(r"\s*[\u202f\u2002\u2009\u00A0]\s*")  #  \u2002 \u2009
-    top_captions = re.sub(whitespace_re, " ", top_captions).replace("\n", "")
-    bottom_captions = re.sub(whitespace_re, " ", bottom_captions).replace("\n", "")
-    return (
-        top_captions,
-        bottom_captions,
-    )  # ('\n'.join(top_captions), '\n'.join(bottom_captions))
diff --git a/gmft/algorithm/histogram.py b/gmft/algorithm/histogram.py
@@ -30,15 +30,17 @@ def get_index_before_or_equal(self, point):
         Return the index of the last change point that is <= the query point.
         If no such point exists, return -1.
         """
-        idx = bisect_right(self.sorted_points, point, key=lambda x: x[0]) - 1
+        points = [x[0] for x in self.sorted_points]
+        idx = bisect_right(points, point) - 1
         return idx
 
     def get_index_after(self, point):
         """
         Return the index of the first change point that is strictly > the query point.
         If no such point exists, return len(self.sorted_points).
         """
-        return bisect_right(self.sorted_points, point, key=lambda x: x[0])
+        points = [x[0] for x in self.sorted_points]
+        return bisect_right(points, point)
 
     def frequency(self, point):
         """