diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..26ca824 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,45 @@ +name: Tests + +on: + push: + branches: [ main, dev ] + pull_request: + branches: [ main, dev ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + version: "latest" + + - name: Cache uv dependencies + uses: actions/cache@v3 + with: + path: | + .venv + .uv/cache + key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }} + restore-keys: | + ${{ runner.os }}-uv- + + - name: Install dependencies + run: | + uv sync --group dev + + - name: Run tests + run: | + uv run pytest test/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index e849813..189cb88 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ __pycache__ -legacy +/legacy dist .vscode .pytest_cache @@ -14,3 +14,6 @@ support_arena/* data/test/outputs experiments TODO.md +.coverage +coverage.xml + diff --git a/data/test/references/img/pdf2_t2.png b/data/test/references/img/pdf2_t2.png new file mode 100644 index 0000000..8792185 Binary files /dev/null and b/data/test/references/img/pdf2_t2.png differ diff --git a/gmft/algorithm/structure.py b/gmft/algorithm/structure.py index 63483e2..ed53969 100644 --- a/gmft/algorithm/structure.py +++ b/gmft/algorithm/structure.py @@ -7,6 +7,11 @@ from gmft.base import Rect from typing import TYPE_CHECKING +from gmft.core.ml.prediction import ( + _empty_effective_predictions, + _empty_indices_predictions, +) + if TYPE_CHECKING: from gmft.impl.tatr.config import TATRFormatConfig from gmft.formatters.tatr import TATRFormattedTable @@ -772,7 +777,7 @@ def extract_to_df(table: 
TATRFormattedTable, config: TATRFormatConfig = None): outliers = {} # store table-wide information about outliers or pecularities - results = table.fctn_results + results = table.predictions["tatr"] # 1. collate identified boxes boxes = [] @@ -894,14 +899,8 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None): if not known_means: # no text was detected outliers["no text"] = True - table.effective_rows = [] - table.effective_columns = [] - table.effective_headers = [] - table.effective_projecting = [] - table.effective_spanning = [] - table._top_header_indices = [] - table._projecting_indices = [] - table._hier_left_indices = [] + table.predictions["effective"] = _empty_effective_predictions() + table.predictions["indices"] = _empty_indices_predictions() table._df = pd.DataFrame() table.outliers = outliers return table._df @@ -941,12 +940,13 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None): ) # nms takes care of deduplication - - table.effective_rows = sorted_rows - table.effective_columns = sorted_columns - table.effective_headers = sorted_headers - table.effective_projecting = sorted_projecting - table.effective_spanning = spanning_cells + table.predictions["effective"] = { + "rows": sorted_rows, + "columns": sorted_columns, + "headers": sorted_headers, + "projecting": sorted_projecting, + "spanning": spanning_cells, + } # 4b. 
check for catastrophic overlap total_column_area = 0 @@ -1004,6 +1004,7 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None): ) # semantic spanning fill + indices_preds = {} if config.semantic_spanning_cells: sorted_headers_bboxes = [x["bbox"] for x in sorted_headers] sorted_row_bboxes = [x["bbox"] for x in sorted_rows] @@ -1037,15 +1038,15 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None): header_indices=header_indices, config=config, ) - table._hier_left_indices = hier_left_idxs + indices_preds["_hier_left"] = hier_left_idxs else: - table._hier_left_indices = [] # for the user + indices_preds["_hier_left"] = [] # for the user # technically these indices will be off by the number of header rows ;-; if config.enable_multi_header: - table._top_header_indices = header_indices + indices_preds["_top_header"] = header_indices else: - table._top_header_indices = [0] if header_indices else [] + indices_preds["_top_header"] = [0] if header_indices else [] # extract out the headers header_rows = table_array[header_indices] @@ -1078,7 +1079,9 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None): is_projecting = [ x for i, x in enumerate(is_projecting) if i not in header_indices ] - table._projecting_indices = [i for i, x in enumerate(is_projecting) if x] + indices_preds["_projecting"] = [i for i, x in enumerate(is_projecting) if x] + + table.predictions["indices"] = indices_preds # if projecting_indices: # insert at end diff --git a/gmft/auto.py b/gmft/auto.py index 80e81e2..ec4ccbf 100644 --- a/gmft/auto.py +++ b/gmft/auto.py @@ -22,6 +22,7 @@ TATRTableFormatter = TATRFormatter # TATRFormatConfig = TATRFormatConfig + class AutoTableFormatter: """ The recommended :class:`~gmft.formatters.base.BaseFormatter`. Currently points to :class:`~gmft.formatters.tatr.TATRFormatter`. 
@@ -29,8 +30,10 @@ class AutoTableFormatter: Using :meth:`extract`, a :class:`~gmft.formatters.base.FormattedTable` is produced, which can be exported to csv, df, etc. """ + def __new__(cls, *args, **kwargs): from gmft.formatters.tatr import TATRFormatter + return TATRFormatter(*args, **kwargs) @@ -38,8 +41,10 @@ class AutoFormatConfig: """ Configuration for the recommended :class:`~gmft.formatters.base.BaseFormatter`. Currently points to :class:`~gmft.formatters.tatr.TATRFormatConfig`. """ + def __new__(cls, *args, **kwargs): from gmft.impl.tatr.config import TATRFormatConfig + return TATRFormatConfig(*args, **kwargs) @@ -50,6 +55,8 @@ class AutoTableDetector: Using :meth:`~gmft.detectors.base.BaseDetector.extract` produces a :class:`~gmft.formatters.base.FormattedTable`, which can be exported to csv, df, etc. """ + def __new__(cls, *args, **kwargs): from gmft.detectors.tatr import TATRDetector + return TATRDetector(*args, **kwargs) diff --git a/gmft/base.py b/gmft/base.py index 261cfa6..8a50cd8 100644 --- a/gmft/base.py +++ b/gmft/base.py @@ -1,5 +1,5 @@ from typing import TypeVar, Union -from gmft.core.exceptions import DocumentClosedException +from gmft.core.exception import DocumentClosedException class Rect: diff --git a/gmft/core/_dataclasses.py b/gmft/core/_dataclasses.py index d4b1368..dca779d 100644 --- a/gmft/core/_dataclasses.py +++ b/gmft/core/_dataclasses.py @@ -56,78 +56,3 @@ def non_defaults_only(config: object) -> dict: if default_value != current_value: result[f.name] = current_value return result - - -import warnings - -string_types = (type(b""), type("")) - - -def removed_property(reason): - """ - Custom decorator for marking class properties as removed. - Automatically raises a DeprecationWarning when the property is accessed or set. - - See https://stackoverflow.com/questions/2536307/decorators-in-the-python-standard-lib-deprecated-specifically - """ - if isinstance(reason, string_types): - # The @deprecated is used with a 'reason'. - # - # .. 
code-block:: python - # - # @deprecated("please, use another function") - # def old_function(x, y): - # pass - - def decorator(func1): - if inspect.isclass(func1): - fmt1 = "Call to deprecated class {name} ({reason})." - else: - fmt1 = "Call to deprecated function {name} ({reason})." - - @functools.wraps(func1) - def new_func1(*args, **kwargs): - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - fmt1.format(name=func1.__name__, reason=reason), - category=DeprecationWarning, - stacklevel=2, - ) - warnings.simplefilter("default", DeprecationWarning) - return func1(*args, **kwargs) - - return new_func1 - - return decorator - - elif inspect.isclass(reason) or inspect.isfunction(reason): - # The @deprecated is used without any 'reason'. - # - # .. code-block:: python - # - # @deprecated - # def old_function(x, y): - # pass - - func2 = reason - - if inspect.isclass(func2): - fmt2 = "Call to deprecated class {name}." - else: - fmt2 = "Call to deprecated function {name}." - - @functools.wraps(func2) - def new_func2(*args, **kwargs): - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - fmt2.format(name=func2.__name__), - category=DeprecationWarning, - stacklevel=2, - ) - warnings.simplefilter("default", DeprecationWarning) - return func2(*args, **kwargs) - - return new_func2 - - else: - raise TypeError(repr(type(reason))) diff --git a/gmft/core/exceptions/__init__.py b/gmft/core/exception/__init__.py similarity index 100% rename from gmft/core/exceptions/__init__.py rename to gmft/core/exception/__init__.py diff --git a/gmft/core/reformat/__init__.py b/gmft/core/io/__init__.py similarity index 100% rename from gmft/core/reformat/__init__.py rename to gmft/core/io/__init__.py diff --git a/gmft/core/io/serial/dicts.py b/gmft/core/io/serial/dicts.py new file mode 100644 index 0000000..1969f1f --- /dev/null +++ b/gmft/core/io/serial/dicts.py @@ -0,0 +1,59 @@ +import copy +from typing import Optional +from gmft.core.ml.prediction import ( 
+ IndicesPredictions, + RawBboxPredictions, + _empty_indices_predictions, +) +from gmft.detectors.base import CroppedTable +from gmft.formatters.base import _normalize_bbox +from gmft.impl.tatr.config import TATRFormatConfig +from gmft.pdf_bindings.base import BasePage + + +def _extract_fctn_results(d: dict) -> RawBboxPredictions: + """ + Extract prediction["tatr"], formerly known as fctn_results + """ + if "fctn_results" not in d: + raise ValueError( + "fctn_results not found in dict -- dict may be a CroppedTable but not a TATRFormattedTable." + ) + + results = d["fctn_results"] # fix shallow copy issue + if ( + "fctn_scale_factor" in d + or "scale_factor" in d + or "fctn_padding" in d + or "padding" in d + ): + # deprecated: this is for backwards compatibility + scale_factor = d.get("fctn_scale_factor", d.get("scale_factor", 1)) + padding = d.get("fctn_padding", d.get("padding", (0, 0))) + padding = tuple(padding) + + # normalize results here + for i, bbox in enumerate(results["boxes"]): + results["boxes"][i] = _normalize_bbox( + bbox, used_scale_factor=scale_factor, used_padding=padding + ) + return results + + +def _extract_indices(d: dict) -> IndicesPredictions: + # version gmft>=0.5 format + if "predictions.indices" in d: + return d["predictions.indices"] + + # version gmft<0.5 format + if any( + x in d + for x in ["_hier_left_indices", "_top_header_indices", "_projecting_indices"] + ): + return { + "_projecting": d.get("_projecting_indices"), + "_hier_left": d.get("_hier_left_indices"), + "_top_header": d.get("_top_header_indices"), + } + + return _empty_indices_predictions() diff --git a/gmft/core/legacy/fctn_results.py b/gmft/core/legacy/fctn_results.py new file mode 100644 index 0000000..97ccb30 --- /dev/null +++ b/gmft/core/legacy/fctn_results.py @@ -0,0 +1,104 @@ +from gmft.core.ml.prediction import ( + RawBboxPredictions, + EffectivePredictions, + TablePredictions, +) +from typing_extensions import deprecated + + +class LegacyFctnResults: + """ + Small 
class to re-route old attribute names (e.g. fctn_results, effective_rows) to self.predictions. + """ + + predictions: TablePredictions + + @property + @deprecated("Use self.predictions['tatr']") + def fctn_results(self) -> RawBboxPredictions: + return self.predictions["tatr"] + + @fctn_results.setter + @deprecated("Use self.predictions['tatr']") + def fctn_results(self, value: RawBboxPredictions): + self.predictions["tatr"] = value + + @property + @deprecated("Use self.predictions['effective']") + def effective_rows(self): + return self.predictions["effective"]["rows"] + + @effective_rows.setter + @deprecated("Use self.predictions['effective']") + def effective_rows(self, value): + self.predictions["effective"]["rows"] = value + + @property + @deprecated("Use self.predictions['effective']") + def effective_columns(self): + return self.predictions["effective"]["columns"] + + @effective_columns.setter + @deprecated("Use self.predictions['effective']") + def effective_columns(self, value): + self.predictions["effective"]["columns"] = value + + @property + @deprecated("Use self.predictions['effective']") + def effective_headers(self): + return self.predictions["effective"]["headers"] + + @effective_headers.setter + @deprecated("Use self.predictions['effective']") + def effective_headers(self, value): + self.predictions["effective"]["headers"] = value + + @property + @deprecated("Use self.predictions['effective']") + def effective_projecting(self): + return self.predictions["effective"]["projecting"] + + @effective_projecting.setter + @deprecated("Use self.predictions['effective']") + def effective_projecting(self, value): + self.predictions["effective"]["projecting"] = value + + @property + @deprecated("Use self.predictions['effective']") + def effective_spanning(self): + return self.predictions["effective"]["spanning"] + + @effective_spanning.setter + @deprecated("Use self.predictions['effective']") + def effective_spanning(self, value): + self.predictions["effective"]["spanning"] = value + + @property + @deprecated("Use 
self.predictions['indices']['_top_header']") + def _top_header_indices(self): + return self.predictions["indices"].get("_top_header") + + @_top_header_indices.setter + @deprecated("Use self.predictions['indices']['_top_header']") + def _top_header_indices(self, value): + self.predictions["indices"]["_top_header"] = value + + @property + @deprecated("Use self.predictions['indices']['_projecting']") + def _projecting_indices(self): + return self.predictions["indices"].get("_projecting") + + @_projecting_indices.setter + @deprecated("Use self.predictions['indices']['_projecting']") + def _projecting_indices(self, value): + self.predictions["indices"]["_projecting"] = value + + @property + @deprecated("Use self.predictions['indices']['_hier_left']") + def _hier_left_indices(self): + return self.predictions["indices"].get("_hier_left") + + @_hier_left_indices.setter + @deprecated("Use self.predictions['indices']['_hier_left']") + def _hier_left_indices(self, value): + self.predictions["indices"]["_hier_left"] = value diff --git a/gmft/core/legacy/removed_config.py b/gmft/core/legacy/removed_config.py new file mode 100644 index 0000000..0625d1d --- /dev/null +++ b/gmft/core/legacy/removed_config.py @@ -0,0 +1,68 @@ +from typing_extensions import deprecated + + +class LegacyRemovedConfig: + """ + This class contains legacy configuration settings that will soon be removed. + """ + + # ---- deprecated ---- + # aggregate_spanning_cells = False + @property + @deprecated("This config setting is unused and will be removed in v0.6.0") + def aggregate_spanning_cells(self): + raise DeprecationWarning( + "aggregate_spanning_cells has been removed. Will break in v0.6.0." + ) + + @aggregate_spanning_cells.setter + @deprecated("This config setting is unused and will be removed in v0.6.0") + def aggregate_spanning_cells(self, value): + raise DeprecationWarning( + "aggregate_spanning_cells has been removed. Will break in v0.6.0." 
+ ) + + # corner_clip_outlier_threshold = 0.1 + # """"corner clip" is when the text is clipped by a corner, and not an edge""" + @property + @deprecated("This config setting is unused and will be removed in v0.6.0") + def corner_clip_outlier_threshold(self): + raise DeprecationWarning( + "corner_clip_outlier_threshold has been removed. Will break in v0.6.0." + ) + + @corner_clip_outlier_threshold.setter + @deprecated("This config setting is unused and will be removed in v0.6.0") + def corner_clip_outlier_threshold(self, value): + raise DeprecationWarning( + "corner_clip_outlier_threshold has been removed. Will break in v0.6.0." + ) + + # spanning_cell_minimum_width = 0.6 + @property + @deprecated("This config setting is unused and will be removed in v0.6.0") + def spanning_cell_minimum_width(self): + raise DeprecationWarning( + "spanning_cell_minimum_width has been removed. Will break in v0.6.0." + ) + + @spanning_cell_minimum_width.setter + @deprecated("This config setting is unused and will be removed in v0.6.0") + def spanning_cell_minimum_width(self, value): + raise DeprecationWarning( + "spanning_cell_minimum_width has been removed. Will break in v0.6.0." + ) + + @property + @deprecated("This config setting is unused and will be removed in v0.6.0") + def deduplication_iob_threshold(self): + raise DeprecationWarning( + "deduplication_iob_threshold is deprecated. See nms_overlap_threshold instead. Will break in v0.6.0." + ) + + @deduplication_iob_threshold.setter + @deprecated("This config setting is unused and will be removed in v0.6.0") + def deduplication_iob_threshold(self, value): + raise DeprecationWarning( + "deduplication_iob_threshold is deprecated. See nms_overlap_threshold instead. Will break in v0.6.0." 
+ ) diff --git a/gmft/core/ml/__init__.py b/gmft/core/ml/__init__.py index 0c2f6cd..ed06c67 100644 --- a/gmft/core/ml/__init__.py +++ b/gmft/core/ml/__init__.py @@ -1,11 +1,14 @@ from typing import TYPE_CHECKING, Literal, Union -def _resolve_device(device: Union[Literal["cpu", "cuda", "auto"], str]) -> Literal["cpu", "cuda"]: +def _resolve_device( + device: Union[Literal["cpu", "cuda", "auto"], str], +) -> Literal["cpu", "cuda"]: """ Lazy resolve the device when needed (without importing torch at the top level). """ - if device == 'auto': + if device == "auto": import torch - return 'cuda' if torch.cuda.is_available() else 'cpu' - return device \ No newline at end of file + + return "cuda" if torch.cuda.is_available() else "cpu" + return device diff --git a/gmft/core/ml/prediction/__init__.py b/gmft/core/ml/prediction/__init__.py new file mode 100644 index 0000000..08b070f --- /dev/null +++ b/gmft/core/ml/prediction/__init__.py @@ -0,0 +1,119 @@ +from typing import Optional, Tuple, TypedDict, List, Union +from typing_extensions import NotRequired + + +# Type definitions for predictions structure +class RawBboxPredictions(TypedDict): + """Type definition for a single model's bbox prediction output.""" + + scores: List[float] + labels: List[int] + boxes: List[List[float]] + + +class BboxPrediction(TypedDict): + confidence: float + label: str + bbox: Tuple[float, float, float, float] + + +class EffectivePredictions(TypedDict): + """ + Effective rows/columns/etc as seen by the image --> df algorithm. + + May be postprocessed from the table structure recognition model of choice (ie. TATR). + """ + + rows: List[BboxPrediction] + + columns: List[BboxPrediction] + + headers: List[BboxPrediction] + + projecting: List[BboxPrediction] + "Projected rows as seen by the image --> df algorithm." + + spanning: List[BboxPrediction] + "Spanning cells as seen by the image --> df algorithm." 
+ + +class IndicesPredictions(TypedDict): + """ + Indices of key rows/columns, such as: top header, projecting, hier_left. + """ + + _top_header: NotRequired[List[int]] + _projecting: NotRequired[List[int]] + _hier_left: NotRequired[List[int]] + + +class TablePredictions(TypedDict): + """Type definition for the complete predictions dictionary.""" + + tatr: RawBboxPredictions + + effective: EffectivePredictions + indices: IndicesPredictions + + +def _empty_effective_predictions(): + return { + "rows": [], + "columns": [], + "headers": [], + "projecting": [], + "spanning": [], + } + + +def _empty_indices_predictions(): + return {} + + +# predictions: Predictions = { +# "tatr": { +# "scores": [ +# 0.9999045133590698, +# 0.9998310804367065, +# 0.9999147653579712, +# 0.9998205304145813, +# 0.9999688863754272, +# 0.9998650550842285, +# 0.9998096823692322, +# 0.9897574186325073, +# 0.9998759031295776, +# ], +# "labels": [2, 2, 1, 2, 1, 1, 2, 3, 0], +# "boxes": [ +# [ +# 71.36495971679688, +# 159.0726318359375, +# 797.0186767578125, +# 206.53753662109375, +# ], +# [ +# 70.94971466064453, +# 110.53954315185547, +# 797.128173828125, +# 158.9207000732422, +# ], +# [71.17463684082031, 73.58935546875, 329.6531677246094, 244.5222625732422], +# [71.1388931274414, 73.6107177734375, 797.3575439453125, 109.99236297607422], +# [331.3564147949219, 73.64269256591797, 576.944091796875, 244.3546905517578], +# [ +# 575.6424560546875, +# 73.62675476074219, +# 797.5115356445312, +# 244.22035217285156, +# ], +# [71.27164459228516, 206.5450439453125, 796.82958984375, 244.68435668945312], +# [ +# 71.13404083251953, +# 73.61981964111328, +# 797.3654174804688, +# 109.93215942382812, +# ], +# [71.12321472167969, 73.54254150390625, 797.08642578125, 244.42941284179688], +# ], +# } +# } diff --git a/gmft/detectors/base.py b/gmft/detectors/base.py index e5f57ee..ef7b29e 100644 --- a/gmft/detectors/base.py +++ b/gmft/detectors/base.py @@ -67,6 +67,8 @@ def __init__( bbox: Union[tuple[int, int, int, 
int], Rect], confidence_score: float = 1.0, label=0, + *, + angle: Literal[0, 90, 180, 270] = 0, ): """ Construct a CroppedTable object. @@ -93,6 +95,10 @@ def __init__( self._word_height = None self._captions = None + self.angle = angle + if angle not in [0, 90, 180, 270]: + raise ValueError("Only 0, 90, 180, 270 are supported.") + def image( self, dpi: int = None, @@ -138,10 +144,15 @@ def image( img = self.page.get_image(dpi=dpi, rect=rect) if padding is not None: img = PIL.ImageOps.expand(img, padding, fill="white") + + if self.angle != 0: + # rotate by negative angle to get back to original orientation + img = img.rotate(-self.angle, expand=True) self._img = img self._img_dpi = dpi self._img_padding = padding self._img_margin = margin + return self._img def text_positions( @@ -152,24 +163,52 @@ def text_positions( Any words that intersect the table are captured, even if they are not fully contained. - :param remove_table_offset: if True, the positions are adjusted to be relative to the top-left corner of the table. + :param remove_table_offset: if True, the coordinates are transformed (rotated and translated) so that the top-left corner of the table is (0, 0) and the bottom-right corner is (width, height). + If False, transforms (including rotation) are ignored and original coordinates are returned. :param outside: if True, returns the **complement** of the table: all the text positions outside the table. - By default, it returns the text positions inside the table. 
+ (default: False) :return: list of text positions, which is a tuple ``(x0, y0, x1, y1, "string")`` """ - for w in self.page.get_positions_and_text(): - if Rect(w[:4]).is_intersecting(self.rect) != outside: - if remove_table_offset: - yield ( - w[0] - self.rect.xmin, - w[1] - self.rect.ymin, - w[2] - self.rect.xmin, - w[3] - self.rect.ymin, - w[4], - ) - else: - yield w + + def _old_generator(remove_table_offset, outside): + for w in self.page.get_positions_and_text(): + if Rect(w[:4]).is_intersecting(self.rect) != outside: + if remove_table_offset: + yield ( + w[0] - self.rect.xmin, + w[1] - self.rect.ymin, + w[2] - self.rect.xmin, + w[3] - self.rect.ymin, + w[4], + ) + else: + yield w + + if self.angle == 0 or remove_table_offset == False: + yield from _old_generator( + remove_table_offset=remove_table_offset, outside=outside + ) + elif self.angle == 90: + for w in _old_generator(remove_table_offset=True, outside=outside): + x0, y0, x1, y1, text = w + x0, y0, x1, y1 = self.rect.height - y1, x0, self.rect.height - y0, x1 + yield (x0, y0, x1, y1, text) + elif self.angle == 180: + for w in _old_generator(remove_table_offset=True, outside=outside): + x0, y0, x1, y1, text = w + x0, y0, x1, y1 = ( + self.rect.width - x1, + self.rect.height - y1, + self.rect.width - x0, + self.rect.height - y0, + ) + yield (x0, y0, x1, y1, text) + elif self.angle == 270: + for w in _old_generator(remove_table_offset=True, outside=outside): + x0, y0, x1, y1, text = w + x0, y0, x1, y1 = y0, self.rect.width - x1, y1, self.rect.width - x0 + yield (x0, y0, x1, y1, text) def text(self): """ @@ -269,6 +308,8 @@ def to_dict(self): "confidence_score": self.confidence_score, "label": self.label, } + if self.angle != 0: + obj["angle"] = self.angle return obj @staticmethod @@ -297,10 +338,14 @@ def from_dict( :param page: BasePage :return: CroppedTable object """ - if "angle" in d: + if "angle" in d and d["angle"] != 0: return RotatedCroppedTable.from_dict(d, page) table = CroppedTable( - page, 
d["bbox"], d.get("confidence_score", 1.0), d.get("label", 0) + page, + d["bbox"], + d.get("confidence_score", 1.0), + label=d.get("label", 0), + angle=d.get("angle", 0), ) table._captions = d.get("captions", []) return table @@ -327,10 +372,14 @@ def bbox(self): @property def width(self): + if self.angle == 90 or self.angle == 270: + return self.rect.height return self.rect.width @property def height(self): + if self.angle == 90 or self.angle == 270: + return self.rect.width return self.rect.height @@ -374,7 +423,10 @@ class RotatedCroppedTable(CroppedTable): Currently, only 0, 90, 180, and 270 degree rotations are supported. An angle of 90 would mean that a 90 degree cc rotation has been applied to a level image. - In practice, the majority of rotated tables are rotated by 90 degrees. + In practice, most rotated tables are rotated by 90 degrees. + + Note: after v0.5, this class is nearly identical to CroppedTable. `angle` is now directly available in CroppedTable. + """ def __init__( @@ -385,84 +437,8 @@ def __init__( angle: float, label=0, ): - """ - Currently, only 0, 90, 180, and 270 degree rotations are supported. - - :param page: BasePage - :param angle: angle in degrees, counterclockwise. - That is, 90 would mean that a 90 degree cc rotation has been applied to a level image. - In practice, the majority of rotated tables are rotated by 90 degrees. - - """ - super().__init__(page, bbox, confidence_score, label) - - if angle not in [0, 90, 180, 270]: - raise ValueError("Only 0, 90, 180, 270 are supported.") - self.angle = angle - - def image( - self, - dpi: int = None, - padding: Union[tuple[int, int, int, int], Literal["auto", None]] = None, - margin: Union[tuple[int, int, int, int], Literal["auto", None]] = None, - **kwargs, - ) -> PILImage: - """ - Return the image of the cropped table. 
- - """ - img = super().image(dpi=dpi, padding=padding, margin=margin, **kwargs) - # if self.angle == 90: - if self.angle != 0: - # rotate by negative angle to get back to original orientation - img = img.rotate(-self.angle, expand=True) - - return img - - def text_positions( - self, remove_table_offset: bool = False, outside: bool = False - ) -> Generator[tuple[int, int, int, int, str], None, None]: - """ - Return the text positions of the cropped table. - - If remove_table_offset is False, positions are relative to the top-left corner of the pdf (no adjustment for rotation). - - If remove_table_offset is True, positions are relative to a hypothetical pdf where the text in the table is perfectly level, and - pdf's top-left corner is also the table's top-left corner (both at 0, 0). - - :param remove_table_offset: if True, the positions are adjusted to be relative to the top-left corner of the table. - :param outside: if True, returns the **complement** of the table: all the text positions outside the table. 
- :return: list of text positions, which are tuples of (xmin, ymin, xmax, ymax, "string") - """ - if self.angle == 0 or remove_table_offset == False: - yield from super().text_positions( - remove_table_offset=remove_table_offset, outside=outside - ) - elif self.angle == 90: - for w in super().text_positions(remove_table_offset=True, outside=outside): - x0, y0, x1, y1, text = w - x0, y0, x1, y1 = self.rect.height - y1, x0, self.rect.height - y0, x1 - yield (x0, y0, x1, y1, text) - elif self.angle == 180: - for w in super().text_positions(remove_table_offset=True, outside=outside): - x0, y0, x1, y1, text = w - x0, y0, x1, y1 = ( - self.rect.width - x1, - self.rect.height - y1, - self.rect.width - x0, - self.rect.height - y0, - ) - yield (x0, y0, x1, y1, text) - elif self.angle == 270: - for w in super().text_positions(remove_table_offset=True, outside=outside): - x0, y0, x1, y1, text = w - x0, y0, x1, y1 = y0, self.rect.width - x1, y1, self.rect.width - x0 - yield (x0, y0, x1, y1, text) - - def to_dict(self): - d = super().to_dict() - d["angle"] = self.angle - return d + # NOTE: angle and label are permuted (historical artifact) + super().__init__(page, bbox, confidence_score, label, angle=angle) @staticmethod def from_dict( @@ -474,19 +450,7 @@ def from_dict( if "angle" not in d: return CroppedTable.from_dict(d, page) table = RotatedCroppedTable( - page, d["bbox"], d["confidence_score"], d["angle"], d["label"] + page, d["bbox"], d["confidence_score"], angle=d["angle"], label=d["label"] ) table._captions = d.get("captions", []) return table - - @property - def width(self): - if self.angle == 90 or self.angle == 270: - return self.rect.height - return self.rect.width - - @property - def height(self): - if self.angle == 90 or self.angle == 270: - return self.rect.width - return self.rect.height diff --git a/gmft/formatters/base.py b/gmft/formatters/base.py index 0077f78..b6c809f 100644 --- a/gmft/formatters/base.py +++ b/gmft/formatters/base.py @@ -2,6 +2,7 @@ import 
pandas as pd +from gmft.core.ml.prediction import TablePredictions from gmft.pdf_bindings.base import BasePage from gmft.detectors.base import CroppedTable, RotatedCroppedTable @@ -14,6 +15,8 @@ class FormattedTable(RotatedCroppedTable): Warning: This class is not meant to be instantiated directly. Use a :class:`.TableFormatter` to convert a :class:`.CroppedTable` to a :class:`.FormattedTable`. """ + predictions: TablePredictions + def __init__(self, cropped_table: CroppedTable, df: pd.DataFrame = None): self._df = df diff --git a/gmft/formatters/ditr.py b/gmft/formatters/ditr.py index 461b0f5..e2df8a6 100644 --- a/gmft/formatters/ditr.py +++ b/gmft/formatters/ditr.py @@ -12,7 +12,13 @@ _ioa, get_good_between_dividers, ) +from gmft.core.io.serial.dicts import _extract_fctn_results, _extract_indices +from gmft.core.legacy.fctn_results import LegacyFctnResults from gmft.core.ml import _resolve_device +from gmft.core.ml.prediction import ( + _empty_effective_predictions, + _empty_indices_predictions, +) from gmft.detectors.base import CroppedTable, RotatedCroppedTable from gmft.impl.ditr.config import DITRFormatConfig from gmft.formatters.base import FormattedTable, TableFormatter, _normalize_bbox @@ -34,7 +40,7 @@ from transformers import DetrForObjectDetection -class DITRFormattedTable(HistogramFormattedTable): +class DITRFormattedTable(HistogramFormattedTable, LegacyFctnResults): """ FormattedTable, as seen by a Table Transformer for dividers (dubbed DITR). See :class:`.DITRTableFormatter`. @@ -54,19 +60,6 @@ class DITRFormattedTable(HistogramFormattedTable): config: DITRFormatConfig outliers: dict[str, bool] - effective_headers: list[tuple] - "Headers as seen by the image --> df algorithm." - - effective_projecting: list[tuple] - "Projected rows as seen by the image --> df algorithm." - - effective_spanning: list[tuple] - "Spanning cells as seen by the image --> df algorithm." 
- - _top_header_indices: list[int] = None - _projecting_indices: list[int] = None - _hier_left_indices: list[int] = None - def __init__( self, cropped_table: CroppedTable, @@ -77,7 +70,11 @@ def __init__( super(DITRFormattedTable, self).__init__( cropped_table, None, irvl_results, config=config ) - self.fctn_results = fctn_results + self.predictions = { + "tatr": fctn_results, + "effective": _empty_effective_predictions(), + "indices": _empty_indices_predictions(), + } if config is None: config = DITRFormatConfig() @@ -87,7 +84,7 @@ def __init__( def df(self, recalculate=False, config_overrides: DITRFormatConfig = None): """ Return the table as a pandas dataframe. - :param recalculate: by default, the dataframe is cached + :param recalculate: by default, the dataframe is cached. DEPRECATED: use recompute() instead. :param config_overrides: override the config settings for this call only """ if recalculate != False: @@ -113,8 +110,6 @@ def visualize(self, **kwargs): Visualize the cropped table. 
""" img = self.image() - # labels = self.fctn_results['labels'] - # bboxes = self.fctn_results['boxes'] tbl_width = self.width # adjust for rotations too tbl_height = self.height @@ -126,13 +121,13 @@ def visualize(self, **kwargs): for y0, y1 in self.irvl_results["row_dividers"]: bboxes.append([0, y0, tbl_width, y1]) labels.append(2) - for x0, y0, x1, y1 in self.effective_headers: + for x0, y0, x1, y1 in self.predictions["effective"]["headers"]: bboxes.append([x0, y0, x1, y1]) labels.append(3) - for x0, y0, x1, y1 in self.effective_projecting: + for x0, y0, x1, y1 in self.predictions["effective"]["projecting"]: bboxes.append([x0, y0, x1, y1]) labels.append(4) - for x0, y0, x1, y1 in self.effective_spanning: + for x0, y0, x1, y1 in self.predictions["effective"]["spanning"]: bboxes.append([x0, y0, x1, y1]) labels.append(5) return plot_shaded_boxes(img, labels=labels, boxes=bboxes, **kwargs) @@ -146,18 +141,14 @@ def to_dict(self): else: parent = CroppedTable.to_dict(self) optional = {} - if self._projecting_indices is not None: - optional["_projecting_indices"] = self._projecting_indices - if self._hier_left_indices is not None: - optional["_hier_left_indices"] = self._hier_left_indices - if self._top_header_indices is not None: - optional["_top_header_indices"] = self._top_header_indices + if self.predictions["indices"]: + optional["predictions.indices"] = self.predictions["indices"] return { **parent, **{ "config": non_defaults_only(self.config), "outliers": self.outliers, - "fctn_results": self.fctn_results, + "fctn_results": self.predictions["tatr"], }, **optional, } @@ -171,31 +162,10 @@ def from_dict(d: dict, page: BasePage): d = copy.deepcopy(d) # don't modify the original dict cropped_table = CroppedTable.from_dict(d, page) - if "fctn_results" not in d: - raise ValueError( - "fctn_results not found in dict -- dict may be a CroppedTable but not a TATRFormattedTable." 
- ) + results = _extract_fctn_results(d) config = DITRFormatConfig(**d["config"]) - results = d["fctn_results"] # fix shallow copy issue - if ( - "fctn_scale_factor" in d - or "scale_factor" in d - or "fctn_padding" in d - or "padding" in d - ): - # deprecated: this is for backwards compatibility - scale_factor = d.get("fctn_scale_factor", d.get("scale_factor", 1)) - padding = d.get("fctn_padding", d.get("padding", (0, 0))) - padding = tuple(padding) - - # normalize results here - for i, bbox in enumerate(results["boxes"]): - results["boxes"][i] = _normalize_bbox( - bbox, used_scale_factor=scale_factor, used_padding=padding - ) - table = DITRFormattedTable( cropped_table, None, @@ -204,6 +174,7 @@ def from_dict(d: dict, page: BasePage): ) table.recompute() table.outliers = d.get("outliers", None) + table.predictions["indices"] = _extract_indices(d) return table @@ -463,7 +434,7 @@ def ditr_extract_to_df(table: DITRFormattedTable, config: DITRFormatConfig = Non outliers = {} # store table-wide information about outliers or pecularities - results = table.fctn_results + results = table.predictions["tatr"] row_divider_boxes, col_divider_boxes, top_headers, projected, spanning_cells = ( proportion_fctn_results(results, config) ) @@ -491,9 +462,13 @@ def ditr_extract_to_df(table: DITRFormattedTable, config: DITRFormatConfig = Non "row_dividers": row_divider_intervals, "col_dividers": col_divider_intervals, } - table.effective_headers = top_headers - table.effective_projecting = projected - table.effective_spanning = [span["bbox"] for span in spanning_cells] + table.predictions["effective"] = { + "rows": [], + "columns": [], + "headers": top_headers, + "projecting": projected, + "spanning": [span["bbox"] for span in spanning_cells], + } # table_bounds = table.bbox # empirical_table_bbox(row_divider_boxes, col_divider_boxes) fixed_table_bounds = (0, 0, table.width, table.height) # adjust for rotations too @@ -549,6 +524,7 @@ def ditr_extract_to_df(table: 
DITRFormattedTable, config: DITRFormatConfig = Non projecting_indices = [i for i in projecting_indices if i not in empty_rows] # semantic spanning fill + indices_preds = {} if config.semantic_spanning_cells: # TODO probably not worth it to duplicate the code old_rows = [(None, y0, None, y1) for y0, y1 in good_row_intervals] @@ -582,15 +558,15 @@ def ditr_extract_to_df(table: DITRFormattedTable, config: DITRFormatConfig = Non header_indices=header_indices, config=config, ) - table._hier_left_indices = hier_left_idxs + indices_preds["_hier_left"] = hier_left_idxs else: - table._hier_left_indices = [] # for the user + indices_preds["_hier_left"] = [] # for the user # technically these indices will be off by the number of header rows ;-; if config.enable_multi_header: - table._top_header_indices = header_indices + indices_preds["_top_header"] = header_indices else: - table._top_header_indices = [0] if header_indices else [] + indices_preds["_top_header"] = [0] if header_indices else [] # extract out the headers header_rows = table_array[header_indices] @@ -621,8 +597,9 @@ def ditr_extract_to_df(table: DITRFormattedTable, config: DITRFormatConfig = Non # remove the header_indices # note that ditr._determine_headers_and_projecting # automatically makes is_projecting and header_indices mutually exclusive - table._projecting_indices = [i for i, x in enumerate(is_projecting) if x] + indices_preds["_projecting"] = [i for i, x in enumerate(is_projecting) if x] + table.predictions["indices"] = indices_preds # b. 
drop the former header rows always table._df.drop(index=header_indices, inplace=True) diff --git a/gmft/formatters/tatr.py b/gmft/formatters/tatr.py index 5a29eb5..c56207c 100644 --- a/gmft/formatters/tatr.py +++ b/gmft/formatters/tatr.py @@ -1,8 +1,15 @@ import copy -from typing import Union +from typing import List, Union from gmft.core._dataclasses import non_defaults_only, with_config +from gmft.core.io.serial.dicts import _extract_fctn_results, _extract_indices +from gmft.core.legacy.fctn_results import LegacyFctnResults from gmft.core.ml import _resolve_device +from gmft.core.ml.prediction import ( + BboxPrediction, + _empty_effective_predictions, + _empty_indices_predictions, +) from gmft.detectors.base import CroppedTable, RotatedCroppedTable from gmft.impl.tatr.config import TATRFormatConfig from gmft.formatters.base import FormattedTable, TableFormatter, _normalize_bbox @@ -14,7 +21,7 @@ from gmft.table_visualization import plot_results_unwr -class TATRFormattedTable(FormattedTable): +class TATRFormattedTable(FormattedTable, LegacyFctnResults): """ FormattedTable, as seen by a Table Transformer (TATR). See :class:`.TATRTableFormatter`. @@ -44,25 +51,6 @@ class TATRFormattedTable(FormattedTable): config: TATRFormatConfig outliers: dict[str, bool] - effective_rows: list[tuple] - "Rows as seen by the image --> df algorithm, which may differ from what the table transformer sees." - - effective_columns: list[tuple] - "Columns as seen by the image --> df algorithm, which may differ from what the table transformer sees." - - effective_headers: list[tuple] - "Headers as seen by the image --> df algorithm." - - effective_projecting: list[tuple] - "Projected rows as seen by the image --> df algorithm." - - effective_spanning: list[tuple] - "Spanning cells as seen by the image --> df algorithm." 
- - _top_header_indices: list[int] = None - _projecting_indices: list[int] = None - _hier_left_indices: list[int] = None - def __init__( self, cropped_table: CroppedTable, @@ -70,7 +58,11 @@ def __init__( config: TATRFormatConfig = None, ): super(TATRFormattedTable, self).__init__(cropped_table) - self.fctn_results = fctn_results + self.predictions = { + "tatr": fctn_results, + "effective": _empty_effective_predictions(), + "indices": _empty_indices_predictions(), + } if config is None: config = TATRFormatConfig() @@ -129,13 +121,11 @@ def visualize( if effective: if self._df is None: self._df = self.df() - vis = ( - self.effective_rows - + self.effective_columns - + self.effective_headers - + self.effective_projecting - + self.effective_spanning - ) + vis: List[BboxPrediction] = [ + item + for sublist in self.predictions["effective"].values() + for item in sublist + ] boxes = [x["bbox"] for x in vis] boxes = [(x * scale_by for x in bbox) for bbox in boxes] _to_visualize = { @@ -146,12 +136,13 @@ def visualize( else: # transform functionalized coordinates into image coordinates boxes = [ - (x * scale_by for x in bbox) for bbox in self.fctn_results["boxes"] + (x * scale_by for x in bbox) + for bbox in self.predictions["tatr"]["boxes"] ] _to_visualize = { - "scores": self.fctn_results["scores"], - "labels": self.fctn_results["labels"], + "scores": self.predictions["tatr"]["scores"], + "labels": self.predictions["tatr"]["labels"], "boxes": boxes, } @@ -180,18 +171,14 @@ def to_dict(self): else: parent = CroppedTable.to_dict(self) optional = {} - if self._projecting_indices is not None: - optional["_projecting_indices"] = self._projecting_indices - if self._hier_left_indices is not None: - optional["_hier_left_indices"] = self._hier_left_indices - if self._top_header_indices is not None: - optional["_top_header_indices"] = self._top_header_indices + if self.predictions["indices"]: + optional["predictions.indices"] = self.predictions["indices"] return { **parent, **{ 
"config": non_defaults_only(self.config), "outliers": self.outliers, - "fctn_results": self.fctn_results, + "fctn_results": self.predictions["tatr"], }, **optional, } @@ -205,37 +192,16 @@ def from_dict(d: dict, page: BasePage): d = copy.deepcopy(d) # don't modify the original dict cropped_table = CroppedTable.from_dict(d, page) - if "fctn_results" not in d: - raise ValueError( - "fctn_results not found in dict -- dict may be a CroppedTable but not a TATRFormattedTable." - ) - + results = _extract_fctn_results(d) config = TATRFormatConfig(**d["config"]) - results = d["fctn_results"] # fix shallow copy issue - if ( - "fctn_scale_factor" in d - or "scale_factor" in d - or "fctn_padding" in d - or "padding" in d - ): - # deprecated: this is for backwards compatibility - scale_factor = d.get("fctn_scale_factor", d.get("scale_factor", 1)) - padding = d.get("fctn_padding", d.get("padding", (0, 0))) - padding = tuple(padding) - - # normalize results here - for i, bbox in enumerate(results["boxes"]): - results["boxes"][i] = _normalize_bbox( - bbox, used_scale_factor=scale_factor, used_padding=padding - ) - table = TATRFormattedTable( cropped_table, results, config=config, ) table.outliers = d.get("outliers", None) + table.predictions["indices"] = _extract_indices(d) return table diff --git a/gmft/impl/ditr/config.py b/gmft/impl/ditr/config.py index afbb053..76b7460 100644 --- a/gmft/impl/ditr/config.py +++ b/gmft/impl/ditr/config.py @@ -1,67 +1,24 @@ -from gmft.core._dataclasses import removed_property -from gmft.formatters.histogram import HistogramConfig +from dataclasses import dataclass +from typing import Literal +from typing_extensions import deprecated -from dataclasses import dataclass, field -from typing import Literal, Union +from gmft.formatters.histogram import HistogramConfig +from gmft.impl.tatr.config import TATRFormatConfig @dataclass -class DITRFormatConfig(HistogramConfig): +class DITRFormatConfig(HistogramConfig, TATRFormatConfig): """ Configuration for 
:class:`.DITRTableFormatter`. """ # ---- model settings ---- - warn_uninitialized_weights: bool = False - image_processor_path: str = ( - "microsoft/table-transformer-structure-recognition-v1.1-all" - ) formatter_path: str = "conjuncts/ditr-e15" - # no_timm: bool = True # use a model which uses AutoBackbone. - torch_device: Union[Literal["auto", "cpu", "cuda"], str] = "auto" - - verbosity: int = 1 - """ - 0: errors only\n - 1: print warnings\n - 2: print warnings and info\n - 3: print warnings, info, and debug - """ - - formatter_base_threshold: float = 0.3 - """Base threshold for the confidence demanded of a separating line. - - Since merged rows are generally harder to deal with than empty rows, a low threshold is usually - better, because then more separating lines are detected. - """ - - cell_required_confidence: dict = field( - default_factory=lambda: { - 0: 0.3, # table - 1: 0.3, # column - 2: 0.3, # row - 3: 0.3, # column header - 4: 0.5, # projected row header - 5: 0.5, # spanning cell - 6: 99, # no object - } - ) - """Confidences required (>=) for a row/column feature to be considered good. See DITRFormattedTable.id2label - - But low confidences may be better than too high confidence (see formatter_base_threshold) - """ - - # ---- df() settings ---- - - # ---- options ---- - - remove_null_rows: bool = True - """Remove rows with no text.""" enable_multi_header: bool = True """Enable multi-indices in the dataframe. - If false, then multiple headers will be merged column-wise.""" + If false, then multiple headers will be merged vertically.""" semantic_spanning_cells: bool = True """ @@ -86,25 +43,22 @@ class DITRFormatConfig(HistogramConfig): # hence nms is also not useful anymore. 
- @removed_property("Large table approach ({name}) is not used for the DITR model.") + @deprecated("Large table approach ({name}) is not used for the DITR model.") def large_table_if_n_rows_removed(self): pass - @removed_property("Large table approach ({name}) is not used for the DITR model.") + @deprecated("Large table approach ({name}) is not used for the DITR model.") def large_table_threshold(self): pass - @removed_property("Large table approach ({name}) is not used for the DITR model.") + @deprecated("Large table approach ({name}) is not used for the DITR model.") def large_table_row_overlap_threshold(self): pass - @removed_property("Large table approach ({name}) is not used for the DITR model.") + @deprecated("Large table approach ({name}) is not used for the DITR model.") def force_large_table_assumption(self): pass - large_table_maximum_rows: int = 1000 - """If the table predicts a large number of rows, refuse to proceed. Therefore prevent memory issues for super small text.""" - # ---- rejection and warnings ---- # note that the overlap metric is not useful anymore since separating lines are not @@ -112,56 +66,30 @@ def force_large_table_assumption(self): # hence nms is also not useful anymore. 
- @removed_property("Overlap ({name}) is not used for the DITR model.") + @deprecated("Overlap ({name}) is not used for the DITR model.") def total_overlap_reject_threshold(self): pass - @removed_property("Overlap ({name}) is not used for the DITR model.") + @deprecated("Overlap ({name}) is not used for the DITR model.") def total_overlap_warn_threshold(self): pass - @removed_property("Overlap (nms) ({name}) is not used for the DITR model.") + @deprecated("Overlap (nms) ({name}) is not used for the DITR model.") def nms_warn_threshold(self): pass - @removed_property("Overlap ({name}) is not used for the DITR model.") + @deprecated("Overlap ({name}) is not used for the DITR model.") def iob_reject_threshold(self): pass - @removed_property("Overlap ({name}) is not used for the DITR model.") + @deprecated("Overlap ({name}) is not used for the DITR model.") def iob_warn_threshold(self): pass # ---- technical ---- - _nms_overlap_threshold: float = 0.1 _nms_overlap_threshold_larger: float = 0.5 - @removed_property("Large table approach ({name}) is not used for the DITR model.") + @deprecated("Large table approach ({name}) is not used for the DITR model.") def _large_table_merge_distance(self): pass - - _smallest_supported_text_height: float = 0.1 - """The smallest supported text height. Text smaller than this height will be ignored. 
- Helps prevent very small text from creating huge arrays under large table assumption.""" - - # ---- deprecated ---- - # aggregate_spanning_cells = False - @removed_property - def aggregate_spanning_cells(self): - pass - - # corner_clip_outlier_threshold = 0.1 - # """"corner clip" is when the text is clipped by a corner, and not an edge""" - @removed_property - def corner_clip_outlier_threshold(self): - pass - - # spanning_cell_minimum_width = 0.6 - @removed_property - def spanning_cell_minimum_width(self): - pass - - @property - def deduplication_iob_threshold(self): - pass diff --git a/gmft/impl/tatr/config.py b/gmft/impl/tatr/config.py index f55d92a..3f806a9 100644 --- a/gmft/impl/tatr/config.py +++ b/gmft/impl/tatr/config.py @@ -3,10 +3,13 @@ from dataclasses import dataclass, field from typing import Literal, Union +from typing_extensions import deprecated + +from gmft.core.legacy.removed_config import LegacyRemovedConfig @dataclass -class TATRFormatConfig: +class TATRFormatConfig(LegacyRemovedConfig): """ Configuration for :class:`.TATRTableFormatter`. """ @@ -61,7 +64,7 @@ class TATRFormatConfig: enable_multi_header: bool = False """Enable multi-indices in the dataframe. - If false, then multiple headers will be merged column-wise.""" + If false, then multiple headers will be merged vertically.""" semantic_spanning_cells: bool = False """ @@ -140,56 +143,3 @@ class TATRFormatConfig: _smallest_supported_text_height: float = 0.1 """The smallest supported text height. Text smaller than this height will be ignored. Helps prevent very small text from creating huge arrays under large table assumption.""" - - # ---- deprecated ---- - # aggregate_spanning_cells = False - @property - def aggregate_spanning_cells(self): - raise DeprecationWarning( - "aggregate_spanning_cells has been removed. Will break in v0.6.0." 
- ) - - @aggregate_spanning_cells.setter - def aggregate_spanning_cells(self, value): - raise DeprecationWarning( - "aggregate_spanning_cells has been removed. Will break in v0.6.0." - ) - - # corner_clip_outlier_threshold = 0.1 - # """"corner clip" is when the text is clipped by a corner, and not an edge""" - @property - def corner_clip_outlier_threshold(self): - raise DeprecationWarning( - "corner_clip_outlier_threshold has been removed. Will break in v0.6.0." - ) - - @corner_clip_outlier_threshold.setter - def corner_clip_outlier_threshold(self, value): - raise DeprecationWarning( - "corner_clip_outlier_threshold has been removed. Will break in v0.6.0." - ) - - # spanning_cell_minimum_width = 0.6 - @property - def spanning_cell_minimum_width(self): - raise DeprecationWarning( - "spanning_cell_minimum_width has been removed. Will break in v0.6.0." - ) - - @spanning_cell_minimum_width.setter - def spanning_cell_minimum_width(self, value): - raise DeprecationWarning( - "spanning_cell_minimum_width has been removed. Will break in v0.6.0." - ) - - @property - def deduplication_iob_threshold(self): - raise DeprecationWarning( - "deduplication_iob_threshold is deprecated. See nms_overlap_threshold instead. Will break in v0.6.0." - ) - - @deduplication_iob_threshold.setter - def deduplication_iob_threshold(self, value): - raise DeprecationWarning( - "deduplication_iob_threshold is deprecated. See nms_overlap_threshold instead. Will break in v0.6.0." 
- ) diff --git a/gmft/pdf_bindings/pdfium.py b/gmft/pdf_bindings/pdfium.py index 767eb44..6e1b5a6 100644 --- a/gmft/pdf_bindings/pdfium.py +++ b/gmft/pdf_bindings/pdfium.py @@ -5,7 +5,7 @@ import pypdfium2 as pdfium from gmft.base import Rect -from gmft.core.exceptions import DocumentClosedException +from gmft.core.exception import DocumentClosedException from gmft.pdf_bindings.base import BasePDFDocument, BasePage, _infer_line_breaks from PIL.Image import Image as PILImage diff --git a/pyproject.toml b/pyproject.toml index 35b0c13..98f5aee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "pandas", "matplotlib", "tabulate>=0.9.0", + "typing_extensions>=4.9" ] [project.urls] @@ -65,6 +66,7 @@ ignore = ["E712"] [dependency-groups] dev = [ "pytest>=8.3.5", + "pytest-cov>=6.2.1", "ruff>=0.11.11", ] docs = [ diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c74dd3a --- /dev/null +++ b/pytest.ini @@ -0,0 +1,13 @@ +[pytest] +testpaths = test +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = + --strict-markers + --strict-config + --tb=short +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + integration: marks tests as integration tests + unit: marks tests as unit tests \ No newline at end of file diff --git a/test/formatters/ditr/test_df.py b/test/formatters/ditr/test_df.py index a4b73c9..dcb65cb 100644 --- a/test/formatters/ditr/test_df.py +++ b/test/formatters/ditr/test_df.py @@ -95,7 +95,7 @@ def test_bulk_pdf5_t0(self, pdf5_tables): pass # this one just doesn't work very well # TODO make it work based on minima # try_jth_table(pdf5_tables, 5, 0) - # assert pdf5_tables[0]._projecting_indices == [15, 18, 22, 29] + # assert pdf5_tables[0].predictions["indices"]["_projecting"] == [15, 18, 22, 29] def test_bulk_pdf5_t1(self, ditr_tables, ditr_csvs, docs_bulk): try_table("pdf5_t1", ditr_tables, ditr_csvs, docs_bulk[5 - 1]) diff --git 
a/test/formatters/histogram/test_df.py b/test/formatters/histogram/test_df.py index 46ab6b9..19b4d02 100644 --- a/test/formatters/histogram/test_df.py +++ b/test/formatters/histogram/test_df.py @@ -112,7 +112,7 @@ def test_bulk_pdf5_t0(self, pdf5_tables): pass # this one just doesn't work very well # TODO make it work based on minima # try_jth_table(pdf5_tables, 5, 0) - # assert pdf5_tables[0]._projecting_indices == [15, 18, 22, 29] + # assert pdf5_tables[0].predictions["indices"]["_projecting"] == [15, 18, 22, 29] def test_bulk_pdf5_t1(self, pdf5_tables, tatr_csvs): try_jth_table(pdf5_tables, tatr_csvs, 5, 1) diff --git a/test/formatters/tatr/test_df.py b/test/formatters/tatr/test_df.py index b076858..dfba1b5 100644 --- a/test/formatters/tatr/test_df.py +++ b/test/formatters/tatr/test_df.py @@ -92,11 +92,11 @@ def test_bulk_pdf2_t0(self, pdf2_tables, tatr_csvs): def test_bulk_pdf2_t1(self, pdf2_tables, tatr_csvs): try_jth_table(pdf2_tables, tatr_csvs, 2, 1) # hint: subtract 2 from the line no to get the proj. 
index (assume 1 header) - assert pdf2_tables[1]._projecting_indices == [9, 12, 16] + assert pdf2_tables[1].predictions["indices"]["_projecting"] == [9, 12, 16] def test_bulk_pdf2_t2(self, pdf2_tables, tatr_csvs): try_jth_table(pdf2_tables, tatr_csvs, 2, 2) - assert pdf2_tables[2]._projecting_indices == [0, 5] + assert pdf2_tables[2].predictions["indices"]["_projecting"] == [0, 5] def test_bulk_pdf2_t3(self, pdf2_tables, tatr_csvs): try_jth_table(pdf2_tables, tatr_csvs, 2, 3) @@ -112,7 +112,7 @@ def test_bulk_pdf3_t1(self, pdf3_tables, tatr_csvs): def test_bulk_pdf3_t2(self, pdf3_tables, tatr_csvs): try_jth_table(pdf3_tables, tatr_csvs, 3, 2) - assert pdf3_tables[2]._projecting_indices == [0, 8] + assert pdf3_tables[2].predictions["indices"]["_projecting"] == [0, 8] def test_bulk_pdf3_t3(self, pdf3_tables, tatr_csvs): try_jth_table(pdf3_tables, tatr_csvs, 3, 3) @@ -124,17 +124,17 @@ def test_bulk_pdf4_t0(self, pdf4_tables, tatr_csvs): def test_bulk_pdf4_t1(self, pdf4_tables, tatr_csvs): try_jth_table(pdf4_tables, tatr_csvs, 4, 1) - assert pdf4_tables[1]._projecting_indices == [0, 14] + assert pdf4_tables[1].predictions["indices"]["_projecting"] == [0, 14] class TestPdf5: def test_bulk_pdf5_t0(self, pdf5_tables, tatr_csvs): try_jth_table(pdf5_tables, tatr_csvs, 5, 0) - assert pdf5_tables[0]._projecting_indices == [15, 18, 22, 29] + assert pdf5_tables[0].predictions["indices"]["_projecting"] == [15, 18, 22, 29] def test_bulk_pdf5_t1(self, pdf5_tables, tatr_csvs): try_jth_table(pdf5_tables, tatr_csvs, 5, 1) - assert pdf5_tables[1]._projecting_indices == [13, 16, 22, 26] + assert pdf5_tables[1].predictions["indices"]["_projecting"] == [13, 16, 22, 26] class TestPdf6: diff --git a/test/formatters/tatr/test_spanning.py b/test/formatters/tatr/test_spanning.py index 03a8c1b..2d413d8 100644 --- a/test/formatters/tatr/test_spanning.py +++ b/test/formatters/tatr/test_spanning.py @@ -178,7 +178,7 @@ def test_pdf2_t2(self, pdf2_tables): try_jth_table(pdf2_tables, 2, 2, expected, 
config=config2) - assert pdf2_tables[2]._projecting_indices == [0, 5] + assert pdf2_tables[2].predictions["indices"]["_projecting"] == [0, 5] # pdf4 t1 is arguably HierTop, but the ground truth is not yet clear diff --git a/test/formatters/tatr/test_visualize.py b/test/formatters/tatr/test_visualize.py new file mode 100644 index 0000000..4b5346f --- /dev/null +++ b/test/formatters/tatr/test_visualize.py @@ -0,0 +1,96 @@ +import numpy as np +from PIL import Image +import pytest +from gmft.formatters.tatr import TATRFormattedTable +from gmft.detectors.base import CroppedTable +from gmft.impl.tatr.config import TATRFormatConfig + + +# def test_tatr_formatted_table_visualize_minimal(): +# # Create a minimal synthetic CroppedTable +# class DummyCroppedTable(CroppedTable): +# def __init__(self): +# self._img_dpi = 72 +# self._img_padding = (0, 0) +# self._img_margin = (0, 0, 0, 0) +# self.angle = 0 +# self._df = None +# self.predictions = {} +# self.image_shape = (100, 100, 3) + +# def image(self, dpi=None, padding=None, margin=None): +# # Return a blank white image +# arr = np.ones(self.image_shape, dtype=np.uint8) * 255 +# return Image.fromarray(arr) + +# cropped_table = DummyCroppedTable() +# # Minimal fctn_results with one box +# fctn_results = { +# "boxes": [[10, 10, 50, 50]], +# "scores": [0.99], +# "labels": [0], +# } +# config = TATRFormatConfig() +# tft = TATRFormattedTable(cropped_table, fctn_results, config=config) +# # Should return a PIL Image +# img = tft.visualize(return_img=True) +# assert isinstance(img, Image.Image) +# # Should not raise for effective=True +# img2 = tft.visualize(effective=True, return_img=True) +# assert isinstance(img2, Image.Image) + + +def images_distance(img1: Image.Image, img2: Image.Image) -> float: + """ + Compares two PIL images for visual similarity; returns a distance. + A lower value means more similar. + + Args: + img1 (Image.Image): First image. + img2 (Image.Image): Second image. 
+ + Returns: + float: The mean pixelwise difference between the images. Between 0 and 255. + """ + # Convert both images to RGB + img1 = img1.convert("RGB") + img2 = img2.convert("RGB") + + # Check size + if img1.size != img2.size: + return float("inf") + + # Convert to NumPy arrays + arr1 = np.array(img1).astype(np.int16) + arr2 = np.array(img2).astype(np.int16) + + # Compute absolute difference + diff = np.abs(arr1 - arr2) + + # Compute mean difference + return np.mean(diff) + + +def test_visualize_content(pdf2_tables): + """ + Tests that the output of visualize() is consistent with a reference image. + """ + ft = pdf2_tables[2] + + # Generate the image from the table + generated_img = ft.visualize(effective=True, show_labels=False, return_img=True) + + # Load reference image + reference_path = "data/test/references/img/pdf2_t2.png" + try: + reference_img = Image.open(reference_path) + except FileNotFoundError: + pytest.skip(f"Reference image not found at {reference_path}") + + # Compare images + distance = images_distance(generated_img, reference_img) + + # Allow for minor rendering differences. + # A value of 1.0 means on average each channel of each pixel is off by 1. 
+ # print("Distance", distance) + assert distance < 1.0 # 0.0 diff --git a/test/scripts/script_generate_cropped.py b/test/scripts/script_generate_cropped.py new file mode 100644 index 0000000..b6897fa --- /dev/null +++ b/test/scripts/script_generate_cropped.py @@ -0,0 +1,21 @@ +from gmft.detectors.base import CroppedTable +from gmft.pdf_bindings.pdfium import PyPDFium2Document + + +def generate_cropped_positions_tsv(): + page = PyPDFium2Document("data/pdfs/tiny.pdf")[0] + table = CroppedTable.from_dict( + { + "filename": "data/pdfs/tiny.pdf", + "page_no": 0, + "bbox": (10, 10, 300, 150), + "confidence_score": 0.9, + "label": 0, + }, + page, + ) + + # create the tsv + with open("data/test/references/tiny_cropped_positions.tsv", "w") as f: + for pos in table.text_positions(): + f.write("\t".join(map(str, pos)) + "\n") diff --git a/test/test_auto.py b/test/test_auto.py index 2027b94..c0268b0 100644 --- a/test/test_auto.py +++ b/test/test_auto.py @@ -27,4 +27,3 @@ def test_auto_format_config_instantiation(): """Test that AutoFormatConfig properly instantiates as TATRFormatConfig.""" config = AutoFormatConfig() assert isinstance(config, TATRFormatConfig) - diff --git a/test/test_cropped.py b/test/test_cropped.py index 0a81d88..8f85618 100644 --- a/test/test_cropped.py +++ b/test/test_cropped.py @@ -34,6 +34,7 @@ def test_CroppedTable_positions(doc_tiny): }, page, ) + assert not isinstance(table, RotatedCroppedTable) # get reference positions from tiny_pdfium.txt with open("data/test/references/tiny_cropped_positions.tsv") as f: @@ -152,20 +153,28 @@ def test_RotatedCroppedTable_text(doc_tiny): ) -if __name__ == "__main__": - page = PyPDFium2Document("data/pdfs/tiny.pdf")[0] +def test_CroppedTable_angle(doc_tiny): + # Reflect the fact that 'angle' has been absorbed into CroppedTable + page = doc_tiny[0] table = CroppedTable.from_dict( { "filename": "data/pdfs/tiny.pdf", "page_no": 0, - "bbox": (10, 10, 300, 150), + "bbox": (10, 12, 300, 150), "confidence_score": 0.9, 
"label": 0, + "angle": 0, }, page, ) + assert not isinstance(table, RotatedCroppedTable) + + with pytest.raises(ValueError, match="Only 0, 90, 180, 270 are supported."): + _ = CroppedTable(page, (1, 2, 3, 4), angle=42) + - # create the tsv - with open("data/test/references/tiny_cropped_positions.tsv", "w") as f: - for pos in table.text_positions(): - f.write("\t".join(map(str, pos)) + "\n") +# TODO: ct.image() with margin='auto', +# ct.image() with rotated image, +# text_positions with angle==[180,270] +# ct.visualize(), +# ct.from_image_only() diff --git a/uv.lock b/uv.lock index 71953a0..e74ad0c 100644 --- a/uv.lock +++ b/uv.lock @@ -322,6 +322,85 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/68/7f46fb537958e87427d98a4074bcde4b67a70b04900cfc5ce29bc2f556c1/contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5", size = 221791 }, ] +[[package]] +name = "coverage" +version = "7.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/e0/98670a80884f64578f0c22cd70c5e81a6e07b08167721c7487b4d70a7ca0/coverage-7.9.1.tar.gz", hash = "sha256:6cf43c78c4282708a28e466316935ec7489a9c487518a77fa68f716c67909cec", size = 813650 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/78/1c1c5ec58f16817c09cbacb39783c3655d54a221b6552f47ff5ac9297603/coverage-7.9.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cc94d7c5e8423920787c33d811c0be67b7be83c705f001f7180c7b186dcf10ca", size = 212028 }, + { url = "https://files.pythonhosted.org/packages/98/db/e91b9076f3a888e3b4ad7972ea3842297a52cc52e73fd1e529856e473510/coverage-7.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:16aa0830d0c08a2c40c264cef801db8bc4fc0e1892782e45bcacbd5889270509", size = 212420 }, + { url = 
"https://files.pythonhosted.org/packages/0e/d0/2b3733412954576b0aea0a16c3b6b8fbe95eb975d8bfa10b07359ead4252/coverage-7.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf95981b126f23db63e9dbe4cf65bd71f9a6305696fa5e2262693bc4e2183f5b", size = 241529 }, + { url = "https://files.pythonhosted.org/packages/b3/00/5e2e5ae2e750a872226a68e984d4d3f3563cb01d1afb449a17aa819bc2c4/coverage-7.9.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f05031cf21699785cd47cb7485f67df619e7bcdae38e0fde40d23d3d0210d3c3", size = 239403 }, + { url = "https://files.pythonhosted.org/packages/37/3b/a2c27736035156b0a7c20683afe7df498480c0dfdf503b8c878a21b6d7fb/coverage-7.9.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb4fbcab8764dc072cb651a4bcda4d11fb5658a1d8d68842a862a6610bd8cfa3", size = 240548 }, + { url = "https://files.pythonhosted.org/packages/98/f5/13d5fc074c3c0e0dc80422d9535814abf190f1254d7c3451590dc4f8b18c/coverage-7.9.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0f16649a7330ec307942ed27d06ee7e7a38417144620bb3d6e9a18ded8a2d3e5", size = 240459 }, + { url = "https://files.pythonhosted.org/packages/36/24/24b9676ea06102df824c4a56ffd13dc9da7904478db519efa877d16527d5/coverage-7.9.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:cea0a27a89e6432705fffc178064503508e3c0184b4f061700e771a09de58187", size = 239128 }, + { url = "https://files.pythonhosted.org/packages/be/05/242b7a7d491b369ac5fee7908a6e5ba42b3030450f3ad62c645b40c23e0e/coverage-7.9.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e980b53a959fa53b6f05343afbd1e6f44a23ed6c23c4b4c56c6662bbb40c82ce", size = 239402 }, + { url = "https://files.pythonhosted.org/packages/73/e0/4de7f87192fa65c9c8fbaeb75507e124f82396b71de1797da5602898be32/coverage-7.9.1-cp310-cp310-win32.whl", hash = "sha256:70760b4c5560be6ca70d11f8988ee6542b003f982b32f83d5ac0b72476607b70", size = 
214518 }, + { url = "https://files.pythonhosted.org/packages/d5/ab/5e4e2fe458907d2a65fab62c773671cfc5ac704f1e7a9ddd91996f66e3c2/coverage-7.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:a66e8f628b71f78c0e0342003d53b53101ba4e00ea8dabb799d9dba0abbbcebe", size = 215436 }, + { url = "https://files.pythonhosted.org/packages/60/34/fa69372a07d0903a78ac103422ad34db72281c9fc625eba94ac1185da66f/coverage-7.9.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:95c765060e65c692da2d2f51a9499c5e9f5cf5453aeaf1420e3fc847cc060582", size = 212146 }, + { url = "https://files.pythonhosted.org/packages/27/f0/da1894915d2767f093f081c42afeba18e760f12fdd7a2f4acbe00564d767/coverage-7.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ba383dc6afd5ec5b7a0d0c23d38895db0e15bcba7fb0fa8901f245267ac30d86", size = 212536 }, + { url = "https://files.pythonhosted.org/packages/10/d5/3fc33b06e41e390f88eef111226a24e4504d216ab8e5d1a7089aa5a3c87a/coverage-7.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37ae0383f13cbdcf1e5e7014489b0d71cc0106458878ccde52e8a12ced4298ed", size = 245092 }, + { url = "https://files.pythonhosted.org/packages/0a/39/7aa901c14977aba637b78e95800edf77f29f5a380d29768c5b66f258305b/coverage-7.9.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:69aa417a030bf11ec46149636314c24c8d60fadb12fc0ee8f10fda0d918c879d", size = 242806 }, + { url = "https://files.pythonhosted.org/packages/43/fc/30e5cfeaf560b1fc1989227adedc11019ce4bb7cce59d65db34fe0c2d963/coverage-7.9.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a4be2a28656afe279b34d4f91c3e26eccf2f85500d4a4ff0b1f8b54bf807338", size = 244610 }, + { url = "https://files.pythonhosted.org/packages/bf/15/cca62b13f39650bc87b2b92bb03bce7f0e79dd0bf2c7529e9fc7393e4d60/coverage-7.9.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:382e7ddd5289f140259b610e5f5c58f713d025cb2f66d0eb17e68d0a94278875", size = 244257 }, + { url = "https://files.pythonhosted.org/packages/cd/1a/c0f2abe92c29e1464dbd0ff9d56cb6c88ae2b9e21becdb38bea31fcb2f6c/coverage-7.9.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e5532482344186c543c37bfad0ee6069e8ae4fc38d073b8bc836fc8f03c9e250", size = 242309 }, + { url = "https://files.pythonhosted.org/packages/57/8d/c6fd70848bd9bf88fa90df2af5636589a8126d2170f3aade21ed53f2b67a/coverage-7.9.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a39d18b3f50cc121d0ce3838d32d58bd1d15dab89c910358ebefc3665712256c", size = 242898 }, + { url = "https://files.pythonhosted.org/packages/c2/9e/6ca46c7bff4675f09a66fe2797cd1ad6a24f14c9c7c3b3ebe0470a6e30b8/coverage-7.9.1-cp311-cp311-win32.whl", hash = "sha256:dd24bd8d77c98557880def750782df77ab2b6885a18483dc8588792247174b32", size = 214561 }, + { url = "https://files.pythonhosted.org/packages/a1/30/166978c6302010742dabcdc425fa0f938fa5a800908e39aff37a7a876a13/coverage-7.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:6b55ad10a35a21b8015eabddc9ba31eb590f54adc9cd39bcf09ff5349fd52125", size = 215493 }, + { url = "https://files.pythonhosted.org/packages/60/07/a6d2342cd80a5be9f0eeab115bc5ebb3917b4a64c2953534273cf9bc7ae6/coverage-7.9.1-cp311-cp311-win_arm64.whl", hash = "sha256:6ad935f0016be24c0e97fc8c40c465f9c4b85cbbe6eac48934c0dc4d2568321e", size = 213869 }, + { url = "https://files.pythonhosted.org/packages/68/d9/7f66eb0a8f2fce222de7bdc2046ec41cb31fe33fb55a330037833fb88afc/coverage-7.9.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a8de12b4b87c20de895f10567639c0797b621b22897b0af3ce4b4e204a743626", size = 212336 }, + { url = "https://files.pythonhosted.org/packages/20/20/e07cb920ef3addf20f052ee3d54906e57407b6aeee3227a9c91eea38a665/coverage-7.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5add197315a054e92cee1b5f686a2bcba60c4c3e66ee3de77ace6c867bdee7cb", size = 212571 }, + { url = 
"https://files.pythonhosted.org/packages/78/f8/96f155de7e9e248ca9c8ff1a40a521d944ba48bec65352da9be2463745bf/coverage-7.9.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:600a1d4106fe66f41e5d0136dfbc68fe7200a5cbe85610ddf094f8f22e1b0300", size = 246377 }, + { url = "https://files.pythonhosted.org/packages/3e/cf/1d783bd05b7bca5c10ded5f946068909372e94615a4416afadfe3f63492d/coverage-7.9.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a876e4c3e5a2a1715a6608906aa5a2e0475b9c0f68343c2ada98110512ab1d8", size = 243394 }, + { url = "https://files.pythonhosted.org/packages/02/dd/e7b20afd35b0a1abea09fb3998e1abc9f9bd953bee548f235aebd2b11401/coverage-7.9.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81f34346dd63010453922c8e628a52ea2d2ccd73cb2487f7700ac531b247c8a5", size = 245586 }, + { url = "https://files.pythonhosted.org/packages/4e/38/b30b0006fea9d617d1cb8e43b1bc9a96af11eff42b87eb8c716cf4d37469/coverage-7.9.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:888f8eee13f2377ce86d44f338968eedec3291876b0b8a7289247ba52cb984cd", size = 245396 }, + { url = "https://files.pythonhosted.org/packages/31/e4/4d8ec1dc826e16791f3daf1b50943e8e7e1eb70e8efa7abb03936ff48418/coverage-7.9.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9969ef1e69b8c8e1e70d591f91bbc37fc9a3621e447525d1602801a24ceda898", size = 243577 }, + { url = "https://files.pythonhosted.org/packages/25/f4/b0e96c5c38e6e40ef465c4bc7f138863e2909c00e54a331da335faf0d81a/coverage-7.9.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:60c458224331ee3f1a5b472773e4a085cc27a86a0b48205409d364272d67140d", size = 244809 }, + { url = "https://files.pythonhosted.org/packages/8a/65/27e0a1fa5e2e5079bdca4521be2f5dabf516f94e29a0defed35ac2382eb2/coverage-7.9.1-cp312-cp312-win32.whl", hash = "sha256:5f646a99a8c2b3ff4c6a6e081f78fad0dde275cd59f8f49dc4eab2e394332e74", size = 
214724 }, + { url = "https://files.pythonhosted.org/packages/9b/a8/d5b128633fd1a5e0401a4160d02fa15986209a9e47717174f99dc2f7166d/coverage-7.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:30f445f85c353090b83e552dcbbdad3ec84c7967e108c3ae54556ca69955563e", size = 215535 }, + { url = "https://files.pythonhosted.org/packages/a3/37/84bba9d2afabc3611f3e4325ee2c6a47cd449b580d4a606b240ce5a6f9bf/coverage-7.9.1-cp312-cp312-win_arm64.whl", hash = "sha256:af41da5dca398d3474129c58cb2b106a5d93bbb196be0d307ac82311ca234342", size = 213904 }, + { url = "https://files.pythonhosted.org/packages/d0/a7/a027970c991ca90f24e968999f7d509332daf6b8c3533d68633930aaebac/coverage-7.9.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:31324f18d5969feef7344a932c32428a2d1a3e50b15a6404e97cba1cc9b2c631", size = 212358 }, + { url = "https://files.pythonhosted.org/packages/f2/48/6aaed3651ae83b231556750280682528fea8ac7f1232834573472d83e459/coverage-7.9.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0c804506d624e8a20fb3108764c52e0eef664e29d21692afa375e0dd98dc384f", size = 212620 }, + { url = "https://files.pythonhosted.org/packages/6c/2a/f4b613f3b44d8b9f144847c89151992b2b6b79cbc506dee89ad0c35f209d/coverage-7.9.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef64c27bc40189f36fcc50c3fb8f16ccda73b6a0b80d9bd6e6ce4cffcd810bbd", size = 245788 }, + { url = "https://files.pythonhosted.org/packages/04/d2/de4fdc03af5e4e035ef420ed26a703c6ad3d7a07aff2e959eb84e3b19ca8/coverage-7.9.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d4fe2348cc6ec372e25adec0219ee2334a68d2f5222e0cba9c0d613394e12d86", size = 243001 }, + { url = "https://files.pythonhosted.org/packages/f5/e8/eed18aa5583b0423ab7f04e34659e51101135c41cd1dcb33ac1d7013a6d6/coverage-7.9.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:34ed2186fe52fcc24d4561041979a0dec69adae7bce2ae8d1c49eace13e55c43", size = 244985 }, + { url = "https://files.pythonhosted.org/packages/17/f8/ae9e5cce8885728c934eaa58ebfa8281d488ef2afa81c3dbc8ee9e6d80db/coverage-7.9.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:25308bd3d00d5eedd5ae7d4357161f4df743e3c0240fa773ee1b0f75e6c7c0f1", size = 245152 }, + { url = "https://files.pythonhosted.org/packages/5a/c8/272c01ae792bb3af9b30fac14d71d63371db227980682836ec388e2c57c0/coverage-7.9.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:73e9439310f65d55a5a1e0564b48e34f5369bee943d72c88378f2d576f5a5751", size = 243123 }, + { url = "https://files.pythonhosted.org/packages/8c/d0/2819a1e3086143c094ab446e3bdf07138527a7b88cb235c488e78150ba7a/coverage-7.9.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:37ab6be0859141b53aa89412a82454b482c81cf750de4f29223d52268a86de67", size = 244506 }, + { url = "https://files.pythonhosted.org/packages/8b/4e/9f6117b89152df7b6112f65c7a4ed1f2f5ec8e60c4be8f351d91e7acc848/coverage-7.9.1-cp313-cp313-win32.whl", hash = "sha256:64bdd969456e2d02a8b08aa047a92d269c7ac1f47e0c977675d550c9a0863643", size = 214766 }, + { url = "https://files.pythonhosted.org/packages/27/0f/4b59f7c93b52c2c4ce7387c5a4e135e49891bb3b7408dcc98fe44033bbe0/coverage-7.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:be9e3f68ca9edb897c2184ad0eee815c635565dbe7a0e7e814dc1f7cbab92c0a", size = 215568 }, + { url = "https://files.pythonhosted.org/packages/09/1e/9679826336f8c67b9c39a359352882b24a8a7aee48d4c9cad08d38d7510f/coverage-7.9.1-cp313-cp313-win_arm64.whl", hash = "sha256:1c503289ffef1d5105d91bbb4d62cbe4b14bec4d13ca225f9c73cde9bb46207d", size = 213939 }, + { url = "https://files.pythonhosted.org/packages/bb/5b/5c6b4e7a407359a2e3b27bf9c8a7b658127975def62077d441b93a30dbe8/coverage-7.9.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0b3496922cb5f4215bf5caaef4cf12364a26b0be82e9ed6d050f3352cf2d7ef0", size = 213079 }, + { url = 
"https://files.pythonhosted.org/packages/a2/22/1e2e07279fd2fd97ae26c01cc2186e2258850e9ec125ae87184225662e89/coverage-7.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9565c3ab1c93310569ec0d86b017f128f027cab0b622b7af288696d7ed43a16d", size = 213299 }, + { url = "https://files.pythonhosted.org/packages/14/c0/4c5125a4b69d66b8c85986d3321520f628756cf524af810baab0790c7647/coverage-7.9.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2241ad5dbf79ae1d9c08fe52b36d03ca122fb9ac6bca0f34439e99f8327ac89f", size = 256535 }, + { url = "https://files.pythonhosted.org/packages/81/8b/e36a04889dda9960be4263e95e777e7b46f1bb4fc32202612c130a20c4da/coverage-7.9.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bb5838701ca68b10ebc0937dbd0eb81974bac54447c55cd58dea5bca8451029", size = 252756 }, + { url = "https://files.pythonhosted.org/packages/98/82/be04eff8083a09a4622ecd0e1f31a2c563dbea3ed848069e7b0445043a70/coverage-7.9.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b30a25f814591a8c0c5372c11ac8967f669b97444c47fd794926e175c4047ece", size = 254912 }, + { url = "https://files.pythonhosted.org/packages/0f/25/c26610a2c7f018508a5ab958e5b3202d900422cf7cdca7670b6b8ca4e8df/coverage-7.9.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:2d04b16a6062516df97969f1ae7efd0de9c31eb6ebdceaa0d213b21c0ca1a683", size = 256144 }, + { url = "https://files.pythonhosted.org/packages/c5/8b/fb9425c4684066c79e863f1e6e7ecebb49e3a64d9f7f7860ef1688c56f4a/coverage-7.9.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7931b9e249edefb07cd6ae10c702788546341d5fe44db5b6108a25da4dca513f", size = 254257 }, + { url = "https://files.pythonhosted.org/packages/93/df/27b882f54157fc1131e0e215b0da3b8d608d9b8ef79a045280118a8f98fe/coverage-7.9.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:52e92b01041151bf607ee858e5a56c62d4b70f4dac85b8c8cb7fb8a351ab2c10", size = 255094 }, + { url = "https://files.pythonhosted.org/packages/41/5f/cad1c3dbed8b3ee9e16fa832afe365b4e3eeab1fb6edb65ebbf745eabc92/coverage-7.9.1-cp313-cp313t-win32.whl", hash = "sha256:684e2110ed84fd1ca5f40e89aa44adf1729dc85444004111aa01866507adf363", size = 215437 }, + { url = "https://files.pythonhosted.org/packages/99/4d/fad293bf081c0e43331ca745ff63673badc20afea2104b431cdd8c278b4c/coverage-7.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:437c576979e4db840539674e68c84b3cda82bc824dd138d56bead1435f1cb5d7", size = 216605 }, + { url = "https://files.pythonhosted.org/packages/1f/56/4ee027d5965fc7fc126d7ec1187529cc30cc7d740846e1ecb5e92d31b224/coverage-7.9.1-cp313-cp313t-win_arm64.whl", hash = "sha256:18a0912944d70aaf5f399e350445738a1a20b50fbea788f640751c2ed9208b6c", size = 214392 }, + { url = "https://files.pythonhosted.org/packages/a5/d6/c41dd9b02bf16ec001aaf1cbef665537606899a3db1094e78f5ae17540ca/coverage-7.9.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f424507f57878e424d9a95dc4ead3fbdd72fd201e404e861e465f28ea469951", size = 212029 }, + { url = "https://files.pythonhosted.org/packages/f8/c0/40420d81d731f84c3916dcdf0506b3e6c6570817bff2576b83f780914ae6/coverage-7.9.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:535fde4001b2783ac80865d90e7cc7798b6b126f4cd8a8c54acfe76804e54e58", size = 212407 }, + { url = "https://files.pythonhosted.org/packages/9b/87/f0db7d62d0e09f14d6d2f6ae8c7274a2f09edf74895a34b412a0601e375a/coverage-7.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02532fd3290bb8fa6bec876520842428e2a6ed6c27014eca81b031c2d30e3f71", size = 241160 }, + { url = "https://files.pythonhosted.org/packages/a9/b7/3337c064f058a5d7696c4867159651a5b5fb01a5202bcf37362f0c51400e/coverage-7.9.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:56f5eb308b17bca3bbff810f55ee26d51926d9f89ba92707ee41d3c061257e55", size = 239027 }, + { url = "https://files.pythonhosted.org/packages/7e/a9/5898a283f66d1bd413c32c2e0e05408196fd4f37e206e2b06c6e0c626e0e/coverage-7.9.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfa447506c1a52271f1b0de3f42ea0fa14676052549095e378d5bff1c505ff7b", size = 240145 }, + { url = "https://files.pythonhosted.org/packages/e0/33/d96e3350078a3c423c549cb5b2ba970de24c5257954d3e4066e2b2152d30/coverage-7.9.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9ca8e220006966b4a7b68e8984a6aee645a0384b0769e829ba60281fe61ec4f7", size = 239871 }, + { url = "https://files.pythonhosted.org/packages/1d/6e/6fb946072455f71a820cac144d49d11747a0f1a21038060a68d2d0200499/coverage-7.9.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:49f1d0788ba5b7ba65933f3a18864117c6506619f5ca80326b478f72acf3f385", size = 238122 }, + { url = "https://files.pythonhosted.org/packages/e4/5c/bc43f25c8586840ce25a796a8111acf6a2b5f0909ba89a10d41ccff3920d/coverage-7.9.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:68cd53aec6f45b8e4724c0950ce86eacb775c6be01ce6e3669fe4f3a21e768ed", size = 239058 }, + { url = "https://files.pythonhosted.org/packages/11/d8/ce2007418dd7fd00ff8c8b898bb150bb4bac2d6a86df05d7b88a07ff595f/coverage-7.9.1-cp39-cp39-win32.whl", hash = "sha256:95335095b6c7b1cc14c3f3f17d5452ce677e8490d101698562b2ffcacc304c8d", size = 214532 }, + { url = "https://files.pythonhosted.org/packages/20/21/334e76fa246e92e6d69cab217f7c8a70ae0cc8f01438bd0544103f29528e/coverage-7.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:e1b5191d1648acc439b24721caab2fd0c86679d8549ed2c84d5a7ec1bedcc244", size = 215439 }, + { url = "https://files.pythonhosted.org/packages/3e/e5/c723545c3fd3204ebde3b4cc4b927dce709d3b6dc577754bb57f63ca4a4a/coverage-7.9.1-pp39.pp310.pp311-none-any.whl", hash = "sha256:db0f04118d1db74db6c9e1cb1898532c7dcc220f1d2718f058601f7c3f499514", size = 
204009 }, + { url = "https://files.pythonhosted.org/packages/08/b8/7ddd1e8ba9701dea08ce22029917140e6f66a859427406579fd8d0ca7274/coverage-7.9.1-py3-none-any.whl", hash = "sha256:66b974b145aa189516b6bf2d8423e888b742517d37872f6ee4c5be0073bd9a3c", size = 204000 }, +] + +[package.optional-dependencies] +toml = [ + { name = "tomli", marker = "python_full_version <= '3.11'" }, +] + [[package]] name = "cycler" version = "0.12.1" @@ -442,6 +521,7 @@ img2table = [ [package.dev-dependencies] dev = [ { name = "pytest" }, + { name = "pytest-cov" }, { name = "ruff" }, ] docs = [ @@ -467,6 +547,7 @@ provides-extras = ["img2table"] [package.metadata.requires-dev] dev = [ { name = "pytest", specifier = ">=8.3.5" }, + { name = "pytest-cov", specifier = ">=6.2.1" }, { name = "ruff", specifier = ">=0.11.11" }, ] docs = [ @@ -1766,6 +1847,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 }, ] +[[package]] +name = "pytest-cov" +version = "6.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coverage", extra = ["toml"] }, + { name = "pluggy" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/99/668cade231f434aaa59bbfbf49469068d2ddd945000621d3d165d2e7dd7b/pytest_cov-6.2.1.tar.gz", hash = "sha256:25cc6cc0a5358204b8108ecedc51a9b57b34cc6b8c967cc2c01a4e00d8a67da2", size = 69432 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/16/4ea354101abb1287856baa4af2732be351c7bee728065aed451b678153fd/pytest_cov-6.2.1-py3-none-any.whl", hash = "sha256:f5bc4c23f42f1cdd23c70b1dab1bbaef4fc505ba950d53e0081d0730dd7e86d5", size = 24644 }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0"