Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
## v0.4.2

Bugfixes before the upcoming release.
- Better imports and lazy loading
- Default device is now 'auto', which resolves to cuda/cpu depending on availability
- Rich text now available as `AutoPageFormatter`
- Fixed bug with permuted coordinates (e0c6dc52)
- CroppedTable now directly has `angle` property
- CI tests, Python 3.9 support
- More type hints
- Light restructuring (non-breaking)
- Internal data structure tweaks
- (`fctn_results` → `predictions.tatr`)
- (`effective_*` → `predictions.effective`)

## v0.4.0

Features: 3 new table structure recognition options!
Expand Down
179 changes: 86 additions & 93 deletions gmft/__init__.py
Original file line number Diff line number Diff line change
@@ -1,140 +1,133 @@
"""
Currently, contains aliases for key classes and functions.

Unfortunately, although at one point the ability to import classes from the top level module (ie. `from gmft import AutoTableFormatter`) was encouraged,
it is now discouraged and may be removed in future versions. The reason being: importing through the top level module
loads the entire library, even when you're using only a small part of it.
Importing from the top-level module previously resulted in long load times.
However, v0.5 introduces lazy loading, which greatly improves the situation.

Instead, `gmft.auto` is now encouraged. For example, `from gmft.auto import AutoTableFormatter`.
Now, classes may either be imported from their original locations,
`gmft.auto`, or from here, where they will be lazy loaded.
"""

# small classes are fine, but discouraged.
from gmft.base import Rect
from gmft.core.legacy.mirror import DeprecationMirrorMeta
from gmft.pdf_bindings.base import BasePDFDocument, BasePage
from gmft.detectors.base import CroppedTable, RotatedCroppedTable
from gmft.formatters.base import FormattedTable

from gmft.auto import (
TATRDetector as TATRTableDetectorOrig,
TableDetectorConfig as TableDetectorConfigOrig,
TableDetector as TableDetectorOrig,
TATRFormatConfig as TATRFormatConfigOrig,
TATRFormattedTable as TATRFormattedTableOrig,
TATRFormatter as TATRTableFormatterOrig,
AutoTableFormatter as AutoTableFormatterOrig,
AutoFormatConfig as AutoFormatConfigOrig,
AutoTableDetector as AutoTableDetectorOrig,
# config-only classes specific to TATR are still discouraged.

# these auto classes are lazy-loaded
from gmft.core.auto_lazy import (
AutoTableFormatter,
AutoFormatConfig,
AutoTableDetector,
)

has_warned = False
# We need to support these imports for compatibility:
# TATRTableDetector
# TableDetectorConfig
# TableDetector
# TATRFormatConfig
# TATRFormattedTable
# TATRTableFormatter
# AutoTableFormatter
# AutoFormatConfig
# AutoTableDetector


def _deprecation_warning(name):
global has_warned
if has_warned:
return
import warnings
# These bulky TATR-specific detectors are discouraged, but still available for compatibility.
class TATRTableDetector(metaclass=DeprecationMirrorMeta):
"""
This import is deprecated.

msg = f"(Deprecation) While once encouraged, \
importing {name} and other classes from the top level module is now deprecated and will break in v0.5.0. \
Please import from gmft.auto instead."
warnings.warn(msg, DeprecationWarning, stacklevel=2)
print(msg)
has_warned = True
Please use:
- gmft.AutoTableDetector
- gmft.detectors.tatr.TATRDetector
"""

@classmethod
def get_mirrored_class(cls):
from gmft.detectors.tatr import TATRDetector as OrigCls

# These small classes are fine, but still discouraged.
# Rect
# BasePDFDocument
# BasePage
# CroppedTable
# RotatedCroppedTable
return OrigCls


class TATRTableDetector(TATRTableDetectorOrig):
"""
Deprecated. Please import from gmft.auto instead.
class TableDetectorConfig(metaclass=DeprecationMirrorMeta):
"""
This import is deprecated.

def __init__(self, *args, **kwargs):
_deprecation_warning("TATRTableDetector")
super().__init__(*args, **kwargs)


class TableDetectorConfig(TableDetectorConfigOrig):
"""
Deprecated. Please import from gmft.auto instead.
Please use:
- Reformat API (v0.5)
- gmft.detectors.tatr.TATRDetectorConfig
"""

def __init__(self, *args, **kwargs):
_deprecation_warning("TableDetectorConfig")
super().__init__(*args, **kwargs)
@classmethod
def get_mirrored_class(cls):
from gmft.impl.tatr.config import TATRDetectorConfig as OrigCls

return OrigCls

class TableDetector(TableDetectorOrig):
"""
Deprecated. Please import from gmft.auto instead.
"""

def __init__(self, *args, **kwargs):
_deprecation_warning("TableDetector")
super().__init__(*args, **kwargs)


class TATRFormatConfig(TATRFormatConfigOrig):
"""
Deprecated. Please import from gmft.auto instead.
class TableDetector(metaclass=DeprecationMirrorMeta):
"""
This import is deprecated.

def __init__(self, *args, **kwargs):
_deprecation_warning("TATRFormatConfig")
super().__init__(*args, **kwargs)


class TATRFormattedTable(TATRFormattedTableOrig):
"""
Deprecated. Please import from gmft.auto instead.
Please use:
- gmft.AutoTableDetector
- gmft.detectors.tatr.TATRDetector
"""

def __init__(self, *args, **kwargs):
_deprecation_warning("TATRFormattedTable")
super().__init__(*args, **kwargs)


class TATRTableFormatter(TATRTableFormatterOrig):
"""
Deprecated. Please import from gmft.auto instead.
"""
@classmethod
def get_mirrored_class(cls):
from gmft.auto import TATRDetector as OrigCls

def __init__(self, *args, **kwargs):
_deprecation_warning("TATRTableFormatter")
super().__init__(*args, **kwargs)
return OrigCls


class AutoTableFormatter(AutoTableFormatterOrig):
class TATRFormatConfig(metaclass=DeprecationMirrorMeta):
"""
Deprecated. Please import from gmft.auto instead.
This import is deprecated.

Please use:
- Reformat API (v0.5)
- gmft.formatters.tatr.TATRFormatConfig
"""

def __init__(self, *args, **kwargs):
_deprecation_warning("AutoTableFormatter")
super().__init__(*args, **kwargs)
@classmethod
def get_mirrored_class(cls):
from gmft.impl.tatr.config import TATRFormatConfig as OrigCls

return OrigCls


class AutoFormatConfig(AutoFormatConfigOrig):
class TATRFormattedTable(metaclass=DeprecationMirrorMeta):
"""
Deprecated. Please import from gmft.auto instead.
This import is deprecated.

Please use:
- Reformat API (v0.5)
- gmft.formatters.tatr.TATRFormattedTable
"""

def __init__(self, *args, **kwargs):
_deprecation_warning("AutoFormatConfig")
super().__init__(*args, **kwargs)
@classmethod
def get_mirrored_class(cls):
from gmft.formatters.tatr import TATRFormattedTable as OrigCls

return OrigCls

class AutoTableDetector(AutoTableDetectorOrig):

class TATRTableFormatter(metaclass=DeprecationMirrorMeta):
"""
Deprecated. Please import from gmft.auto instead.
This import is deprecated.

Please use:
- gmft.auto.AutoTableFormatter
- gmft.formatters.tatr.TATRFormatter
"""

def __init__(self, *args, **kwargs):
_deprecation_warning("AutoTableDetector")
super().__init__(*args, **kwargs)
@classmethod
def get_mirrored_class(cls):
from gmft.formatters.tatr import TATRFormatter as OrigCls

return OrigCls
12 changes: 7 additions & 5 deletions gmft/algorithm/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,7 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None):

outliers = {} # store table-wide information about outliers or peculiarities

results = table.predictions["tatr"]
results = table.predictions.tatr

# 1. collate identified boxes
boxes = []
Expand Down Expand Up @@ -889,8 +889,9 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None):
if not known_means:
# no text was detected
outliers["no text"] = True
table.predictions["effective"] = _empty_effective_predictions()
table.predictions["indices"] = _empty_indices_predictions()
table.predictions.effective = _empty_effective_predictions()
table.predictions.indices = _empty_indices_predictions()
table.predictions.status = "ready"
table._df = pd.DataFrame()
table.outliers = outliers
return table._df
Expand Down Expand Up @@ -930,7 +931,7 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None):
)

# nms takes care of deduplication
table.predictions["effective"] = {
table.predictions.effective = {
"rows": sorted_rows,
"columns": sorted_columns,
"headers": sorted_headers,
Expand Down Expand Up @@ -1071,7 +1072,8 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig = None):
]
indices_preds["_projecting"] = [i for i, x in enumerate(is_projecting) if x]

table.predictions["indices"] = indices_preds
table.predictions.indices = indices_preds
table.predictions.status = "ready"

# if projecting_indices:
# insert at end
Expand Down
43 changes: 5 additions & 38 deletions gmft/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,41 +22,8 @@
TATRTableFormatter = TATRFormatter
# TATRFormatConfig = TATRFormatConfig


class AutoTableFormatter:
    """
    Entry point for the recommended :class:`~gmft.formatters.base.BaseFormatter`.

    Constructing this class currently hands back a :class:`~gmft.formatters.tatr.TATRFormatter`,
    built with exactly the arguments that were passed in.
    It uses a TableTransformerForObjectDetection for small/medium tables, and a custom algorithm for large tables.

    Calling :meth:`extract` produces a :class:`~gmft.formatters.base.FormattedTable`, which can be exported to csv, df, etc.
    """

    def __new__(cls, *args, **kwargs):
        # The import lives inside __new__ so that gmft.formatters.tatr is only
        # imported once a formatter is actually requested.
        from gmft.formatters.tatr import TATRFormatter

        target = TATRFormatter
        return target(*args, **kwargs)


class AutoFormatConfig:
    """
    Configuration entry point for the recommended :class:`~gmft.formatters.base.BaseFormatter`.

    Constructing this class currently hands back a :class:`~gmft.formatters.tatr.TATRFormatConfig`
    (imported from ``gmft.impl.tatr.config``), built with exactly the arguments that were passed in.
    """

    def __new__(cls, *args, **kwargs):
        # The import lives inside __new__ so that gmft.impl.tatr.config is only
        # imported once a config object is actually requested.
        from gmft.impl.tatr.config import TATRFormatConfig

        target = TATRFormatConfig
        return target(*args, **kwargs)


class AutoTableDetector:
    """
    Entry point for the recommended :class:`~gmft.detectors.base.BaseDetector`.

    Constructing this class currently hands back a :class:`~gmft.detectors.tatr.TATRDetector`,
    built with exactly the arguments that were passed in.
    It uses TableTransformerForObjectDetection for small/medium tables, and a custom algorithm for large tables.

    Using :meth:`~gmft.detectors.base.BaseDetector.extract` produces a :class:`~gmft.formatters.base.FormattedTable`, which can be exported to csv, df, etc.
    """

    # NOTE(review): the docstring's last two sentences mirror the formatter's
    # docstring; confirm that a detector's extract() really yields
    # FormattedTable objects and not cropped tables.

    def __new__(cls, *args, **kwargs):
        # The import lives inside __new__ so that gmft.detectors.tatr is only
        # imported once a detector is actually requested.
        from gmft.detectors.tatr import TATRDetector

        target = TATRDetector
        return target(*args, **kwargs)
from gmft.core.auto_lazy import (
AutoTableFormatter,
AutoFormatConfig,
AutoTableDetector,
)
37 changes: 37 additions & 0 deletions gmft/core/auto_lazy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
class AutoTableFormatter:
    """
    Factory-style alias for the recommended :class:`~gmft.formatters.base.BaseFormatter`.

    Instantiating it does not create an ``AutoTableFormatter``; ``__new__`` builds and returns a
    :class:`~gmft.formatters.tatr.TATRFormatter`, forwarding every positional and keyword argument unchanged.
    That formatter uses a TableTransformerForObjectDetection for small/medium tables, and a custom algorithm for large tables.

    :meth:`extract` yields a :class:`~gmft.formatters.base.FormattedTable`, which can be exported to csv, df, etc.
    """

    def __new__(cls, *args, **kwargs):
        # Deferred import: loading this module must not pull in
        # gmft.formatters.tatr until a formatter is constructed.
        from gmft.formatters.tatr import TATRFormatter as _Formatter

        formatter = _Formatter(*args, **kwargs)
        return formatter


class AutoFormatConfig:
    """
    Factory-style alias for the configuration of the recommended :class:`~gmft.formatters.base.BaseFormatter`.

    Instantiating it does not create an ``AutoFormatConfig``; ``__new__`` builds and returns a
    :class:`~gmft.formatters.tatr.TATRFormatConfig` (imported from ``gmft.impl.tatr.config``),
    forwarding every positional and keyword argument unchanged.
    """

    def __new__(cls, *args, **kwargs):
        # Deferred import: loading this module must not pull in
        # gmft.impl.tatr.config until a config object is constructed.
        from gmft.impl.tatr.config import TATRFormatConfig as _Config

        config = _Config(*args, **kwargs)
        return config


class AutoTableDetector:
    """
    Factory-style alias for the recommended :class:`~gmft.detectors.base.BaseDetector`.

    Instantiating it does not create an ``AutoTableDetector``; ``__new__`` builds and returns a
    :class:`~gmft.detectors.tatr.TATRDetector`, forwarding every positional and keyword argument unchanged.
    That detector uses TableTransformerForObjectDetection for small/medium tables, and a custom algorithm for large tables.

    Using :meth:`~gmft.detectors.base.BaseDetector.extract` produces a :class:`~gmft.formatters.base.FormattedTable`, which can be exported to csv, df, etc.
    """

    # NOTE(review): the last two docstring sentences read like the formatter's
    # description; verify what BaseDetector.extract actually returns.

    def __new__(cls, *args, **kwargs):
        # Deferred import: loading this module must not pull in
        # gmft.detectors.tatr until a detector is constructed.
        from gmft.detectors.tatr import TATRDetector as _Detector

        detector = _Detector(*args, **kwargs)
        return detector
9 changes: 9 additions & 0 deletions gmft/core/io/serial/dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from gmft.core.ml.prediction import (
IndicesPredictions,
RawBboxPredictions,
_empty_effective_predictions,
_empty_indices_predictions,
)
from gmft.detectors.base import CroppedTable
Expand Down Expand Up @@ -57,3 +58,11 @@ def _extract_indices(d: dict) -> IndicesPredictions:
}

return _empty_indices_predictions()


def _extract_effective(d: dict) -> IndicesPredictions:
    """
    Read the effective predictions out of a serialized dict.

    :param d: mapping that may carry a ``"predictions.effective"`` entry (the gmft>=0.5 format).
    :return: the stored value, returned as-is, when the key is present;
        otherwise the result of ``_empty_effective_predictions()``.
    """
    # NOTE(review): the return annotation names IndicesPredictions although the
    # value comes from the "effective" entry -- confirm the intended type.
    key = "predictions.effective"
    if key not in d:
        # Entry missing: fall back to the empty placeholder.
        return _empty_effective_predictions()
    return d[key]
Loading