diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 6be5812e..781bec97 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,12 +1,27 @@ -name: "πŸ“¦ Publish (docs, PyPI)" +name: "πŸ“¦ Publish (PyPI + docs)" -on: - push: - tags: +on: + push: + tags: - '[0-9]+.[0-9]+.[0-9]+' jobs: - package-and-upload: - name: "πŸ€™ Call SDK publish workflow" + publish-pypi: + name: "πŸ“¦ Build and upload to PyPI" uses: clamsproject/.github/.github/workflows/sdk-publish.yml@main secrets: inherit + + publish-docs: + name: "πŸ“– Build and publish docs" + needs: publish-pypi + uses: clamsproject/clamsproject.github.io/.github/workflows/sdk-docs.yml@main + with: + source_repo: clamsproject/mmif-python + source_ref: ${{ github.ref_name }} + project_name: mmif-python + version: ${{ github.ref_name }} + build_command: 'python3 build-tools/docs.py --build-ver ${{ github.ref_name }} --output-dir docs' + docs_output_dir: 'docs/${{ github.ref_name }}' + python_version: '3.11' + update_latest: true + secrets: inherit diff --git a/.gitignore b/.gitignore index 013ab917..7588b933 100644 --- a/.gitignore +++ b/.gitignore @@ -79,5 +79,10 @@ mmif/vocabulary # Documentation build artifacts documentation/cli_help.rst -documentation/whatsnew.rst +documentation/whatsnew.md +documentation/autodoc docs-test + +# environments +.venv* +venv* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0c7f166b..e544de48 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,72 @@ # Contributing to mmif-python +## Git Workflow + +We follow a Gitflow-inspired branching model to maintain a stable `main` branch and a dynamic `develop` branch. + +1. **Branch Roles**: + - `main`: Reserved for stable, production-ready releases. + - `develop`: The primary branch for ongoing development, feature integration, and bug fixes. This serves as the "staging" area for the next release. +2. 
**Issue Tracking**: Every contribution (bug fix or feature) must first be reported as a [GitHub Issue](https://github.com/clamsproject/mmif-python/issues). Issues should clearly define goals and, preferably, include an implementation plan. +3. **Branch Naming**: Create a dedicated working branch for each issue. Branches must be named using the format `NUM-short-description`, where `NUM` is the issue number (e.g., `113-fix-file-loading`). +4. **Pull Requests (PRs)**: + - Once work is complete, open a PR targeting the `develop` branch. + - **Communication**: High-level discussion and planning should occur in the issue thread. The PR conversation is strictly for code review and implementation-specific feedback. +5. **Releases**: + - When `develop` is ready for a new release, open a PR from `develop` to `main` using the "release" PR template. + - After merging the release candidate into `main`, manually tag the commit with the version number. This tag triggers the automated CI/CD pipeline for publishing. +6. **Branch Protection**: Both `main` and `develop` are protected branches. Direct pushes are disabled; all changes must be introduced via Pull Requests. + +## CLI Scripts + +The `mmif` command-line interface supports subcommands (e.g., `mmif source`, `mmif describe`). These are implemented as Python modules in `mmif/utils/cli/`. + +### Adding a New CLI Script + +To add a new CLI subcommand, create a Python module in `mmif/utils/cli/` with these three required functions: + +1. **`prep_argparser(**kwargs)`** - Define and return an `argparse.ArgumentParser` instance for your subcommand. When called during discovery, the main CLI will pass `add_help=False` to this function to avoid duplicate help flags. + +2. **`describe_argparser()`** - Return a tuple of two strings: + - A one-line description (shown in `mmif --help`) + - A more verbose description (shown in `mmif <subcommand> --help`) + +3. **`main(args)`** - Execute the subcommand logic with the parsed arguments. 
+ +### Standard I/O Argument Pattern + +To ensure a consistent user experience and avoid resource leaks, all CLI subcommands should adhere to the following I/O argument patterns using the `mmif.utils.cli.open_cli_io_arg` context manager (which replaces the deprecated `argparse.FileType`): + +1. **Input**: Use a positional argument (usually named `MMIF_FILE`) that supports both file paths and STDIN. + - In `prep_argparser`, use `nargs='?'`, `type=str`, and `default=None`. + - In `main`, use `with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file:`. +2. **Output**: Use the `-o`/`--output` flag for the output destination. + - In `prep_argparser`, use `type=str` and `default=None`. + - In `main`, use `with open_cli_io_arg(args.output, 'w', default_stdout=True) as output_file:`. +3. **Formatting**: Use the `-p`/`--pretty` flag as a boolean switch (`action='store_true'`) to toggle between compact and pretty-printed JSON/MMIF output. + +> [!NOTE] +> CLI modules should typically act as thin wrappers. It is recommended to implement the core utility logic in other packages (e.g., `mmif.utils`) and import it into the CLI module. See existing modules like `summarize.py` (which imports from `mmif.utils.summarizer`) or `describe.py` for examples. + +### How CLI Discovery Works + +The CLI system automatically discovers subcommands at runtime. The entry point is configured in the build script (currently `setup.py`) as follows: + +```python +entry_points={ + 'console_scripts': [ + 'mmif = mmif.__init__:cli', + ], +}, +``` + +The `cli()` function in `mmif/__init__.py` handles discovery and delegation. It uses `pkgutil.walk_packages` to find all modules within the top-level of the `mmif.utils.cli` package. For the discovery logic to work, a "cli module" should implement the requirements outlined above. + +This means adding a properly structured module within the CLI package is all that's needed—the module name will automatically be registered as a subcommand. 
No modifications to `setup.py` or other configuration files are required. + +> [!NOTE] +> Any "client" code (not shell CLI) that wants to use a module in the `cli` package should be able to directly `from mmif.utils.cli import a_module`. However, for historical reasons, some CLI modules are manually imported in `mmif/__init__.py` (e.g., `source.py`) for backward compatibility for clients predating the discovery system. + ## Documentation The documentation for `mmif-python` is built using Sphinx and published to the [CLAMS documentation hub](https://github.com/clamsproject/website-test). @@ -9,12 +76,38 @@ The documentation for `mmif-python` is built using Sphinx and published to the [ To build the documentation for the current checkout: ```bash -make doc -# OR python3 build-tools/docs.py ``` -The output will be in `documentation/_build/html`. +The output will be in `docs-test`. For more options, run `python build-tools/docs.py --help`. + +> [!NOTE] +> Since the documentation build process relies on a working `mmif` package, one must "build" the package first before building the documentation. This can be done by running +> ```bash +> rm VERSION* # remove existing VERSION file if exists +> make devversion # creates a dummy VERSION file +> pip install -r requirements.dev # install dev dependencies +> python setup.py sdist # build the package (will download auto-generated subpackages like `mmif.res` and `mmif.ver`) + +> [!NOTE] +> running `build-tools/docs.py` in "local testing" mode will overwrite any existing VERSION file with a dummy version. + +### API Documentation (autodoc) + +As of 2026 (since the next version of 1.2.1), API documentation is **automatically generated** using `sphinx-apidoc`. When you run the documentation build: + +1. The `run_apidoc()` function in `documentation/conf.py` runs automatically +2. It scans packages listed in `apidoc_package_names` (currently `mmif` and `mmif_docloc_http`) +3. RST files are generated in `documentation/autodoc/` +4. 
These files are **not tracked in git** - they're regenerated on each build + +**When you add a new module or subpackage**, it will be automatically documented on the next build. No manual updates required. + +**To add a new top-level package** (like `mmif_docloc_http`), add it to `apidoc_package_names` in `documentation/conf.py`. + +**To exclude a subpackage** from documentation (like `mmif.res` or `mmif.ver`), add it to `apidoc_exclude_paths`. + +**Module docstrings** in `__init__.py` files are used as package descriptions in the documentation. Keep them concise and informative. ### Building Documentation for Old Versions diff --git a/Makefile b/Makefile index ec63ccf1..bac1919e 100644 --- a/Makefile +++ b/Makefile @@ -36,17 +36,12 @@ publish: distclean version package test $(generatedcode): dist/$(sdistname)*.tar.gz docs: - @echo "WARNING: The 'docs' target is deprecated and will be removed." - @echo "The 'docs' directory is no longer used. Documentation is now hosted in the central CLAMS documentation hub." - @echo "Use 'make doc' for local builds or 'make doc-version' for specific versions." - @echo "Nothing is done." + @echo "The 'docs' target is deprecated and will be removed." + @echo "Documentation is now managed by 'build-tools/docs.py'." + @echo "Please run 'python3 build-tools/docs.py --help' for usage." 
-doc: # for single version sphinx - builds current source - python3 build-tools/docs.py - -doc-version: # interactive build for specific version - @read -p "Enter version/tag to build (e.g., v1.0.0): " ver; \ - [ -n "$$ver" ] && python3 build-tools/docs.py --build-ver $$ver +doc: docs +doc-version: docs package: VERSION dist/$(sdistname)*.tar.gz @@ -85,15 +80,15 @@ version: VERSION; cat VERSION # since the GH api will return tags in chronological order, we can just grab the last one without sorting AUTH_ARG := $(if $(GITHUB_TOKEN),-H "Authorization: token $(GITHUB_TOKEN)") -VERSION.dev: devver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif-python/git/refs/tags" | grep '"ref":' | sed -E 's/.+refs\/tags\/([0-9.]+)",/\1/g' | tail -n 1) -VERSION.dev: specver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif/git/refs/tags" | grep '"ref":' | grep -v 'py-' | sed -E 's/.+refs\/tags\/(spec-)?([0-9.]+)",/\2/g' | tail -n 1) +VERSION.dev: devver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif-python/git/refs/tags" | grep '"ref":' | sed -E 's/.+refs\/tags\/([0-9.]+)",/\1/g' | sort -V | tail -n 1) +VERSION.dev: specver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif/git/refs/tags" | grep '"ref":' | grep -v 'py-' | sed -E 's/.+refs\/tags\/(spec-)?([0-9.]+)",/\2/g' | sort -V | tail -n 1) VERSION.dev: @echo DEVVER: $(devver) @echo SPECVER: $(specver) @if [ $(call macro,$(devver)) = $(call macro,$(specver)) ] && [ $(call micro,$(devver)) = $(call micro,$(specver)) ] ; \ then \ if [[ $(devver) == *.dev* ]]; then echo $(call increase_dev,$(devver)) ; else echo $(call add_dev,$(call increase_patch, $(devver))); fi \ - else echo $(call add_dev,$(specver)) ; fi \ + else if [[ $(devver) == *.dev* ]]; then echo $(call increase_dev,$(devver)) ; else echo $(call add_dev,$(call increase_patch, $(devver))); fi ; fi \ > VERSION.dev VERSION: version 
:= $(shell git tag | sort -t. -k 1,1nr -k 2,2nr -k 3,3nr -k 4,4nr | head -n 1) diff --git a/README.md b/README.md index 1cd1070a..a2e57601 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,20 @@ ## MultiMedia Interchange Format -[MMIF](https://mmif.clams.ai) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications in [CLAMS project](https://clams.ai). + +[MMIF](https://mmif.clams.ai) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the [CLAMS project](https://clams.ai). ## mmif-python -`mmif-python` is a Python implementation of the MMIF data format. -`mmif-python` provides various helper classes and functions to handle MMIF JSON in Python, -including ; -1. de-/serialization of MMIF internal data structures to/from JSON +`mmif-python` is a Python implementation of the MMIF data format. It provides various helper classes and functions to handle MMIF JSON in Python, including: + +1. serialization and de-serialization of MMIF internal data structures to/from JSON 2. validation of MMIF JSON 3. handling of CLAMS vocabulary types -4. navigation of MMIF object via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type))`) +4. navigation of MMIF objects via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type)`) ## For more ... + * [Version history and patch notes](https://github.com/clamsproject/mmif-python/blob/main/CHANGELOG.md) -* [MMIF Python API documentation](https://clamsproject.github.io/mmif-python) +* [MMIF Python API documentation](https://clamsproject.github.io/mmif-python/latest) * [MMIF JSON specification and schema](https://clamsproject.github.io/mmif) - -## For devs ... 
-* Build documentation: `python build-tools/docs.py --help` +* [Contributing guide](CONTRIBUTING.md) diff --git a/build-tools/docs.py b/build-tools/docs.py index ee5d4550..eaea47e3 100644 --- a/build-tools/docs.py +++ b/build-tools/docs.py @@ -40,6 +40,19 @@ def run_sphinx_build(self, *args, cwd=None, check=True): return run_command([self.sphinx_build, *args], cwd=cwd, check=check) +def get_dummy_version(): + """Returns a dummy version based on current git branch and dirty status. + Falls back to 'unknown' if not in a git repository.""" + try: + branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], + stderr=subprocess.DEVNULL, text=True).strip() + dirty = subprocess.run(["git", "diff", "--quiet"], + stderr=subprocess.DEVNULL, check=False).returncode != 0 + return f"{branch}{'+dirty' if dirty else ''}" + except (subprocess.CalledProcessError, FileNotFoundError): + return "unknown" + + def build_docs_local(source_dir: Path, output_dir: Path): """ Builds documentation for the provided source directory. @@ -47,6 +60,18 @@ def build_docs_local(source_dir: Path, output_dir: Path): """ print("--- Running in Local Build Mode ---") + # Warning for user as VERSION file is critical + if sys.stdin.isatty(): + import select + print("\nWARNING: The 'VERSION' file will be overwritten with a dummy version for this local build.") + print("Pausing for 3 seconds (press Enter to continue immediately)...") + select.select([sys.stdin], [], [], 3) + + # Overwrite VERSION file with dummy version for local builds + version = get_dummy_version() + print(f"Generating dummy VERSION for local build: {version}") + (source_dir / "VERSION").write_text(version) + # 1. Generate source code and install in editable mode. 
print("\n--- Step 1: Generating source code and installing in editable mode ---") try: diff --git a/build-tools/requirements.docs.txt b/build-tools/requirements.docs.txt index 8d9ee33d..db2d03d8 100644 --- a/build-tools/requirements.docs.txt +++ b/build-tools/requirements.docs.txt @@ -1,3 +1,4 @@ -sphinx>=7.0,<8.0 +sphinx furo m2r2 +autodoc-pydantic diff --git a/documentation/autodoc/mmif.serialize.rst b/documentation/autodoc/mmif.serialize.rst deleted file mode 100644 index e58e0c24..00000000 --- a/documentation/autodoc/mmif.serialize.rst +++ /dev/null @@ -1,37 +0,0 @@ -mmif.serialize package -====================== - -Core package to provide serialization and deserialization of MMIF format. - -``model`` module ---------------------------- - -.. automodule:: mmif.serialize.model - :members: - :undoc-members: - :show-inheritance: - -``mmif`` module --------------------------- - -.. automodule:: mmif.serialize.mmif - :members: - :undoc-members: - :show-inheritance: - -``view`` module --------------------------- - -.. automodule:: mmif.serialize.view - :members: - :undoc-members: - :show-inheritance: - -``annotation`` module --------------------------------- - -.. automodule:: mmif.serialize.annotation - :members: - :undoc-members: - :show-inheritance: - diff --git a/documentation/autodoc/mmif.utils.rst b/documentation/autodoc/mmif.utils.rst deleted file mode 100644 index 8bd90cfd..00000000 --- a/documentation/autodoc/mmif.utils.rst +++ /dev/null @@ -1,49 +0,0 @@ -mmif.utils package -================== - -Package containing utility modules for handling different types of source -documents, and general implementation of common data structures and -algorithms. - -Submodules ----------- - -``video_document_helper`` module --------------------------------- - -.. automodule:: mmif.utils.video_document_helper - :members: - :undoc-members: - :show-inheritance: - -``text_document_helper`` module -------------------------------- - -.. 
automodule:: mmif.utils.text_document_helper - :members: - :undoc-members: - :show-inheritance: - -``timeunit_helper`` module -------------------------------- - -.. automodule:: mmif.utils.timeunit_helper - :members: - :undoc-members: - :show-inheritance: - -``sequence_helper`` module --------------------------- - -.. automodule:: mmif.utils.sequence_helper - :members: - :undoc-members: - :show-inheritance: - -``workflow_helper`` module --------------------------- - -.. automodule:: mmif.utils.workflow_helper - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/documentation/autodoc/mmif.vocabulary.rst b/documentation/autodoc/mmif.vocabulary.rst deleted file mode 100644 index 0eb985b5..00000000 --- a/documentation/autodoc/mmif.vocabulary.rst +++ /dev/null @@ -1,28 +0,0 @@ -mmif.vocabulary package -======================= - -Package contains Enum-like classes for CLAMS vocabulary. - -.. autoclass:: mmif.vocabulary.ThingTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.ThingType - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: mmif.vocabulary.ClamsTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.AnnotationTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.DocumentTypesBase - :show-inheritance: - -.. autoclass:: mmif.vocabulary.AnnotationTypes - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: mmif.vocabulary.DocumentTypes - :members: - :undoc-members: - :show-inheritance: diff --git a/documentation/autodoc/mmif_docloc_http.rst b/documentation/autodoc/mmif_docloc_http.rst deleted file mode 100644 index b76c8df5..00000000 --- a/documentation/autodoc/mmif_docloc_http.rst +++ /dev/null @@ -1,11 +0,0 @@ -mmif_docloc_http package -======================== - -MMIF document location helper module for `http` and `https` schemes. -If you want to write your own docloc scheme handler, please use the source code of this module as a reference. 
-See this :ref:`plug-in section ` for more information. - -.. automodule:: mmif_docloc_http - :members: - :undoc-members: - :show-inheritance: diff --git a/documentation/cli.rst b/documentation/cli.rst index 8a2f6836..481df49d 100644 --- a/documentation/cli.rst +++ b/documentation/cli.rst @@ -1,16 +1,16 @@ .. _cli: -``mmif`` shell command -====================== +The ``mmif`` shell command +========================== ``mmif-python`` comes with a command line interface (CLI) that allows you to handle MMIF files. Many of these commands are designed to handle MMIF files in the context of CLAMS workflows. -The CLI is installed as ``mmif`` shell command. To see the available commands, run +The CLI scripts are installed as subcommands of the ``mmif`` shell command. Run the following to see the available commands or the MMIF version: -.. code-block:: bash +.. include:: cli_help.rst - mmif --help +Please take a look at the individual command documentation for more details on each command: -The following documentation is automatically generated from the CLI help messages. +.. code-block:: text -.. include:: cli_help.rst + $ mmif --help diff --git a/documentation/conf.py b/documentation/conf.py index 2b8a027d..81b989ba 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -6,8 +6,12 @@ import inspect import textwrap import os +import re import sys from pathlib import Path +from sphinx.util import logging + +logger = logging.getLogger(__name__) # -- Path setup -------------------------------------------------------------- # Add project root to sys.path so that autodoc can find the mmif package. 
@@ -17,19 +21,33 @@ # At this point, `pip install -e .` should have been run, so mmif is importable import mmif +# apidoc settings +apidoc_package_names = ['mmif', 'mmif_docloc_http'] +apidoc_exclude_paths = [ + proj_root_dir / 'mmif' / 'res', + proj_root_dir / 'mmif' / 'ver', +] +# this is used by sphinx.ext.autodoc +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'show-inheritance': True, +} +autodoc_member_order = 'bysource' + + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'mmif-python' blob_base_url = f'https://github.com/clamsproject/{project}/blob' -copyright = f'{datetime.date.today().year}, Brandeis LLC' author = 'Brandeis LLC' +copyright = f'{datetime.date.today().year}, {author}' try: version = open(proj_root_dir / 'VERSION').read().strip() except FileNotFoundError: - print("WARNING: VERSION file not found, using 'dev' as version.") + logger.warning("VERSION file not found, using 'dev' as version.") version = 'dev' -release = version # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -38,8 +56,16 @@ 'sphinx.ext.autodoc', 'sphinx.ext.linkcode', 'm2r2', + 'sphinxcontrib.autodoc_pydantic', ] +autodoc_pydantic_model_show_json = True +autodoc_pydantic_model_show_field_summary = True +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_field_list_validators = False + templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # dynamically generated files @@ -64,7 +90,6 @@ "source_repository": "https://github.com/clamsproject/mmif-python", "source_branch": "main", # Default branch for "Edit on GitHub" links "source_directory": 
"documentation/", - # CLAMS brand colors "light_css_variables": { "color-brand-primary": "#008AFF", @@ -142,7 +167,7 @@ def update_target_versions(app): return # Insert new version - print(f"Updating target-versions.csv: {current_ver} -> {spec_ver}") + logger.info(f"Updating target-versions.csv: {current_ver} -> {spec_ver}") lines.insert(1, f'{current_ver},"{spec_ver}"\n') with open(csv_path, 'w') as f: @@ -150,53 +175,45 @@ def update_target_versions(app): def generate_cli_rst(app): - from mmif import prep_argparser_and_subcmds, find_all_modules + from mmif import prep_argparser_and_subcmds # Generate main help os.environ['COLUMNS'] = '100' - parser, subparsers = prep_argparser_and_subcmds() + parser, _, _ = prep_argparser_and_subcmds() help_text = parser.format_help() content = [] - content.append('Main Command\n') - content.append('------------\n\n') content.append('.. code-block:: text\n\n') + content.append(' $ mmif --help\n') content.append(textwrap.indent(help_text, ' ')) content.append('\n\n') - # Generate subcommand help - for cli_module in find_all_modules('mmif.utils.cli'): - cli_module_name = cli_module.__name__.rsplit('.')[-1] - subparser = cli_module.prep_argparser(prog=f'mmif {cli_module_name}') - sub_help = subparser.format_help() - - content.append(f'{cli_module_name}\n') - content.append('-' * len(cli_module_name) + '\n\n') - content.append('.. code-block:: text\n\n') - content.append(textwrap.indent(sub_help, ' ')) - content.append('\n\n') + # No longer generate subcommand help with open(proj_root_dir / 'documentation' / 'cli_help.rst', 'w') as f: f.write(''.join(content)) def generate_whatsnew_rst(app): + """ + Create the documentation/whatsnew.md file by pulling out the changes for the + current version from the changelog file. 
+ """ + changelog_path = proj_root_dir / 'CHANGELOG.md' output_path = proj_root_dir / 'documentation' / 'whatsnew.md' if not changelog_path.exists(): - print(f"WARNING: CHANGELOG.md not found at {changelog_path}") + logger.warning(f"CHANGELOG.md not found at {changelog_path}") with open(output_path, 'w') as f: f.write("") return - import re - content = [] found_version = False version_header_re = re.compile(r'^## releasing\s+([^\s]+)\s*(\(.*\))?') - print(f"DEBUG: Looking for version '{version}' in CHANGELOG.md") + logger.debug(f"Looking for version '{version}' in CHANGELOG.md") with open(changelog_path, 'r') as f: lines = f.readlines() @@ -216,9 +233,9 @@ def generate_whatsnew_rst(app): content.append(line) if not found_version: - print(f"NOTE: No changelog entry found for version {version}") + logger.info(f"No changelog entry found for version {version}") with open(output_path, 'w') as f: - f.write("") + f.write(f"### nothing new in {version}\nDid you locally build for testing?") else: # Dump matched markdown content directly to whatsnew.md with open(output_path, 'w') as f: @@ -226,10 +243,44 @@ def generate_whatsnew_rst(app): f.writelines(content) +def run_apidoc(app): + """ + Run sphinx-apidoc to auto-generate RST files for all modules. + This ensures new modules are automatically documented without manual updates. + """ + from sphinx.ext.apidoc import main as apidoc_main + + docs_dir = Path(__file__).parent + output_dir = docs_dir / 'autodoc' + + exclude_paths = map(str, apidoc_exclude_paths) + + # Run sphinx-apidoc for each package specified in package_names + # apidoc_main() accepts argv-style arguments (without the program name) + for package_name in apidoc_package_names: + package_dir = proj_root_dir / package_name + if not package_dir.exists(): + logger.warning(f"Package directory {package_dir} does not exist. 
" + f"Skipping apidoc for {package_name}.") + continue + + args = [ + '-o', str(output_dir), + str(package_dir), + *exclude_paths, + '--force', # Overwrite existing files + '--module-first', # Put module docs before submodule docs + '--no-toc', # Don't create modules.rst, will be overwriting each other's + ] + logger.info(f"Running sphinx-apidoc with args: {args}") + apidoc_main(args) + + def setup(app): try: + app.connect('builder-inited', run_apidoc) app.connect('builder-inited', update_target_versions) app.connect('builder-inited', generate_cli_rst) app.connect('builder-inited', generate_whatsnew_rst) except ImportError: - print("WARNING: 'mmif' package not found. Skipping dynamic generation of parts of documentation.") + logger.warning("'mmif' package not found. Skipping dynamic generation of parts of documentation.") diff --git a/documentation/index.rst b/documentation/index.rst index ddbf0691..05d93a48 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -1,7 +1,7 @@ -Welcome to mmif-python's documentation! -======================================= +MMIF Python SDK +=============== -.. mdinclude:: ../README.md +This is the documentation for the mmif-python package, a Python implementation for the MultiMedia Interchange Format (MMIF). MMIF is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the CLAMS project. For descriptions of the CLAMS project and the MMIF format see https://clams.ai and https://mmif.clams.ai. The GitHub repository for the package is at https://github.com/clamsproject/mmif-python. ---- @@ -15,19 +15,20 @@ Welcome to mmif-python's documentation! introduction cli + summarizer plugins target-versions .. 
toctree:: :maxdepth: 2 - :caption: API documentation: + :caption: API Documentation - modules + autodoc/mmif + autodoc/mmif_docloc_http -Indices and tables -================== +Indices +------- * :ref:`genindex` * :ref:`modindex` -* :ref:`search` diff --git a/documentation/introduction.rst b/documentation/introduction.rst index 95508f3c..c9f63e98 100644 --- a/documentation/introduction.rst +++ b/documentation/introduction.rst @@ -11,10 +11,12 @@ MultiMedia Interchange Format (MMIF) is a JSON(-LD)-based data format designed f This documentation focuses on Python implementation of the MMIF. To learn more about the data format specification, please visit the `MMIF website `_. ``mmif-python`` is a public, open source implementation of the MMIF data format. ``mmif-python`` supports serialization/deserialization of MMIF objects from/to Python objects, as well as many navigation and manipulation helpers for MMIF objects. + Prerequisites ------------- -* `Python `_: the latest ``mmif-python`` requires Python 3.8 or newer. We have no plan to support `Python 2.7 `_. +* `Python `_: the latest ``mmif-python`` requires Python 3.10 or newer. + Installation --------------- @@ -25,20 +27,20 @@ Package ``mmif-python`` is distributed via the official PyPI. Users are supposed pip install mmif-python -This will install a package `mmif` to local python. +This will install a package `mmif` to your local python library. The MMIF format and specification is evolving over time, and ``mmif-python`` package will be updated along with the changes in MMIF format. -.. note:: MMIF format is not always backward-compatible. To find out more about relations between MMIF specification versions and ``mmif-python`` versions, please take time to read our decision on the subject `here `_. If you need to know which python SDK supports which specification version, see :ref:`target-versions` page. +.. note:: The MMIF format is not always backward-compatible. 
To find out more about relations between MMIF specification versions and ``mmif-python`` versions, please take time to read our decision on the subject `here `_. If you need to know which python SDK supports which specification version, see :ref:`target-versions` page. + MMIF Serialization --------------------------- -:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. For subcomponents of the MMIF (view objects, annotation objects, metadata for each object) are all subclass of :class:`mmif.serialize.model.MmifObject`, including the :class:`mmif.serialize.mmif.Mmif`. To start with an existing MMIF :class:`str`, simple initiate a new ``Mmif`` object with the file. +:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. Subcomponents of the MMIF object (views, annotation objects and metadata for each object) and the MMIF object itself are all subclasses of :class:`mmif.serialize.model.MmifObject`. To start with an existing MMIF :class:`str`, simply initiate a new ``Mmif`` object with that string. .. code-block:: python - import mmif from mmif import Mmif mmif_str = """{ @@ -64,13 +66,14 @@ MMIF Serialization } ], "views": []}""" + mmif_obj = Mmif(mmif_str) Few notes; -#. MMIF does not carry the primary source files in it. -#. MMIF encode the specification version at the top. As not all MMIF versions are backward-compatible, a version ``mmif-python`` implementation of the MMIF might not be able to load an unsupported version of MMIF string. +#. MMIF objects do not carry the primary source files in it (although there are exceptions for text documents). +#. MMIF objects specify the MMIF version at the top. As not all MMIF versions are backward-compatible, a version of the ``mmif-python`` implementation might not be able to load an unsupported MMIF versions. When serializing back to :class:`str`, call :meth:`mmif.serialize.model.MmifObject.serialize` on the object. 
@@ -81,11 +84,12 @@ To get subcomponents, you can use various getters implemented in subclasses. For from mmif.vocabulary.document_types import DocumentTypes for video in mmif_obj.Mmif.get_documents_by_type(DocumentTypes.VideoDocument): - with open(video.location_path(), 'b') as in_video: - # do something with the video file + with open(video.location_path(), 'b') as in_video: + # do something with the video file + +For a full list of available helper methods, please refer to the API documentation pages (See left sidebar). -For a full list of available helper methods, please refer to :ref:`the API documentation `. MMIF usage in CLAMS Workflows ----------------------------- diff --git a/documentation/modules.rst b/documentation/modules.rst deleted file mode 100644 index 4bb9307d..00000000 --- a/documentation/modules.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _apidoc: - -mmif package -============ - -.. toctree:: - :maxdepth: 4 - - autodoc/mmif.serialize - autodoc/mmif.vocabulary - autodoc/mmif.utils - -mmif_docloc_http package -======================== - -.. toctree:: - :maxdepth: 2 - - autodoc/mmif_docloc_http - diff --git a/documentation/plugins.rst b/documentation/plugins.rst index 1af39426..50af8876 100644 --- a/documentation/plugins.rst +++ b/documentation/plugins.rst @@ -1,8 +1,7 @@ .. _plugins: -Developing plugins for MMIF Python SDK -====================================== - +Developing plugins for the MMIF Python SDK +========================================== Overview -------- @@ -50,7 +49,7 @@ Here's a minimal example codebase that you refer to when you develop a ``docloc` β”œβ”€β”€ pyproject.toml └── setup.cfg - $ cat pyproject.toml + $ cat pyproject.toml [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" @@ -80,10 +79,41 @@ And the plugin code. 
def help(): return "location format: `.video`" +Built-in Document Location Scheme Plugins +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +At the moment, the ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. This plugin implements caching as described above, so repeated access to the same URL will not trigger multiple downloads. +Take a look at the :mod:`mmif_docloc_http` module for details. +Caching for Remote File Access +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Bulit-in Document Location Scheme Plugins -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +When developing plugins that resolve remote document locations (e.g., ``http``, ``s3``, or custom schemes), it is highly recommended to implement caching to avoid repeated network requests or file downloads. Since ``mmif-python`` may call the ``resolve`` function multiple times for the same document location during processing, caching can significantly improve performance. + +A simple and effective approach is to use a module-level dictionary as a cache. Because Python modules are singletons (loaded once and cached in ``sys.modules``), this cache persists for the entire lifetime of the Python process, across multiple MMIF files and Document objects. + +Here's an example of how to implement caching in a plugin: + +.. code-block:: python + + # mmif_docloc_myscheme/__init__.py + + _cache = {} + + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] + + # ... your resolution logic here ... + resolved_path = do_actual_resolution(docloc) + + _cache[docloc] = resolved_path + return resolved_path + +This pattern ensures that: + +* The first call to ``resolve`` performs the actual resolution (download, API call, etc.) 
+* Subsequent calls for the same location return the cached result immediately +* The cache is shared across all MMIF objects processed within the same Python process -At the moment, ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. -Take a look at :mod:`mmif_docloc_http` module for details. +See :mod:`mmif_docloc_http` for a concrete example of this caching strategy in action. diff --git a/documentation/summarizer.rst b/documentation/summarizer.rst new file mode 100644 index 00000000..eaef8970 --- /dev/null +++ b/documentation/summarizer.rst @@ -0,0 +1,32 @@ +.. _summarizer: + + +MMIF Summarizer +=============== + +The Summarizer is a MMIF consumer that creates a JSON summary from a MMIF file. It +makes some simplifying assumptions, including: + +- There is one video in the MMIF documents list. All start and end properties + are pointing to that video. +- The time unit is assumed to be milliseconds. + + +The summarizer is accessible via the ``mmif`` command line script. To run the +summarizer over a MMIF file and write the JSON summary to OUTFILE: + +.. code-block:: bash + + mmif summarize -i INFILE -o OUTFILE + +In all cases, the summarizer summarizes only the information that is there, it +does not fix any mistakes and in general it does not add any information that is +not explicitly or implicitly in the MMIF file. In rare cases some information is +added, for example if an ASR tool does not group tokens in sentence-like objects +then the summarizer will do that, but then only by creating token groups of the +same length. + +The summary includes the MMIF version, the list of documents, a summary of the +metadata of all views (identifier, CLAMS app, timestamp, total number of +annotations and number of annotations per type, it does not show parameters and +application configuration), time frames, transcript, captions and entities. 
\ No newline at end of file diff --git a/documentation/target-versions.rst b/documentation/target-versions.rst index 34216d3e..9d89717d 100644 --- a/documentation/target-versions.rst +++ b/documentation/target-versions.rst @@ -3,7 +3,7 @@ Target MMIF Versions ==================== -This article provides targeting MMIF specification versions of different versions of ``mmif-python`` SDK. +This page lists targeting MMIF specification versions for different versions of the ``mmif-python`` SDK. .. csv-table:: Target Specification Versions :file: target-versions.csv diff --git a/mmif/__init__.py b/mmif/__init__.py index 6fde82fe..513ecd86 100644 --- a/mmif/__init__.py +++ b/mmif/__init__.py @@ -34,28 +34,28 @@ def find_all_modules(pkgname): def prep_argparser_and_subcmds(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(prog='mmif') parser.add_argument( '-v', '--version', action='version', version=version_template.format(__version__, __specver__) ) subparsers = parser.add_subparsers(title='sub-command', dest='subcmd') - return parser, subparsers - - -def cli(): - parser, subparsers = prep_argparser_and_subcmds() - cli_modules = {} + subcmds = {} for cli_module in find_all_modules('mmif.utils.cli'): cli_module_name = cli_module.__name__.rsplit('.')[-1] - cli_modules[cli_module_name] = cli_module + subcmds[cli_module_name] = cli_module subcmd_parser = cli_module.prep_argparser(add_help=False) subparsers.add_parser(cli_module_name, parents=[subcmd_parser], help=cli_module.describe_argparser()[0], description=cli_module.describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, ) + return parser, subparsers, subcmds + + +def cli(): + parser, subparsers, cli_modules = prep_argparser_and_subcmds() if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) diff --git a/mmif/serialize/__init__.py b/mmif/serialize/__init__.py index 18523bac..06964253 100644 --- a/mmif/serialize/__init__.py +++ b/mmif/serialize/__init__.py @@ -1,3 
+1,7 @@ +""" +Core package to provide serialization and deserialization of MMIF format. +""" + from .annotation import * from .annotation import __all__ as anno_all from .mmif import * diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 6527f482..b7f002cd 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -374,7 +374,7 @@ def add_property(self, name: str, With the former method, the SDK will record the added property as a `Annotation` annotation object, separate from the original `Document` - object. See :meth:`.Mmif.generate_capital_annotations()` for more. + object. See :meth:`mmif.serialize.mmif.Mmif.generate_capital_annotations` for more. A few notes to keep in mind: @@ -442,7 +442,7 @@ def get(self, prop_name, default=None): See Also -------- add_property : Add a new property to the document - Mmif.generate_capital_annotations : How pending properties are serialized + mmif.serialize.mmif.Mmif.generate_capital_annotations : How pending properties are serialized """ if prop_name == 'id': # because all three dicts have `id` key as required field, we need diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 9e94496d..245c96aa 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -14,8 +14,8 @@ import math import warnings from collections import defaultdict -from datetime import datetime -from typing import List, Union, Optional, Dict, cast, Iterator +from datetime import datetime, timezone +from typing import Any, List, Union, Optional, Dict, cast, Iterator import jsonschema.validators @@ -24,7 +24,7 @@ from mmif.serialize.annotation import Annotation, Document from mmif.serialize.model import MmifObject, DataList from mmif.serialize.view import View -from mmif.vocabulary import AnnotationTypes, DocumentTypes +from mmif.vocabulary import AnnotationTypes, DocumentTypesBase __all__ = ['Mmif'] @@ -433,7 +433,7 @@ def new_view(self) -> View: """ new_view = View() new_view.id = 
self.new_view_id() - new_view.metadata.timestamp = datetime.now() + new_view.metadata.timestamp = datetime.now(timezone.utc) self.add_view(new_view) return new_view @@ -487,11 +487,11 @@ def get_documents_in_view(self, vid: Optional[str] = None) -> List[Document]: else: return [] - def get_documents_by_type(self, doc_type: Union[str, DocumentTypes]) -> List[Document]: + def get_documents_by_type(self, doc_type: DocumentTypesBase) -> List[Document]: """ Method to get all documents where the type matches a particular document type, which should be one of the CLAMS document types. - :param doc_type: the type of documents to search for, must be one of ``Document`` type defined in the CLAMS vocabulary. + :param doc_type: the type of documents to search for, must be one of ``Document`` types defined in the CLAMS vocabulary. :return: a list of documents matching the requested type, or an empty list if none found. """ docs = [] @@ -530,7 +530,7 @@ def get_documents_by_property(self, prop_key: str, prop_value: str) -> List[Docu docs.extend([document for document in self.documents if document[prop_key] == prop_value]) return docs - def get_documents_locations(self, m_type: Union[DocumentTypes, str], path_only=False) -> List[Union[str, None]]: + def get_documents_locations(self, m_type: Union[DocumentTypesBase, str], path_only=False) -> List[Union[str, None]]: """ This method returns the file paths of documents of given type. Only top-level documents have locations, so we only check them. @@ -545,7 +545,7 @@ def get_documents_locations(self, m_type: Union[DocumentTypes, str], path_only=F else: return [doc.location for doc in docs] - def get_document_location(self, m_type: Union[DocumentTypes, str], path_only=False) -> Optional[str]: + def get_document_location(self, m_type: Union[DocumentTypesBase, str], path_only=False) -> Optional[str]: """ Method to get the location of *first* document of given type. 
diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 1bec7b29..95fdc28c 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -402,7 +402,10 @@ def default(self, obj: 'MmifObject'): if hasattr(obj, '_serialize'): return obj._serialize() elif hasattr(obj, 'isoformat'): # for datetime objects - return obj.isoformat() + s = obj.isoformat() + if s.endswith('+00:00'): + s = s[:-6] + 'Z' + return s elif hasattr(obj, '__str__'): return str(obj) else: diff --git a/mmif/utils/__init__.py b/mmif/utils/__init__.py index e69de29b..fe8aea5d 100644 --- a/mmif/utils/__init__.py +++ b/mmif/utils/__init__.py @@ -0,0 +1,4 @@ +""" +Package containing utility modules for handling different types of source +documents, and general implementation of common data structures and algorithms. +""" diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 24855994..f24248f2 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -1,4 +1,226 @@ -from mmif.utils.cli import describe +""" +Package containing CLI modules. +""" + +import contextlib +import io +import os +import sys +from typing import Iterator, Optional, TextIO, Type, Union, cast, get_args, get_origin + +from pydantic import BaseModel + + +@contextlib.contextmanager +def open_cli_io_arg( + path_or_dash: Optional[str], + mode: str = "r", + encoding: Optional[str] = None, + errors: Optional[str] = None, + default_stdin: bool = False, +) -> Iterator[TextIO]: + """ + Context manager for opening files with stdin/stdout support. + + This function is intended for plain text streams (e.g. JSON/MMIF) and does + not support binary modes (e.g., 'rb', 'wb'). + + This is a native replacement for argparse.FileType which is deprecated as + of Python 3.14 due to resource leak issues. Unlike FileType, this defers + file opening until actually needed and ensures proper cleanup via context + manager. 
+ + Handles the common CLI pattern where: + + - '-' means stdin (read mode) or stdout (write mode) + - None means "argument not provided"; when default_stdin=True, it falls back + to stdin/stdout + - Regular paths open actual files with proper resource management + + :param path_or_dash: File path, '-' for stdin/stdout, or None for no argument + :param mode: File mode ('r' for reading, 'w' for writing). Binary modes are + not supported. + :param encoding: Optional file encoding + :param errors: Optional error handling strategy for encoding + :param default_stdin: If True and path_or_dash is None, default to stdin + (mode 'r') or stdout (mode 'w') + :returns: Context manager yielding text-mode file handle + :rtype: Iterator[TextIO] + + Example usage:: + + # Read from file or stdin + with open_cli_io_arg(args.input, 'r', default_stdin=True) as f: + content = f.read() + + # Write to file or stdout + with open_cli_io_arg(args.output, 'w', default_stdin=True) as f: + f.write(content) + """ + # Valid text modes for file operations + _READ_FLAGS = frozenset({"r", "+"}) + _WRITE_FLAGS = frozenset({"w", "a", "x", "+"}) + + if "b" in mode: + raise ValueError( + f"Binary mode '{mode}' is not supported. " + "Use text modes ('r', 'w', 'a', 'x') instead." 
+ ) + + needs_read = bool(set(mode) & _READ_FLAGS) + needs_write = bool(set(mode) & _WRITE_FLAGS) + + should_use_stdio = path_or_dash == "-" or (path_or_dash is None and default_stdin) + + file_handle: Optional[TextIO] = None + should_close = False + + try: + if should_use_stdio: + if needs_read and needs_write: + raise ValueError( + f"Mode '{mode}' not supported with stdin/stdout " + "(use read or write only)" + ) + + if needs_read: + # Check for missing input when stdin is a terminal + if path_or_dash is None and default_stdin and sys.stdin.isatty(): + raise SystemExit("error: No input provided.") + file_handle = sys.stdin + + elif needs_write: + file_handle = sys.stdout + + else: + raise ValueError( + f"Mode '{mode}' not supported with stdin/stdout (use 'r' or 'w')" + ) + + elif isinstance(path_or_dash, str): + if needs_read and not os.path.exists(path_or_dash): + raise FileNotFoundError(f"Input path does not exist: {path_or_dash}") + file_handle = cast( + TextIO, io.open(path_or_dash, mode, encoding=encoding, errors=errors) + ) + should_close = True + + elif path_or_dash is None: + # None without default_stdin means no file specified + raise ValueError( + "No file path provided. Use '-' for stdin/stdout or set default_stdin=True." + ) + else: + raise TypeError( + f"Invalid type for path_or_dash: {type(path_or_dash).__name__}. " + "Expected str or None." 
+ ) + + if file_handle is not None: + yield file_handle + + finally: + if should_close and file_handle is not None: + file_handle.close() + + +def generate_model_summary(model: Type[BaseModel], indent: int = 0) -> str: + lines = [] + prefix = " " * indent + + # model_fields is a dictionary of FieldInfo objects + for name, field in model.model_fields.items(): + # Get the alias if available, otherwise use the field name + field_name = field.alias if field.alias else name + + # Get type annotation + type_annotation = field.annotation + + def format_type(t) -> str: + origin = get_origin(t) + args = get_args(t) + + # Handle Optional (Union[T, None]) + if origin is Union and type(None) in args: + non_none_args = [arg for arg in args if arg is not type(None)] + if len(non_none_args) == 1: + return f"{format_type(non_none_args[0])}, optional" + + # Handle List + if origin is list: + if args: + return f"[{format_type(args[0])}]" + return "[]" + + # Handle Dict + if origin is dict: + return "obj" + + # Handle Pydantic Models (Custom Classes) + if isinstance(t, type) and issubclass(t, BaseModel): + return "obj" + + # Handle basic types and cleanup + t_str = str(t) + if t_str.startswith(" 1 + and isinstance(args[1], type) + and issubclass(args[1], BaseModel) + ): + nested_model = args[1] + + if nested_model: + lines.append(generate_model_summary(nested_model, indent + 4)) + + return "\n".join(lines) + + +# keep imports of CLI modules for historical reasons +# keep them here in the bottom to avoid circular imports from mmif.utils.cli import rewind from mmif.utils.cli import source - diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index eaf35856..b8c79ced 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -3,12 +3,18 @@ import sys import textwrap from pathlib import Path -from typing import Union +from typing import Union, cast + +from mmif.utils.cli import open_cli_io_arg, generate_model_summary -from mmif.utils.workflow_helper 
import generate_workflow_identifier, describe_single_mmif, \ - describe_mmif_collection # gen_param_hash is imported for backward compatibility -from mmif.utils.workflow_helper import generate_param_hash +from mmif.utils.workflow_helper import ( + CollectionMmifDesc, + SingleMmifDesc, + describe_mmif_collection, + describe_single_mmif, + generate_workflow_identifier, +) def get_pipeline_specs(mmif_file: Union[str, Path]): @@ -22,41 +28,30 @@ def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: import warnings warnings.warn("generate_pipeline_identifier is deprecated, use generate_workflow_identifier instead", DeprecationWarning) - return generate_workflow_identifier(mmif_file) + return cast(str, generate_workflow_identifier(mmif_file)) def describe_argparser(): - """ - Returns two strings: one-line description of the argparser, and - additional material, which will be shown in `clams --help` and - `clams --help`, respectively. - """ oneliner = ( - 'provides CLI to describe the workflow specification from a MMIF ' - 'file or a collection of MMIF files.' + 'Describe the workflow specification from a MMIF file or a ' + 'collection of MMIF files.' ) - # get and clean docstrings - single_doc = describe_single_mmif.__doc__.split(':param')[0] - single_doc = textwrap.dedent(single_doc).strip() - collection_doc = describe_mmif_collection.__doc__.split(':param')[0] - collection_doc = textwrap.dedent(collection_doc).strip() - additional = textwrap.dedent(f""" This command extracts workflow information from a single MMIF file or - summarizes a directory of MMIF files. + a directory of MMIF files. The output is serialized as JSON. + + Output Schemas: + + 1. Single MMIF File (mmif-file): +{generate_model_summary(SingleMmifDesc, indent=4)} + + 2. 
MMIF Collection (mmif-dir): +{generate_model_summary(CollectionMmifDesc, indent=4)} - ========================== - For a single MMIF file - ========================== - {single_doc} - - =============================== - For a directory of MMIF files - =============================== - {collection_doc} + Use `--help-schema` to inspect the full JSON schema for a specific output type. """) - return oneliner, oneliner + '\n\n' + additional.strip() + return oneliner, additional def prep_argparser(**kwargs): @@ -65,17 +60,17 @@ def prep_argparser(**kwargs): formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs ) + parser.add_argument( "MMIF_FILE", nargs="?", type=str, - default=None if sys.stdin.isatty() else sys.stdin, + default=None, help='input MMIF file, a directory of MMIF files, or STDIN if `-` or not provided.' ) parser.add_argument( "-o", "--output", - type=argparse.FileType("w"), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.' ) parser.add_argument( @@ -83,33 +78,43 @@ def prep_argparser(**kwargs): action="store_true", help="Pretty-print JSON output" ) + parser.add_argument( + "--help-schema", + nargs=1, + choices=["mmif-file", "mmif-dir"], + metavar="SCHEMA_NAME", + help="Print the JSON schema for the output. Options: mmif-file, mmif-dir." + ) return parser def main(args): """ - Main entry point for the describe CLI command. - - Reads a MMIF file and outputs a JSON summary containing: - - workflow_id: unique identifier for the source and app sequence - - stats: view counts, annotation counts (total/per-view/per-type), - and lists of error/warning/empty view IDs - - views: map of view IDs to app configurations and profiling data - - :param args: Parsed command-line arguments + Main block for the describe CLI command. + This function basically works as a wrapper around + :func:`describe_single_mmif` (for single file input) or + :func:`describe_mmif_collection` (for directory input). 
""" + if hasattr(args, 'help_schema') and args.help_schema is not None: + schema_name = args.help_schema[0] + if schema_name == 'mmif-file': + model_cls = SingleMmifDesc + elif schema_name == 'mmif-dir': + model_cls = CollectionMmifDesc + + schema = model_cls.model_json_schema() + print(json.dumps(schema, indent=2)) + sys.exit(0) + output = {} # if input is a directory - if isinstance(args.MMIF_FILE, str) and Path(args.MMIF_FILE).is_dir(): + if Path(str(args.MMIF_FILE)).is_dir(): output = describe_mmif_collection(args.MMIF_FILE) # if input is a file or stdin else: # Read MMIF content - if hasattr(args.MMIF_FILE, 'read'): - mmif_content = args.MMIF_FILE.read() - else: - with open(args.MMIF_FILE, 'r') as f: - mmif_content = f.read() + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_content = input_file.read() # For file input, we need to handle the path # If input is from stdin, create a temp file @@ -127,11 +132,10 @@ def main(args): tmp_path.unlink() if output: - if args.pretty: - json.dump(output, args.output, indent=2) - else: - json.dump(output, args.output) - args.output.write('\n') + # Convert Pydantic models to dicts + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + json.dump(output, output_file, indent=2 if args.pretty else None) + output_file.write('\n') if __name__ == "__main__": diff --git a/mmif/utils/cli/rewind.py b/mmif/utils/cli/rewind.py index 1e038180..8dccc63f 100644 --- a/mmif/utils/cli/rewind.py +++ b/mmif/utils/cli/rewind.py @@ -3,6 +3,7 @@ import textwrap import mmif +from mmif.utils.cli import open_cli_io_arg from mmif.utils.workflow_helper import group_views_by_app @@ -55,10 +56,6 @@ def rewind_mmif(mmif_obj: mmif.Mmif, choice: int, choice_is_viewnum: bool = True def describe_argparser(): - """ - returns two strings: one-line description of the argparser, and addition material, - which will be shown in `clams --help` and `clams --help`, respectively. 
- """ oneliner = 'provides CLI to rewind a MMIF from a CLAMS workflow.' additional = textwrap.dedent(""" MMIF rewinder rewinds a MMIF by deleting the last N views. @@ -70,12 +67,10 @@ def prep_argparser(**kwargs): parser = argparse.ArgumentParser(description=describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) parser.add_argument("MMIF_FILE", - nargs="?", type=argparse.FileType("r"), - default=None if sys.stdin.isatty() else sys.stdin, + nargs="?", type=str, default=None, help='input MMIF file path, or STDIN if `-` or not provided.') parser.add_argument("-o", "--output", - type=argparse.FileType("w"), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.') parser.add_argument("-p", '--pretty', action='store_true', help="Pretty-print rewound MMIF") @@ -88,7 +83,8 @@ def prep_argparser(**kwargs): def main(args): - mmif_obj = mmif.Mmif(args.MMIF_FILE.read()) + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_obj = mmif.Mmif(input_file.read()) if args.number == 0: # If user doesn't know how many views to rewind, give them choices. choice = prompt_user(mmif_obj) @@ -97,7 +93,8 @@ def main(args): if not isinstance(choice, int) or choice <= 0: raise ValueError(f"Only can rewind by a positive number of views. 
Got {choice}.") - args.output.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + output_file.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) if __name__ == "__main__": diff --git a/mmif/utils/cli/source.py b/mmif/utils/cli/source.py index 3abd2e1f..6c3b4b86 100644 --- a/mmif/utils/cli/source.py +++ b/mmif/utils/cli/source.py @@ -9,6 +9,7 @@ from mmif import Mmif, Document, DocumentTypes, __specver__ from mmif.serialize.mmif import MmifMetadata +from mmif.utils.cli import open_cli_io_arg __all__ = ['WorkflowSource'] @@ -214,10 +215,6 @@ def generate_source_mmif_from_file(documents, prefix=None, scheme='file', **igno def describe_argparser(): - """ - returns two strings: one-line description of the argparser, and addition material, - which will be shown in `clams --help` and `clams --help`, respectively. - """ oneliner = 'provides CLI to create a "source" MMIF json.' additional = textwrap.dedent(""" A source MMIF is a MMIF with a list of source documents but empty views. @@ -258,8 +255,7 @@ def prep_argparser(**kwargs): ) parser.add_argument( '-o', '--output', - type=argparse.FileType('w'), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.' ) scheme_help = 'A scheme to associate with the document location URI. When not given, the default scheme is `file://`.' 
@@ -279,7 +275,8 @@ def prep_argparser(**kwargs): def main(args): mmif = generate_source_mmif_from_file(windows_path=False, **vars(args)) - args.output.write(mmif) + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + output_file.write(mmif) return mmif if __name__ == '__main__': diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py new file mode 100644 index 00000000..17fe3d5d --- /dev/null +++ b/mmif/utils/cli/summarize.py @@ -0,0 +1,65 @@ +import argparse +import json +import pathlib +import tempfile + +from mmif.utils.cli import open_cli_io_arg +from mmif.utils.summarizer.summary import Summary + + +def describe_argparser() -> tuple: + oneliner = 'Create a JSON Summary for a MMIF file.' + additional = 'The output is serialized as JSON and includes various statistics and summaries of the MMIF content.' + return oneliner, oneliner + '\n\n' + additional + + +def prep_argparser(**kwargs): + """ + Create the ArgumentParser instance for the summarizer. + """ + parser = argparse.ArgumentParser(description=describe_argparser()[1], + formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) + parser.add_argument("MMIF_FILE", + nargs="?", type=str, default=None, + help='input MMIF file path, or STDIN if `-` or not provided.') + parser.add_argument("-o", "--output", + type=str, default=None, + help='output file path, or STDOUT if not provided.') + parser.add_argument("-p", "--pretty", action="store_true", + help="Pretty-print JSON output") + return parser + + +def main(args: argparse.Namespace): + """ + The main summarizer command. 
+ """ + # If a real file path is provided (not None and not '-'), pass it directly to Summary + if args.MMIF_FILE is not None and args.MMIF_FILE != "-": + mmif_summary = Summary(pathlib.Path(args.MMIF_FILE)) + output = mmif_summary.to_dict() + else: + # Fallback: read from stdin (or default input), write to a temporary file, and summarize that + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_content = input_file.read() + tmp_path = None + try: + with tempfile.NamedTemporaryFile( + mode='w', suffix='.mmif', delete=False + ) as tmp: + tmp_path = pathlib.Path(tmp.name) + tmp.write(mmif_content) + mmif_summary = Summary(tmp_path) + output = mmif_summary.to_dict() + finally: + if tmp_path and tmp_path.exists(): + tmp_path.unlink() + + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + json.dump(output, output_file, indent=2 if args.pretty else None) + + +if __name__ == "__main__": + parser = prep_argparser() + args = parser.parse_args() + main(args) diff --git a/mmif/utils/summarizer/__init__.py b/mmif/utils/summarizer/__init__.py new file mode 100644 index 00000000..bbbd9cb8 --- /dev/null +++ b/mmif/utils/summarizer/__init__.py @@ -0,0 +1,28 @@ +""" +Package containing the code to generate a summary from a MMIF file. 
+""" + + +import argparse + +from mmif.utils.summarizer.summary import Summary + + +def argparser(): + parser = argparse.ArgumentParser(description='Create a JSON Summary for a MMIF file') + parser.add_argument('-i', metavar='MMIF_FILE', help='input MMIF file', required=True) + parser.add_argument('-o', metavar='JSON_FILE', help='output JSON summary file', required=True) + return parser + + +def pp_args(args): + for a, v in args.__dict__.items(): + print(f'{a:12s} --> {v}') + + +def main(): + parser = argparser() + args = parser.parse_args() + #pp_args(args) + mmif_summary = Summary(args.i) + mmif_summary.report(outfile=args.o) diff --git a/mmif/utils/summarizer/config.py b/mmif/utils/summarizer/config.py new file mode 100644 index 00000000..f972bd97 --- /dev/null +++ b/mmif/utils/summarizer/config.py @@ -0,0 +1,69 @@ + +from mmif.vocabulary import DocumentTypes +from mmif.vocabulary import AnnotationTypes + + +# The name of CLAMS applications, used to select views and to determine whether +# the summarizer is appropriate for the app version. +# TODO: this now requires an exhaustive listing of all allowed apps and their +# versions, we need a more maintainable system. + +KALDI = [ + # The first two use MMIF 0.4 and should probably be retired + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.2', + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.3', + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/v3'] + +WHISPER = [ + 'http://apps.clams.ai/whisper-wrapper/v7', + 'http://apps.clams.ai/whisper-wrapper/v8', + 'http://apps.clams.ai/whisper-wrapper/v8-3-g737e280'] + +CAPTIONER = [ + 'http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97', + 'http://apps.clams.ai/smolvlm2-captioner'] + +NER = [ + 'http://apps.clams.ai/spacy-wrapper/v1.1', + 'http://apps.clams.ai/spacy-wrapper/v2.1'] + +SEGMENTER = 'http://apps.clams.ai/audio-segmenter' + + +# When a named entity occurs 20 times we do not want to generate 20 instances of +# it. 
If the start of the next entity occurs within the below number of +# milliseconds after the end of the previous, then it is just added to the +# previous one. Taking one minute as the default so two mentions in a minute end +# up being the same instance. This setting can be changed with the 'granularity' +# parameter. +# TODO: this seems broken + +GRANULARITY = 1000 + + +# Properties used for the summary for various tags + +DOC_PROPS = ('id', 'type', 'location') +VIEW_PROPS = ('id', 'timestamp', 'app') +TF_PROPS = ('id', 'start', 'end', 'frameType') +E_PROPS = ('id', 'group', 'cat', 'tag', 'video-start', 'video-end', 'coordinates') + + +# Names of types + +TEXT_DOCUMENT = DocumentTypes.TextDocument.shortname +VIDEO_DOCUMENT = DocumentTypes.VideoDocument.shortname +TIME_FRAME = AnnotationTypes.TimeFrame.shortname +BOUNDING_BOX = AnnotationTypes.BoundingBox.shortname +ALIGNMENT = AnnotationTypes.Alignment.shortname + +ANNOTATION = 'Annotation' +TOKEN = 'Token' +SENTENCE = 'Sentence' +PARAGRAPH = 'Paragraph' +NAMED_ENTITY = 'NamedEntity' +NOUN_CHUNK = 'NounChunk' +VERB_CHUNK = 'VerbChunk' + +TIME_BASED_INTERVALS = {TIME_FRAME} +SPAN_BASED_INTERVALS = {TOKEN, SENTENCE, PARAGRAPH, NAMED_ENTITY, NOUN_CHUNK, VERB_CHUNK} diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py new file mode 100644 index 00000000..b5ea40a2 --- /dev/null +++ b/mmif/utils/summarizer/graph.py @@ -0,0 +1,256 @@ +import sys, json +from collections import defaultdict +from operator import itemgetter +from pathlib import Path +import argparse + +from typing import Any +from mmif import Mmif + +from mmif.utils.summarizer import config +from mmif.utils.summarizer.utils import compose_id, normalize_id +from mmif.utils.summarizer.nodes import Node, Nodes, EntityNode, TimeFrameNode + + +class Graph(object): + + """ + Graph implementation for a MMIF document. Each node contains an annotation + or document. Alignments are stored separately. 
Edges between nodes are created + from the alignments and added to the Node.targets property. The first edge added + to Node.targets is the document that the Node points to (if there is one). + + The goal for the graph is to store all useful annotation and to have simple ways + to trace nodes all the way up to the primary data. + + :var mmif: the MMIF document that we are creating a graph for + :var documents: list of the top-level documents + :var nodes: dictionary of nodes, indexed on node identifier + :var alignments: list of pairs + :var token_idx: an instance of TokenIndex + + """ + + def __init__(self, mmif: Any): + # TODO: the type hint should really be "MMif | str", but pytype did not + # like that. + self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) + self.documents = [] + self.nodes = {} + self.alignments = [] + self._init_nodes() + self._init_edges() + # Third pass to add links between text elements, in particular from + # entities to tokens, adding lists of tokens to entities. + tokens = self.get_nodes(config.TOKEN) + entities = self.get_nodes(config.NAMED_ENTITY) + self.token_idx = TokenIndex(tokens) + #self.token_idx.pp() + for e in entities: + #print('>>>', e, e.anchors) + e.tokens = self.token_idx.get_tokens_for_node(e) + + def _init_nodes(self): + # The top-level documents are added as nodes, but they are also put in + # the documents list. + for doc in self.mmif.documents: + self.add_node(None, doc) + self.documents.append(doc) + # First pass over all annotations and documents in all views and save + # them in the graph. 
+ doc_ids = [d.id for d in self.documents] + for view in self.mmif.views: + for annotation in view.annotations: + normalize_id(doc_ids, view, annotation) + if annotation.at_type.shortname == config.ALIGNMENT: + # alignments are not added as nodes, but we do keep them around + self.alignments.append((view, annotation)) + else: + self.add_node(view, annotation) + + def _init_edges(self): + # Second pass over the alignments so we create edges. + for view, alignment in self.alignments: + self.add_edge(view, alignment) + + def __str__(self): + return "" % len(self.nodes) + + def add_node(self, view, annotation): + """Add an annotation as a node to the graph.""" + node = Nodes.new(self, view, annotation) + self.nodes[node.identifier] = node + + def add_edge(self, view, alignment): + source_id = alignment.properties['source'] + target_id = alignment.properties['target'] + #print(alignment.id, source_id, target_id) + source = self.get_node(source_id) + target = self.get_node(target_id) + if source is None or target is None: + print('WARNING: could not add edge ', + 'because the source and/or target does not extst') + else: + # make sure the direction goes from token or textdoc to annotation + if target.annotation.at_type.shortname in (config.TOKEN, config.TEXT_DOCUMENT): + source, target = target, source + source.targets.append(target) + source.add_anchors_from_alignment(target) + target.add_anchors_from_alignment(source) + + def get_node(self, node_id: str) -> Node | None: + """Return the Node instance from the node index.""" + return self.nodes.get(node_id) + + # def get_nodes(self, short_at_type: str, view_id : str = None): + # replaced the above because the code coverage is picky on type hints + def get_nodes(self, short_at_type: str, view_id=None): + """Get all nodes for an annotation type, using the short form. 
If a view + identifier is provided then only include nodes from that view.""" + return [node for node in self.nodes.values() + if (node.at_type.shortname == short_at_type + and (view_id is None or node.view.id == view_id))] + + def statistics(self) -> defaultdict: + """ + Collect counts for node types in each view. + """ + stats = defaultdict(int) + for node in self.nodes.values(): + stats[f'{str(node.view_id):4} {node.at_type.shortname}'] += 1 + return stats + + def trim(self, start: int, end: int): + """ + :meta private: + + Trim the graph and keep only those nodes that are included in the graph + between two timepoints (both in milliseconds). This assumes that all nodes + are anchored on the time in the audio or video stream. At the moment it + keeps all nodes that are not explicitly anchored. Private for now because + it is still useless. + """ + remove = set() + for node_id, node in self.nodes.items(): + if 'time-point' in node.anchors: + if not start <= node.anchors['time-point'] <= end: + remove.add(node_id) + if 'time-offsets' in node.anchors: + p1, p2 = node.anchors['time-offsets'] + if not (start <= p1 <= end and start <= p2 <= end): + remove.add(node_id) + new_nodes = [n for n in self.nodes.values() if not n.identifier in remove] + self.nodes = { node.identifier: node for node in new_nodes } + + def pp(self, fname=None, skip_timepoints=False): + """ + :meta private: + """ + fh = sys.stdout if fname is None else open(fname, 'w') + fh.write("%s\n" % self) + for view in self.mmif.views: + fh.write(" \n" % (view.id, str(view.metadata['app']))) + for node_id, node in self.nodes.items(): + if node.at_type.shortname == 'TimePoint': + continue + fh.write(" %-40s" % node) + targets = [str(t) for t in node.targets] + fh.write(' --> [%s]\n' % ' '.join(targets)) + + def pp_statistics(self): + """ + :meta private: + """ + stats = self.statistics() + for at_type in sorted(stats): + print(f'{at_type:20} {stats[at_type]:>5}') + + +class TokenIndex(object): + + """ + The 
tokens are indexed on the identifier on the TextDocument that they occur + in and for each text document we have a list of pairs + + .. code-block:: python + + {'v_4:td1': [ + ((0, 5), ), + ((5, 6), ), + ...] + } + + """ + + # TODO: + # - Benchmark get_tokens_for_node(). I may want to use something like this + # to determine enclosed nodes and enclosing nodes and that may blow up since + # that would be O(n^2). If it does matter, probably start using binary search + # or add an index from character offset to nodes. + # - It is also not sure whether we still need this since the new spaCy gives + # targets to tokens. + + def __init__(self, tokens): + self.tokens = {} + self.token_count = len(tokens) + for t in tokens: + tup = ((t.properties['start'], t.properties['end']), t) + self.tokens.setdefault(t.document.identifier, []).append(tup) + # Make sure the tokens for each document are ordered. + for document, token_list in self.tokens.items(): + self.tokens[document] = sorted(token_list, key=itemgetter(0)) + # In some cases there are two tokens with identical offset (for example + # with tokenization from both Kaldi and spaCy, not sure what to do with + # these, but should probably be more careful on what views to access + + def __len__(self): + return self.token_count + + def __str__(self): + return f'' + + def get_tokens_for_node(self, node: Node): + """Return all tokens included in the span of a node.""" + doc = node.document.identifier + try: + start = node.properties['start'] + end = node.properties['end'] + except KeyError: + start, end = node.anchors['text-offsets'] + tokens = [] + for (t_start, t_end), token in self.tokens.get(doc, []): + if t_start >= start and t_end <= end: + tokens.append(token) + return tokens + + def pp(self, fname=None): + fh = sys.stdout if fname is None else open(fname, 'w') + for document in self.tokens: + fh.write("\n[%s] -->\n" % document) + for t in self.tokens[document]: + fh.write(' %s %s\n' % (t[0], t[1])) + + + +if __name__ == 
'__main__': + + graph = Graph(open(sys.argv[1]).read()) + print(graph) + #graph.pp() + #graph.nodes['v_7:st12'].pp() + #graph.nodes['v_2:s1'].pp() + #graph.nodes['v_4:tf1'].pp() + exit() + for node in graph.nodes.values(): + print(node.at_type.shortname, node.identifier, node.anchors) + + +''' + +Printing some graphs: + +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-1-full -p -a -v +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-2-no-view-links -p -a +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-3-no-anchor-to-doc -p + +''' diff --git a/mmif/utils/summarizer/nodes.py b/mmif/utils/summarizer/nodes.py new file mode 100644 index 00000000..53201022 --- /dev/null +++ b/mmif/utils/summarizer/nodes.py @@ -0,0 +1,370 @@ +import json + +from typing import Any + +from mmif.utils.summarizer import config + + + +class Node(object): + + def __init__(self, graph, view, annotation): + self.graph = graph + self.view = view + self.view_id = None if self.view is None else self.view.id + self.annotation = annotation + # copy some information from the Annotation + self.at_type = annotation.at_type + self.identifier = annotation.id + self.properties = json.loads(str(annotation.properties)) + # get the document from the view or the properties + self.document = self._get_document() + # The targets property contains a list of annotations or documents that + # the node content points to. This includes the document the annotation + # points to as well as the alignment from a token or text document to a + # bounding box or time frame (which is added later). + # TODO: the above does not seem to be true since there is no evidence of + # data from alignments being added. 
+ self.targets = [] if self.document is None else [self.document] + self.anchors = {} + self.add_local_anchors() + self.add_anchors_from_targets() + + def __str__(self): + anchor = '' + if self.at_type.shortname == config.TOKEN: + anchor = " %s:%s '%s'" % (self.properties['start'], + self.properties['end'], + self.properties.get('text','').replace('\n', '\\n')) + return "<%s %s%s>" % (self.at_type.shortname, self.identifier, anchor) + + def add_local_anchors(self): + """Get the anchors that you can get from the annotation itself, which + includes the start and end offsets, the coordinates, the timePoint of + a BoundingBox and any annotation with targets.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'start' in props and 'end' in props: + # TimeFrame is the only non-character based interval so this simple + # if-then-else should work + if attype == config.TIME_FRAME: + self.anchors['text-offsets'] = (props['start'], props['end']) + else: + self.anchors['time-offsets'] = (props['start'], props['end']) + if 'coordinates' in props: + self.anchors['coordinates'] = props['coordinates'] + if 'timePoint' in props: + self.anchors['time-point'] = props['timePoint'] + if 'targets' in props: + self.anchors['targets'] = props['targets'] + + def add_anchors_from_targets(self): + """Get start and end offsets or timePoints from the targets and add them to + the anchors, but only if there were no anchors on the node already. 
This has + two cases: one for TimeFrames and one for text intervals.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'targets' in props: + try: + t1 = self.graph.nodes[props['targets'][0]] + t2 = self.graph.nodes[props['targets'][-1]] + if attype == config.TIME_FRAME: + if not 'time-offsets' in props: + self.anchors['time-offsets'] = ( + t1.properties['timePoint'], t2.properties['timePoint']) + else: + if not 'text-offsets' in props: + self.anchors['text-offsets'] = ( + t1.properties['start'], t2.properties['end']) + except IndexError: + print(f'WARNING: Unexpected empty target list for {self.identifier}') + + def add_anchors_from_alignment(self, target: Any, debug=False): + if target is None: + return + source_attype = self.at_type.shortname + target_attype = target.at_type.shortname + if debug: + print('\n@ DEBUG SOURCE->TARGET ', source_attype, target_attype) + print('@ DEBUG SOURCE.PROPS ', list(self.properties.keys())) + print('@ DEBUG TARGET.PROPS ', list(target.properties.keys())) + print('@ DEBUG TARGET.ANCHORS ', target.anchors) + # If a TextDocument is aligned to a BoundingBox then we grab the coordinates + # TODO: how are we getting the time point? + if source_attype == 'TextDocument' and target_attype == 'BoundingBox': + if 'coordinates' in target.properties: + self.anchors['coordinates'] = target.properties['coordinates'] + #print(source_attype, self.anchors) + elif source_attype == 'BoundingBox' and target_attype == 'TextDocument': + pass + # If a TextDocument is aligned to a TimeFrame then we copy time anchors + # but also targets and representatives, the latter because some alignments + # are not precise + elif source_attype == 'TextDocument' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + if 'time-offsets' in target.anchors: + # TODO: is this ever used? 
+ self.anchors['time-offsets'] = target.anchors['time-offsets'] + if 'targets' in target.properties: + self.anchors['targets'] = target.properties['targets'] + if 'representatives' in target.properties: + self.anchors['representatives'] = target.properties['representatives'] + #print('-', source_attype, self.anchors, self, target) + elif source_attype == 'TimeFrame' and target_attype == 'TextDocument': + pass + # Simply copy the time point + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + self.anchors['time-point'] = target.anchors['time-point'] + if debug: + print('+ ADDED SOURCE.ANCHORS ', self.anchors) + # For Token-TimeFrame alignments all we need are the start and end time points + elif source_attype == 'Token' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + #print(source_attype, self.anchors) + elif source_attype == 'TimeFrame' and target_attype == 'Token': + pass + # TODO: check whether some action is needed for the next options + elif source_attype == 'TextDocument' and target_attype == 'VideoDocument': + pass + elif source_attype == 'VideoDocument' and target_attype == 'TextDocument': + pass + elif source_attype == 'BoundingBox' and target_attype == 'TimePoint': + pass + elif source_attype =='TimePoint' and target_attype == 'BoundingBox': + pass + elif source_attype == 'BoundingBox' and target_attype in ('Token', 'Sentence', 'Paragraph'): + pass + elif source_attype in ('Token', 'Sentence', 'Paragraph') and target_attype == 'BoundingBox': + pass + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + pass + elif source_attype == 'TimePoint' and target_attype == 'TextDocument': + pass + else: + print('-', source_attype, target_attype) + #if debug: + # print('DEBUG', self.anchors) + + def _get_document(self): + """Return the document or annotation node that the 
annotation/document in + the node refers to via the document property. This could be a local property + or a metadata property if there is no such local property. Return None + if neither of those exist.""" + # try the local property + docid = self.properties.get('document') + if docid is not None: + # print('>>>', docid, self.graph.get_node(docid)) + return self.graph.get_node(docid) + # try the metadata property + if self.view is not None: + try: + metadata = self.view.metadata.contains[self.at_type] + docid = metadata['document'] + return self.graph.get_node(docid) + except KeyError: + return None + return None + + def summary(self): + """The default summary is just the identfier, this should typically be + overriden by sub classes.""" + return { 'id': self.identifier } + + def has_label(self): + """Only TimeFrameNodes can have labels so this returns False.""" + return False + + def pp(self, close=True): + print('-' * 80) + print(self) + print(f' document = {self.document}') + for prop in self.properties: + print(f' {prop} = {self.properties[prop]}') + print(' targets = ') + for target in self.targets: + print(' ', target) + print(' anchors = ') + for anchor in self.anchors: + print(f' {anchor} -> {self.anchors[anchor]}') + if close: + print('-' * 80) + + +class TimeFrameNode(Node): + + def __str__(self): + frame_type = ' ' + self.frame_type() if self.has_label() else '' + return ('' + % (self.identifier, self.start(), self.end(), frame_type)) + + def start(self): + return self.properties.get('start', -1) + + def end(self): + return self.properties.get('end', -1) + + def frame_type(self): + # TODO: rename this, uses old property since replaced by "label"" + # NOTE: this is still aloowing for the old property though + return self.properties.get('label') or self.properties.get('frameType') + + def has_label(self): + return self.frame_type() is not None + + def representatives(self) -> list: + """Return a list of the representative TimePoints.""" + # TODO: why 
could I not get this from the anchors? + rep_ids = self.properties.get('representatives', []) + reps = [self.graph.get_node(rep_id) for rep_id in rep_ids] + return reps + + def summary(self): + """The summary of a time frame just contains the identifier, start, end + and frame type.""" + return { 'id': self.identifier, + 'start': self.properties['start'], + 'end': self.properties['end'], + 'frameType': self.properties.get('frameType') } + + +class EntityNode(Node): + + def __init__(self, graph, view, annotation): + super().__init__(graph, view, annotation) + self.tokens = [] + self._paths = None + self._anchor = None + + def __str__(self): + try: + start = self.properties['start'] + end = self.properties['end'] + except KeyError: + start, end = self.anchors['text-offsets'] + return ("" + % (self.identifier, start, end, self.properties['text'])) + + def start_in_video(self): + #print('+++', self.document.properties) + try: + return self.document.anchors['time-point'] + except KeyError: + return -1 + #return self.anchor()['video-start'] + + def end_in_video(self): + return self.anchor().get('video-end') + + ''' + Commented this out because the type checking in the code coverage tests requires + the default vaue for the close parameter to be the same as on Node.pp(). 
+ + def pp(self, close=False): + super().pp(close=close) + try: + for i, p in enumerate(self.paths_to_docs()): + print(' %s' % ' '.join([str(n) for n in p[1:]])) + except ValueError: + print(' WARNING: error in path_to_docs in NamedEntityNode.pp()') + print('-' * 80) + ''' + + def summary(self): + """The summary for entities needs to include where in the video or image + the entity occurs, it is not enough to just give the text document.""" + # TODO: in the old days this used an anchor() method which was fragile + # TODO: revamping it now + return { + 'id': self.identifier, + 'group': self.properties['group'], + 'cat': self.properties['category'], + 'document': self.document.identifier, + # Entities in a TextDocument that is a full transcript without any + # alignments do not have a TimePoint + #'time-point': self.document.anchors.get('time-point'), + #'text-offsets': self.anchors.get('text-offsets'), + 'time-point': self.document.anchors.get('time-point', -1), + 'text-offsets': self.anchors.get('text-offsets', (-1 ,-1)), + #'document': self._get_document_plus_span(), + #'video-start': anchor.get('video-start'), + #'video-end': anchor.get('video-end'), + #'coordinates': self._coordinates_as_string(anchor) + } + + def anchor(self) -> dict: + """The anchor is the position in the video that the entity is linked to. + This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: deal with the case where the primary document is not a video + self.paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in paths: + # print('... 
[') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + return {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + return {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + else: + return {} + + def anchor2(self): + """The anchor is the position in the video that the entity is linked to. + This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: with this version you get an error that the paths variable does + # not exist yet, must get a clearer picture on how to build a graph + # where nodes have paths to anchors + # TODO: deal with the case where the primary document is not a video + if self._anchor is None: + self._paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in self._paths: + # print('... [') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + self._anchor = {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + self._anchor = {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + return self._anchor + + def find_boundingbox_or_timeframe(self): + return self.paths[-1][-2] + + @staticmethod + def _coordinates_as_string(anchor): + if 'coordinates' not in anchor: + return None + return ','.join(["%s:%s" % (pair[0], pair[1]) + for pair in anchor['coordinates']]) + + +class Nodes(object): + + """Factory class for Node creation. 
Use Node for creation unless a special + class was registered for the kind of annotation we have.""" + + node_classes = { config.NAMED_ENTITY: EntityNode, + config.TIME_FRAME: TimeFrameNode } + + @classmethod + def new(cls, graph, view, annotation): + node_class = cls.node_classes.get(annotation.at_type.shortname, Node) + return node_class(graph, view, annotation) + diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py new file mode 100644 index 00000000..28339fad --- /dev/null +++ b/mmif/utils/summarizer/summary.py @@ -0,0 +1,657 @@ +""" + +Main classes for the summarizer. + +""" + +# TODO: +# - For the time unit we should really update get_start(), get_end() and other methods. + + +import json +import logging +import os +import pathlib +from collections import defaultdict + +from mmif.serialize import Mmif +from mmif.utils.summarizer import config +from mmif.utils.summarizer.graph import Graph +from mmif.utils.summarizer.utils import CharacterList +from mmif.utils.summarizer.utils import get_transcript_view, get_captions_view +from mmif.utils.summarizer.utils import timestamp +from mmif.vocabulary import DocumentTypes + +logger = logging.getLogger(__name__) + + +class SummaryException(Exception): + pass + + +class Summary(object): + + """Implements the summary of a MMIF file. 
+ + :var fname: name of the input mmif file + :var mmif: instance of mmif.serialize.Mmif + :var graph: instance of graph.Graph + :var documents: instance of Documents + :var views: instance of Views + :var transcript: instance of Transcript + :var timeframes: instance of TimeFrames + :var entities: instance of Entities + :var captions: instance of Captions + + """ + + def __init__(self, mmif_file): + self.fname = mmif_file + #self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) + self.mmif = Mmif(pathlib.Path(mmif_file).read_text()) + self.warnings: list[str] = [] + self.graph = Graph(self.mmif) + self.mmif_version = self.mmif.metadata['mmif'] + self.documents = Documents(self) + self.annotations = Annotations(self) + self.document = Document(self) + self.views = Views(self) + self.timeframes = TimeFrames(self) + self.timeframe_stats = TimeFrameStats(self) + self.transcript = Transcript(self) + self.captions = Captions(self) + self.entities = Entities(self) + self.validate() + self.print_warnings() + + def add_warning(self, warning: str): + self.warnings.append(warning) + + def validate(self): + """Minimal validation of the input. 
Mostly a placeholder
self.data = defaultdict(list) + # summary.graph.get_nodes(config.ANNOTATION, view_id=view.id) + for anno in summary.graph.get_nodes(config.ANNOTATION): + self.data[anno.view.id].append(anno.properties) + + def get(self, item): + return self.data.get(item, []) + + def get_all_annotations(self): + annotations = [] + for annos in self.data.values(): + annotations.extend(annos) + return annotations + + +class Document(object): + + """Collects some document-level information, including MMIF version, size of + the MMIF file and some information from the SWT document annotation.""" + + def __init__(self, summary): + self.data = { + 'mmif_version': summary.mmif_version, + 'size': os.path.getsize(summary.fname) } + annotations = summary.annotations.get_all_annotations() + if annotations: + # TODO: this if fragile because it assumes that the annotation we want + # (which is the one from SWT) is always the first + doc_level_annotation = annotations[0] + if 'fps' in doc_level_annotation: + self.data['fps'] = doc_level_annotation['fps'] + if 'frameCount' in doc_level_annotation: + self.data['frames'] = doc_level_annotation['frameCount'] + if 'duration' in doc_level_annotation: + duration = doc_level_annotation['duration'] + # both in milliseconds and as a timestamp + self.data['duration_ms'] = duration + self.data['duration_ts'] = timestamp(duration) + + +class Views(object): + + """Contains a list of view summaries, which are dictionaries with just + the id, app and timestamp properties.""" + + def __init__(self, summary): + self.summary = summary + self.data = [self.get_view_summary(view) for view in summary.mmif.views] + + def __getitem__(self, i): + return self.data[i] + + def __len__(self): + return len(self.data) + + #@staticmethod + def get_view_summary(self, view): + annotation_types = defaultdict(int) + for annotation in view.annotations: + annotation_types[annotation.at_type.shortname] += 1 + basic_info = { + 'id': view.id, + 'app': view.metadata.app, + 'timestamp': 
view.metadata.timestamp, + 'contains': [str(k) for k in view.metadata.contains.keys()], + 'annotation_count': len(view.annotations), + 'annotation_types': dict(annotation_types), + 'parameters': view.metadata.parameters, + 'appConfiguration': view.metadata.appConfiguration } + if view.metadata.warnings: + basic_info['warnings'] = view.metadata.warnings + if view.metadata.error: + basic_info['error'] = view.metadata.error + return basic_info + + def pp(self): + print('\nViews -> ') + for v in self.data: + print(' %s' % v['app']) + + +class Transcript(object): + + """The transcript contains the string value from the first text document in the + last ASR view. It issues a warning if there is more than one text document in + the view.""" + + def __init__(self, summary): + self.summary = summary + self.data = [] + view = get_transcript_view(summary.mmif.views) + if view is not None: + documents = view.get_documents() + if len(documents) > 1: + summary.add_warning(f'More than one TextDocument in ASR view {view.id}') + t_nodes = summary.graph.get_nodes(config.TOKEN, view_id=view.id) + s_nodes = summary.graph.get_nodes(config.SENTENCE, view_id=view.id) + if not t_nodes: + return + if s_nodes: + # Whisper has Sentence nodes + sentences = self.collect_targets(s_nodes) + sentence_ids = [n.identifier for n in s_nodes] + else: + # But Kaldi does not + sentences = self.create_sentences(t_nodes) + sentence_ids = [None] * len(sentences) + # initialize the transcripts with all blanks, most blanks will be + # overwrite with characters from the tokens + transcript = CharacterList(self.transcript_size(sentences)) + for s_id, s in zip(sentence_ids, sentences): + transcript_element = TranscriptElement(s_id, s, transcript) + self.data.append(transcript_element.as_json()) + + def __str__(self): + return str(self.data) + + @staticmethod + def transcript_size(sentences): + try: + return sentences[-1][-1].properties['end'] + except IndexError: + return 0 + + def collect_targets(self, 
s_nodes): + """For each node (in this context a sentence node), collect all target nodes + (which are tokens) and return them as a list of lists, with one list for each + node.""" + targets = [] + for node in s_nodes: + node_target_ids = node.properties['targets'] + node_targets = [self.summary.graph.get_node(stid) for stid in node_target_ids] + targets.append(node_targets) + return targets + + def create_sentences(self, t_nodes, sentence_size=12): + """If there is no sentence structure then we create it just by chopping th + input into slices of some pre-determined length.""" + # TODO: perhaps the size paramater should be set in the config file or via a + # command line option. + return [t_nodes[i:i + sentence_size] + for i in range(0, len(t_nodes), sentence_size)] + + +class TranscriptElement: + + """Utility class to handle data associated with an element from a transcript, + which is created from a sentence which is a list of Token Nodes. Initialization + has the side effect of populating the full transcript which is an instance of + CharacterList and which is also accessed here.""" + + def __init__(self, identifier: str, sentence: list, transcript: CharacterList): + for t in sentence: + # this adds the current token to the transcript + start = t.properties['start'] + end = t.properties['end'] + word = t.properties['word'] + transcript.set_chars(word, start, end) + self.id = identifier + self.start = sentence[0].anchors['time-offsets'][0] + self.end = sentence[-1].anchors['time-offsets'][1] + self.start_offset = sentence[0].properties['start'] + self.end_offset = sentence[-1].properties['end'] + self.text = transcript.getvalue(self.start_offset, self.end_offset) + + def __str__(self): + text = self.text if len(self.text) <= 50 else self.text[:50] + '...' 
+ return f'' + + def as_json(self): + json_obj = { + "start-time": self.start, + "end-time": self.end, + "text": self.text } + if self.id is not None: + json_obj["id"] = self.id + return json_obj + + +class Nodes(object): + + """ + Abstract class to store instances of subclasses of graph.Node. The + initialization methods of subclasses of Nodes can guard what nodes will + be allowed in, for example, as of July 2022 the TimeFrames class only + allowed time frames that had a frame type (thereby blocking the many + timeframes from Kaldi). + + :var summary: an instance of Summary + :var graph: an instance of graph.Graph, taken from the summary + :var nodes: list of instances of subclasses of graph.Node + + """ + + def __init__(self, summary): + self.summary = summary + self.graph = summary.graph + self.nodes = [] + + def __getitem__(self, i): + return self.nodes[i] + + def __len__(self): + return len(self.nodes) + + def add(self, node): + self.nodes.append(node) + + def get_nodes(self, **props): + """Return all the nodes that match the given properties.""" + def prop_check(p, v, props_given): + return v == props_given.get(p) if p in props_given else False + return [n for n in self + if all([prop_check(p, v, n.annotation.properties) + for p, v in props.items()])] + + +class TimeFrames(Nodes): + + """For now, we take only the TimeFrames that have a frame type, which rules out + all the frames we got from Kaldi.""" + + def __init__(self, summary): + super().__init__(summary) + # a dictionary mapping app names to lists of timeframe summaries + self.data = defaultdict(list) + for tf_node in self.graph.get_nodes(config.TIME_FRAME): + if tf_node.has_label(): + self.add(tf_node) + self._collect_timeframe_summaries() + self._sort_timeframe_summaries() + + def _collect_timeframe_summaries(self): + for tf in self.nodes: + label = tf.frame_type() + try: + start, end = tf.anchors['time-offsets'] + except KeyError: + # TODO: + # - this defies the notion of using the anchors for 
this, but + # maybe in this case we should go straight to the start/end + # - this code below also raises an error if there are no start + # and end properties + start = tf.properties['start'] + end = tf.properties['end'] + representatives = tf.representatives() + rep_tps = [rep.properties['timePoint'] for rep in representatives] + score = tf.properties.get('classification', {}).get(label) + app = tf.view.metadata.app + self.data[app].append( + { 'identifier': tf.identifier, 'label': label, 'score': score, + 'start-time': start, 'end-time': end, 'representatives': rep_tps }) + + def _sort_timeframe_summaries(self): + """Sort the data on their start time, do this for all apps.""" + for app in self.data: + sort_function = lambda x: x['start-time'] + self.data[app] = list(sorted(self.data[app], key=sort_function)) + + def as_json(self): + return self.data + + def pp(self): + print('\nTimeframes -> ') + for tf in self.nodes: + summary = tf.summary() + print(' %s:%s %s' % (summary['start'], summary['end'], + summary['frameType'])) + + +class TimeFrameStats(object): + + def __init__(self, summary): + # a dictionary mapping app names to frameType->duration dictionaries, + # where the duration is cumulative over all instances + self.timeframes = summary.timeframes + self.data = {} + self._collect_durations() + self._collect_other_morsels() + + def _collect_durations(self): + timeframes = self.timeframes.data + for app in timeframes: + self.data[app] = {} + for tf in timeframes[app]: + label = tf.get('label') + if label not in self.data[app]: + self.data[app][label] = {'count': 0, 'duration': 0} + self.data[app][label]['count'] += 1 + duration = tf['end-time'] - tf['start-time'] + if label is not None: + # TODO: these gave weird values for duration + #print('---',app, label, duration) + self.data[app][label]['duration'] += duration + duration = self.data[app][label]['duration'] + count = self.data[app][label]['count'] + self.data[app][label]['average'] = duration // count + 
+ def _collect_other_morsels(self): + # First we want everything grouped by app and label + timeframes = self.timeframes.data + grouped_timeframes = defaultdict(lambda: defaultdict(list)) + for app in timeframes: + for tf in timeframes[app]: + label = tf.get('label') + grouped_timeframes[app][label].append(tf) + # The we pick the morsels for each label + for app in grouped_timeframes: + for label in grouped_timeframes[app]: + tfs = grouped_timeframes[app][label] + sort_on_start = lambda tf: tf['start-time'] + sort_on_length = lambda tf: tf['end-time'] - tf['start-time'] + first_tf = list(sorted(tfs, key=sort_on_start))[0] + longest_tf = list(sorted(tfs, key=sort_on_length, reverse=True))[0] + self.data[app][label]['first'] = first_tf['start-time'] + self.data[app][label]['longest'] = longest_tf['start-time'] + + +class Entities(Nodes): + + """ + This class collects instances of graph.EntityNode. + + :var nodes_idx: maps entity texts to lists of instances of graph.EntityNode + :var bins: an instance of Bins + + """ + + def __init__(self, summary): + super().__init__(summary) + self.nodes_idx = {} + self.bins = None + for ent in self.graph.get_nodes(config.NAMED_ENTITY): + self.add(ent) + self._create_node_index() + self._group() + + def __str__(self): + return f'' + + def _create_node_index(self): + """Put all the entities from self.nodes in self.node_idx. 
This first puts + the nodes into the dictionary indexed on text string and then sorts the + list of nodes for each string on video position.""" + for ent in self: + self.nodes_idx.setdefault(ent.properties['text'], []).append(ent) + for text, entities in self.nodes_idx.items(): + self.nodes_idx[text] = sorted(entities, + key=(lambda e: e.start_in_video())) + + def _group(self): + """Groups all the nodes on the text and sorts them on position in the video, + for the latter it will also create bins of entities that occur close to each + other in the text.""" + # create the bins, governed by the summary's granularity + self.bins = Bins(self.summary) + for text, entities in self.nodes_idx.items(): + self.bins.current_bin = None + for entity in entities: + self.bins.add_entity(text, entity) + self.bins.mark_entities() + + def _add_tags(self, tags): + for tag in tags: + tag_doc = tag.properties['document'] + tag_p1 = tag.properties['start'] + tag_p2 = tag.properties['end'] + entities = self.nodes_idx.get(tag.properties['text'], []) + for entity in entities: + props = entity.properties + doc = props['document'] + p1 = props['start'] + p2 = props['end'] + if tag_doc == doc and tag_p1 == p1 and tag_p2 == p2: + entity.properties['tag'] = tag.properties['tagName'] + + def as_json(self): + json_obj = [] + for text in self.nodes_idx: + entity = {"text": text, "instances": []} + json_obj.append(entity) + for e in self.nodes_idx[text]: + entity["instances"].append(e.summary()) # e.summary(), E_PROPS) + return json_obj + + def pp(self): + print('\nEntities -> ') + for e in self.nodes_idx: + print(' %s' % e) + for d in self.nodes_idx[e]: + props = ["%s=%s" % (p, v) for p, v in d.summary().items()] + print(' %s' % ' '.join(props)) + + def print_groups(self): + for key in sorted(self.nodes_idx): + print(key) + for e in self.nodes_idx[key]: + print(' ', e, e.start_in_video()) + + +class Captions(Nodes): + + def __init__(self, summary): + super().__init__(summary) + self.captions = [] 
+ view = get_captions_view(summary.mmif.views) + if view is not None: + for doc in self.graph.get_nodes(config.TEXT_DOCUMENT, view_id=view.id): + text = doc.properties['text']['@value'].split('[/INST]')[-1] + logger.debug('>>> DOC %s', doc) + logger.debug('>>> PROPS %s', list(doc.properties.keys())) + logger.debug('>>> TEXT %s', text.replace("\n", "")[:100]) + logger.debug('>>> ANCHORS %s', doc.anchors) + if 'time-offsets' in doc.anchors and 'representatives' in doc.anchors: + # For older LLava-style captions + # http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97 + # NOTE: probably obsolete, at least the link above is dead + tp_id = doc.anchors["representatives"][0] + tp = summary.graph.get_node(tp_id) + if tp is not None: + self.captions.append( + { 'identifier': doc.identifier, + 'time-point': tp.properties['timePoint'], + 'text': text }) + if 'time-point' in doc.anchors: + # For newer SmolVLM-style captions + # http://apps.clams.ai/smolvlm2-captioner + self.captions.append( + { 'identifier': doc.identifier, + 'time-point': doc.anchors['time-point'], + 'text': text }) + + def as_json(self): + return self.captions + #return [(ident, p1, p2, text) for ident, p1, p2, text in self.captions] + + +class Bins(object): + + def __init__(self, summary): + self.summary = summary + self.bins = {} + self.current_bin = None + self.current_text = None + + def __str__(self): + return f'' + + def __len__(self): + return len(self.bins) + + def add_entity(self, text, entity): + """Add an entity instance to the appropriate bin.""" + if self.current_bin is None: + # Add the first instance of a new entity (as defined by the text), + # since it is the first a new bin will be created. + self.current_text = text + self.current_bin = Bin(entity) + self.bins[text] = [self.current_bin] + else: + # For following entities with the same text, a new bin may be + # created depending on the positions and the granularity. 
+ p1 = self.current_bin[-1].start_in_video() + p2 = entity.start_in_video() + # p3 = entity.end_in_video() + if p2 - p1 < config.GRANULARITY: + # TODO: should add p3 here + self.current_bin.add(entity) + else: + self.current_bin = Bin(entity) + self.bins[self.current_text].append(self.current_bin) + + def mark_entities(self): + """Marks all entities with the bin that they occur in. This is done to export + the grouping done with the bins to the entities and this way the bins never need + to be touched again.""" + # TODO: maybe use the bins when we create the output + for entity_bins in self.bins.values(): + for i, e_bin in enumerate(entity_bins): + for entity in e_bin: + entity.properties['group'] = i + + def print_bins(self): + for text in self.bins: + print(text) + text_bins = self.bins[text] + for i, text_bin in enumerate(text_bins): + text_bin.print_nodes(i) + print() + + +class Bin(object): + + def __init__(self, node): + # TODO: we are not using these yet, but a bin should have a begin and + # end in the video which should be derived from the start and end of + # entities in the video. The way we put things in bins now is a bit + # fragile since it depends on the start or end of the last element. + self.start = 0 + self.end = 0 + self.nodes = [node] + + def __getitem__(self, i): + return self.nodes[i] + + def add(self, node): + self.nodes.append(node) + + def print_nodes(self, i): + for node in self.nodes: + print(' ', i, node) diff --git a/mmif/utils/summarizer/utils.py b/mmif/utils/summarizer/utils.py new file mode 100644 index 00000000..897a3830 --- /dev/null +++ b/mmif/utils/summarizer/utils.py @@ -0,0 +1,268 @@ +""" + +Utility methods for the summarizer. 
+ +""" + +import io +from pathlib import Path +from xml.sax.saxutils import quoteattr, escape +from collections import UserList + +from mmif import View, Annotation +from mmif.utils.summarizer.config import KALDI, WHISPER, CAPTIONER, SEGMENTER +from mmif.utils.summarizer.config import TOKEN, ALIGNMENT, TIME_FRAME + + +def compose_id(view_id, anno_id): + """Composes the view identifier with the annotation identifier.""" + return anno_id if ':' in anno_id else view_id + ':' + anno_id + + +def type_name(annotation): + """Return the short name of the type.""" + return annotation.at_type.split('/')[-1] + + +def get_transcript_view(views): + """Return the last Whisper or Kaldi view that is not a warnings view.""" + # TODO: this now has a simplified idea of how to find a view, should at least + # move towards doing some regular expression matching on the WHISPER config + # setting. The same holds for other functions to get views. + for view in reversed(views): + if view.metadata.app in KALDI + WHISPER: + if view.metadata.warnings: + continue + return view + return None + + +def get_captions_view(views): + """Return the last view created by the captioner.""" + for view in reversed(views): + if view.metadata.app in CAPTIONER: + if view.metadata.warnings: + continue + return view + return None + + +def get_last_segmenter_view(views): + for view in reversed(views): + # print(f'>>> {view.metadata.app}') + if view.metadata.app.startswith(SEGMENTER): + return view + return None + + +def get_aligned_tokens(view): + """Get a list of tokens from an ASR view where for each token we add a timeframe + properties which has the start and end points of the aligned timeframe.""" + idx = AnnotationsIndex(view) + for alignment in idx.get_annotations(ALIGNMENT).values(): + token = idx[TOKEN].get(alignment.properties['target']) + frame = idx[TIME_FRAME].get(alignment.properties['source']) + if token and frame: + # add a timeframe to the token, we can do this now that we do not + # freeze MMIF 
annotations anymore + token.properties['timeframe'] = (frame.properties['start'], + frame.properties['end']) + return idx.tokens + + +def timestamp(milliseconds: int, format='hh:mm:ss'): + # sometimes the milliseconds are not a usable float + if milliseconds in (None, -1): + return 'nil' + milliseconds = int(milliseconds) + seconds = milliseconds // 1000 + minutes = seconds // 60 + hours = minutes // 60 + ms = milliseconds % 1000 + s = seconds % 60 + m = minutes % 60 + if format == 'hh:mm:ss:mmm': + return f'{hours}:{m:02d}:{s:02d}.{ms:03d}' + elif format == 'hh:mm:ss': + return f'{hours}:{m:02d}:{s:02d}' + elif format == 'mm:ss': + return f'{m:02d}:{s:02d}' + elif format == 'mm:ss:mmm': + return f'{m:02d}:{s:02d}.{ms:03d}' + else: + return f'{hours}:{m:02d}:{s:02d}.{ms:03d}' + + + +class AnnotationsIndex: + + """Creates an index on the annotations list for a view, where each annotation type + is indexed on its identifier. Tokens are special and get their own list.""" + + def __init__(self, view): + self.view = view + self.idx = {} + self.tokens = [] + for annotation in view.annotations: + shortname = annotation.at_type.shortname + if shortname == TOKEN: + self.tokens.append(annotation) + self.idx.setdefault(annotation.at_type.shortname, {}) + self.idx[shortname][annotation.properties.id] = annotation + + def __str__(self): + return f'' + + def __getitem__(self, item): + return self.idx[item] + + def get_annotations(self, at_type): + return self.idx.get(at_type, {}) + + +class CharacterList(UserList): + + """Auxiliary datastructure to help print a list of tokens. 
It allows you to + back-engineer a sentence from the text and character offsets of the tokens.""" + + def __init__(self, n: int, char=' '): + self.size = n + self.char = char + self.data = n * [char] + + def __str__(self): + return f'' + + def __len__(self): + return self.size + + def __setitem__(self, key, value): + try: + self.data[key] = value + except IndexError: + for i in range(len(self), key + 1): + self.data.append(self.char) + self.data[key] = value + + def set_chars(self, text: str, start: int, end: int): + self.data[start:end] = text + + def getvalue(self, start: int, end: int): + return ''.join(self.data[start:end]) + + +def xml_tag(tag, subtag, objs, props, indent=' ') -> str: + """Return an XML string for a list of instances of subtag, grouped under tag.""" + s = io.StringIO() + s.write(f'{indent}<{tag}>\n') + for obj in objs: + s.write(xml_empty_tag(subtag, indent + ' ', obj, props)) + s.write(f'{indent}\n') + return s.getvalue() + + +def xml_empty_tag(tag_name: str, indent: str, obj: dict, props: tuple) -> str: + """Return an XML tag to an instance of io.StringIO(). Only properties from obj + that are in the props tuple are printed.""" + pairs = [] + for prop in props: + if prop in obj: + if obj[prop] is not None: + #pairs.append("%s=%s" % (prop, xml_attribute(obj[prop]))) + pairs.append(f'{prop}={xml_attribute(obj[prop])}') + attrs = ' '.join(pairs) + return f'{indent}<{tag_name} {attrs}/>\n' + + +def write_tag(s, tagname: str, indent: str, obj: dict, props: tuple): + """Write an XML tag to an instance of io.StringIO(). 
Only properties from obj + that are in the props tuple are printed.""" + pairs = [] + for prop in props: + if prop in obj: + if obj[prop] is not None: + pairs.append("%s=%s" % (prop, xml_attribute(obj[prop]))) + s.write('%s<%s %s/>\n' + % (indent, tagname, ' '.join(pairs))) + + +def xml_attribute(attr): + """Return attr as an XML attribute.""" + return quoteattr(str(attr)) + + +def xml_data(text): + """Return text as XML data.""" + return escape(str(text)) + + +def normalize_id(doc_ids: list, view: View, annotation: Annotation): + """Change identifiers to include the view identifier if it wasn't included, + do nothing otherwise. This applies to the Annotation id, target, source, + document, targets and representatives properties. Note that timePoint is + not included because the value is an integer and not an identifier.""" + # TODO: this seems somewhat fragile + # TODO: spell out what doc_ids is for (to exclude source documents I think) + debug = False + attype = annotation.at_type.shortname + props = annotation.properties + if ':' not in annotation.id and view is not None: + if annotation.id not in doc_ids: + newid = f'{view.id}:{annotation.id}' + annotation.properties['id'] = newid + if 'document' in props: + doc_id = props['document'] + if ':' not in doc_id and view is not None: + if doc_id not in doc_ids: + props['document'] = f'{view.id}:{doc_id}' + if 'targets' in props: + new_targets = [] + for target in props['targets']: + if ':' not in target and view is not None: + if target not in doc_ids: + new_targets.append(f'{view.id}:{target}') + else: + new_targets.append(target) + props['targets'] = new_targets + if 'representatives' in props: + new_representatives = [] + for rep in props['representatives']: + if ':' not in rep and view is not None: + new_representatives.append(f'{view.id}:{rep}') + else: + new_representatives.append(rep) + props['representatives'] = new_representatives + if attype == 'Alignment': + if ':' not in props['source'] and view is not 
None: + if props['source'] not in doc_ids: + props['source'] = f'{view.id}:{props["source"]}' + if ':' not in props['target'] and view is not None: + if props['target'] not in doc_ids: + props['target'] = f'{view.id}:{props["target"]}' + if debug: + print('===', annotation) + + +def get_annotations_from_view(view, annotation_type): + """Return all annotations from a view that match the short name of the + annotation type.""" + # Note: there is method mmif.View.get_annotations() where you can give + # at_type as a parameter, but it requires a full match. + return [a for a in view.annotations + if a.at_type.shortname == annotation_type] + + +def find_matching_tokens(tokens, ne): + matching_tokens = [] + ne_start = ne.properties["start"] + ne_end = ne.properties["end"] + start_token = None + end_token = None + for token in tokens: + if token.properties['start'] == ne_start: + start_token = token + if token.properties['end'] == ne_end: + end_token = token + return start_token, end_token + + diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py index a1b9c59a..1ff6df40 100644 --- a/mmif/utils/video_document_helper.py +++ b/mmif/utils/video_document_helper.py @@ -1,5 +1,7 @@ +import contextvars import importlib import sys +from enum import Enum import math import warnings @@ -12,13 +14,22 @@ from mmif.utils.timeunit_helper import convert from mmif.vocabulary import DocumentTypes -for cv_dep in ('cv2', 'ffmpeg', 'PIL', 'wurlitzer'): +_CV_DEPS = ('cv2', 'PIL', 'wurlitzer') +_cv_import_warning = ( + 'Optional package "{}" is not found. ' + 'You might want to install Computer-Vision dependencies ' + 'by running `pip install mmif-python[cv]=={}`' +) + + +def _check_cv_dep(dep): + """Import a CV dependency, raising ImportError with a helpful message.""" try: - importlib.__import__(cv_dep) + return importlib.__import__(dep) except ImportError as e: - warnings.warn(f"Optional package \"{e.name}\" is not found. 
" - f"You might want to install Computer-Vision dependencies " - f"by running `pip install mmif-python[cv]=={mmif.__version__}`") + raise ImportError( + _cv_import_warning.format(e.name, mmif.__version__) + ) from e FPS_DOCPROP_KEY = 'fps' @@ -27,6 +38,36 @@ DURATIONUNIT_DOCPROP_KEY = 'durationTimeUnit' +class SamplingMode(Enum): + """Determines how timepoints are selected from a TimeFrame.""" + REPRESENTATIVES = "representatives" + SINGLE = "single" + ALL = "all" + + +SAMPLING_MODE_DESCRIPTIONS = { + SamplingMode.REPRESENTATIVES: ( + "uses all representative timepoints if present, " + "otherwise skips the TimeFrame." + ), + SamplingMode.SINGLE: ( + "uses the middle representative if present, otherwise " + "extracts a frame from the midpoint of the start/end " + "interval (midpoint is calculated by floor division " + "of the sum of start and end)." + ), + SamplingMode.ALL: ( + "uses all target timepoints if present, otherwise " + "extracts all frames from the time interval." + ), +} +SAMPLING_MODE_DEFAULT = SamplingMode.REPRESENTATIVES + + +_sampling_mode = contextvars.ContextVar( + 'sampling_mode', default=SamplingMode.REPRESENTATIVES) + + def capture(video_document: Document): """ Captures a video file using OpenCV and adds fps, frame count, and duration as properties to the document. 
@@ -34,7 +75,7 @@ def capture(video_document: Document): :param video_document: :py:class:`~mmif.serialize.annotation.Document` instance that holds a video document (``"@type": ".../VideoDocument/..."``) :return: `OpenCV VideoCapture `_ object """ - import cv2 # pytype: disable=import-error + cv2 = _check_cv_dep('cv2') if video_document is None or video_document.at_type != DocumentTypes.VideoDocument: raise ValueError(f'The document does not exist.') @@ -59,8 +100,8 @@ def get_framerate(video_document: Document) -> float: if video_document is None or video_document.at_type != DocumentTypes.VideoDocument: raise ValueError(f'The document does not exist.') - framerate_keys = (FPS_DOCPROP_KEY, - 'framerate', 'frameRate', 'frame_rate', 'frame-rate', + framerate_keys = (FPS_DOCPROP_KEY, + 'framerate', 'frameRate', 'frame_rate', 'frame-rate', 'framespersecond', 'framesPerSecond', 'frames_per_second', 'frames-per-second', 'framepersecond', 'framePerSecond', 'frame_per_second', 'frame-per-second') for k in framerate_keys: @@ -84,20 +125,23 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], :param record_ffmpeg_errors: if True, records and warns about FFmpeg stderr output during extraction :return: frames as a list of :py:class:`~numpy.ndarray` or :py:class:`~PIL.Image.Image` """ - import cv2 + cv2 = _check_cv_dep('cv2') + # deduplicate and sort frame numbers for extraction, then map back to original order + original_framenums = list(framenums) + unique_framenums = sorted(set(original_framenums)) if as_PIL: - from PIL import Image - frames = [] + Image = _check_cv_dep('PIL').Image + unique_frames = {} video = capture(video_document) cur_f = 0 tot_fcount = video_document.get_property(FRAMECOUNT_DOCPROP_KEY) # when the target frame is more than this frames away, fast-forward instead of reading frame by frame - # this is sanity-checked with a small number of video samples + # this is sanity-checked with a small number of video samples # 
(frame-by-frame ndarrays are compared with fast-forwarded ndarrays) - skip_threadhold = 1000 - framenumi = iter(framenums) # make sure that it's actually an iterator, in case a list is passed + skip_threadhold = 1000 + framenumi = iter(unique_framenums) next_target_f = next(framenumi, None) - from wurlitzer import pipes as cpipes + cpipes = _check_cv_dep('wurlitzer').pipes ffmpeg_errs = StringIO() with cpipes(stderr=ffmpeg_errs, stdout=sys.stdout): while True: @@ -114,18 +158,23 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], sec = convert(cur_f, 'f', 's', video_document.get_property(FPS_DOCPROP_KEY)) warnings.warn(f'Frame #{cur_f} ({sec}s) could not be read from the video {video_document.id} @ {video_document.location} .') else: - frames.append(Image.fromarray(frame[:, :, ::-1]) if as_PIL else frame) + unique_frames[cur_f] = Image.fromarray(frame[:, :, ::-1]) if as_PIL else frame next_target_f = next(framenumi, None) cur_f += 1 ffmpeg_err_str = ffmpeg_errs.getvalue() if ffmpeg_err_str and record_ffmpeg_errors: warnings.warn(f'FFmpeg output during extracting frames: {ffmpeg_err_str}') video.release() - return frames + # return frames in original input order, duplicating where needed + return [unique_frames[f] for f in original_framenums if f in unique_frames] def get_mid_framenum(mmif: Mmif, time_frame: Annotation) -> int: - warnings.warn('This function is deprecated. Use ``get_representative_framenums()`` instead.', DeprecationWarning, stacklevel=2) + """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) return _get_mid_framenum(mmif, time_frame) @@ -145,6 +194,9 @@ def _get_mid_framenum(mmif: Mmif, time_frame: Annotation) -> int: def extract_mid_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False): """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. 
+ Extracts the middle frame of a time interval annotation as a numpy ndarray. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance @@ -152,21 +204,25 @@ def extract_mid_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False): :param as_PIL: return :py:class:`~PIL.Image.Image` instead of :py:class:`~numpy.ndarray` :return: frame as a :py:class:`numpy.ndarray` or :py:class:`PIL.Image.Image` """ - warnings.warn('This function is deprecated. Use ``extract_representative_frames()`` instead.', DeprecationWarning, stacklevel=2) + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) vd = mmif[time_frame.get_property('document')] return extract_frames_as_images(vd, [get_mid_framenum(mmif, time_frame)], as_PIL=as_PIL)[0] def get_representative_framenums(mmif: Mmif, time_frame: Annotation) -> List[int]: """ - Calculates the representative frame numbers from an annotation. To pick the representative frames, it first looks - up the ``representatives`` property of the ``TimeFrame`` annotation. If it is not found, it will calculate the + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + + Calculates the representative frame numbers from an annotation. To pick the representative frames, it first looks + up the ``representatives`` property of the ``TimeFrame`` annotation. If it is not found, it will calculate the number of the middle frame. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` instance that holds a time interval annotation containing a `representatives` property (``"@type": ".../TimeFrame/..."``) :return: representative frame number as an integer """ + warnings.warn('This function is deprecated. 
Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) if 'representatives' not in time_frame.properties: return [_get_mid_framenum(mmif, time_frame)] timeunit = time_frame.get_property('timeUnit') @@ -185,9 +241,13 @@ def get_representative_framenums(mmif: Mmif, time_frame: Annotation) -> List[int def get_representative_framenum(mmif: Mmif, time_frame: Annotation) -> int: """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + A thin wrapper around :py:func:`get_representative_framenums` to return a single representative frame number. Always return the first frame number found. """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) try: return get_representative_framenums(mmif, time_frame)[0] except IndexError: @@ -196,6 +256,9 @@ def get_representative_framenum(mmif: Mmif, time_frame: Annotation) -> int: def extract_representative_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False, first_only: bool = True): """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + Extracts the representative frame of an annotation as a numpy ndarray or PIL Image. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance @@ -204,11 +267,197 @@ def extract_representative_frame(mmif: Mmif, time_frame: Annotation, as_PIL: boo :param first_only: return the first representative frame only :return: frame as a :py:class:`numpy.ndarray` or :py:class:`PIL.Image.Image` """ + warnings.warn('This function is deprecated. 
Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) video_document = mmif[time_frame.get_property('document')] rep_frame_num = [get_representative_framenum(mmif, time_frame)] if first_only else get_representative_framenums(mmif, time_frame) return extract_frames_as_images(video_document, rep_frame_num, as_PIL=as_PIL)[0] +def _tp_ids_to_framenums(mmif: Mmif, tp_ids: List[str]) -> List[int]: + """ + Converts a list of timepoint annotation IDs to frame numbers. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param tp_ids: list of timepoint annotation IDs + :return: list of frame numbers + """ + return [ + int(convert_timepoint(mmif, mmif[tp_id], 'f')) + for tp_id in tp_ids + ] + + +def _resolve_video_document(mmif: Mmif, time_frame: Annotation): + """ + Resolves the video document associated with a TimeFrame. + Checks the TimeFrame's own ``document`` property first, + then falls back to the ``document`` property of the first + target timepoint. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: :py:class:`~mmif.serialize.annotation.Document` + """ + if 'document' in time_frame.properties: + return mmif[time_frame.get_property('document')] + if 'targets' in time_frame.properties: + targets = time_frame.get_property('targets') + if targets: + tp = mmif[targets[0]] + return mmif[tp.get_property('document')] + raise ValueError( + f'Cannot resolve video document for TimeFrame ' + f'{time_frame.id}.') + + +def _timeframe_to_frame_range( + mmif: Mmif, time_frame: Annotation +) -> Tuple[int, int]: + """ + Converts a TimeFrame's start/end to frame numbers. 
+ + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame with ``start``, ``end``, + ``timeUnit``, and ``document`` properties + :return: tuple of (start_frame, end_frame) + """ + start, end = convert_timeframe(mmif, time_frame, 'f') + return int(start), int(end) + + +def _sample_all(mmif: Mmif, time_frame: Annotation) -> List[int]: + """ + Samples all frame numbers from a TimeFrame. Uses all + ``targets`` if present, otherwise generates every frame + in the start/end interval. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list of frame numbers + """ + if 'targets' in time_frame.properties: + return _tp_ids_to_framenums( + mmif, time_frame.get_property('targets')) + start, end = _timeframe_to_frame_range(mmif, time_frame) + return sample_frames(start, end) + + +def _sample_representatives( + mmif: Mmif, time_frame: Annotation +) -> List[int]: + """ + Samples frame numbers from a TimeFrame's representatives. + Returns an empty list if ``representatives`` is not present + (skips the TimeFrame). + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list of frame numbers (empty if no representatives) + """ + if 'representatives' in time_frame.properties: + reps = time_frame.get_property('representatives') + if reps: + return _tp_ids_to_framenums(mmif, reps) + return [] + + +def _sample_single(mmif: Mmif, time_frame: Annotation) -> List[int]: + """ + Samples a single frame number from a TimeFrame. Uses the + middle representative if ``representatives`` is present, + otherwise computes the midpoint of the start/end interval + via floor division. 
+ + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list containing a single frame number + """ + if 'representatives' in time_frame.properties: + reps = time_frame.get_property('representatives') + if reps: + mid = reps[len(reps) // 2] + return _tp_ids_to_framenums(mmif, [mid]) + start, end = _timeframe_to_frame_range(mmif, time_frame) + return [(start + end) // 2] + + +def extract_target_frames(mmif: Mmif, annotation: Annotation, min_timepoints: int = 0, max_timepoints: int = sys.maxsize, fraction: float = 1.0, as_PIL: bool = False): + """ + Extracts frames corresponding to the timepoints listed in the ``targets`` property of an annotation. + Selection of timepoints is based on minimum, maximum, and fraction of targets to include. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param annotation: :py:class:`~mmif.serialize.annotation.Annotation` instance containing a ``targets`` property + :param min_timepoints: minimum number of timepoints to include + :param max_timepoints: maximum number of timepoints to include + :param fraction: fraction of targets to include (ideally) + :param as_PIL: return :py:class:`~PIL.Image.Image` instead of :py:class:`~numpy.ndarray` + :return: a tuple containing (list of frames, list of selected target IDs) + """ + if 'targets' not in annotation.properties: + raise ValueError(f'Annotation {annotation.id} does not have a "targets" property.') + + targets = annotation.get_property('targets') + num_targets = len(targets) + if num_targets == 0: + return [], [] + + ideal_count = int(num_targets * fraction) + count = max(min_timepoints, ideal_count) + count = min(max_timepoints, count) + count = min(num_targets, count) + + if count == 1: + indices = [num_targets // 2] + else: + indices = [int(i * (num_targets - 1) / (count - 1)) for i in range(count)] + + selected_target_ids = [targets[i] for i in 
indices] + frame_nums = _tp_ids_to_framenums(mmif, selected_target_ids) + video_doc = _resolve_video_document(mmif, annotation) + images = extract_frames_as_images(video_doc, frame_nums, as_PIL=as_PIL) + return images, selected_target_ids + + +def extract_frames_by_mode( + mmif: Mmif, + time_frame: Annotation, + mode: Union[SamplingMode, None] = None, + as_PIL: bool = False +) -> List: + """ + Extracts frames from a TimeFrame annotation based on a + sampling mode. If ``mode`` is not specified, uses the + context-level default (set via + :py:data:`_sampling_mode` context variable). + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: TimeFrame annotation to sample from + :param mode: :py:class:`SamplingMode`, or None to use + the context default + :param as_PIL: return PIL Images instead of ndarrays + :return: list of frames (may be empty for + ``REPRESENTATIVES`` mode when no representatives exist) + """ + if mode is None: + mode = _sampling_mode.get() + if mode == SamplingMode.ALL: + frame_nums = _sample_all(mmif, time_frame) + elif mode == SamplingMode.REPRESENTATIVES: + frame_nums = _sample_representatives(mmif, time_frame) + else: + frame_nums = _sample_single(mmif, time_frame) + if not frame_nums: + return [] + video_doc = _resolve_video_document(mmif, time_frame) + return extract_frames_as_images(video_doc, frame_nums, as_PIL=as_PIL) + + def sample_frames(start_frame: int, end_frame: int, sample_rate: float = 1) -> List[int]: """ Helper function to sample frames from a time interval. 
diff --git a/mmif/utils/workflow_helper.py b/mmif/utils/workflow_helper.py index 7980eb89..bdde664a 100644 --- a/mmif/utils/workflow_helper.py +++ b/mmif/utils/workflow_helper.py @@ -1,13 +1,16 @@ import datetime import hashlib -from collections import Counter, defaultdict -from pathlib import Path -from typing import List, Any, Tuple, Optional, Union import itertools -from mmif import Mmif +from collections import Counter +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Tuple, Union, overload + +from pydantic import BaseModel, ConfigDict, Field +from mmif.serialize.mmif import Mmif, ViewsList -def group_views_by_app(views: List[Any]) -> List[List[Any]]: + +def group_views_by_app(views: ViewsList) -> List[List[Any]]: """ Groups views into app executions based on app and timestamp. @@ -73,9 +76,43 @@ def generate_param_hash(params: dict) -> str: return hashlib.md5(param_string.encode('utf-8')).hexdigest() -def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: +def _read_mmif_from_path(mmif_input: Union[str, Path, Mmif]) -> Mmif: """ - Generate a workflow identifier string from a MMIF file. + Helper function to get a Mmif object from various input types. + + :param mmif_input: Either a file path (str or Path) or an existing Mmif object + :return: Mmif object + :raises ValueError: If input is not a valid type + """ + if isinstance(mmif_input, Mmif): + return mmif_input + elif isinstance(mmif_input, (str, Path)): + with open(mmif_input, "r") as f: + mmif_str = f.read() + return Mmif(mmif_str) + else: + raise ValueError( + "MMIF input must be a string path, a Path object, or a Mmif object." + ) + + +@overload +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: Literal[True] + ) -> Tuple[str, List[dict]]: ... + + +@overload +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: Literal[False] = False + ) -> str: ... 
+ + +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: bool = False + ) -> Union[str, Tuple[str, List[dict]]]: + """ + Generate a workflow identifier string from a MMIF file or object. The identifier follows the storage directory structure format: app_name/version/param_hash/app_name2/version2/param_hash2/... @@ -83,25 +120,18 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: Uses view.metadata.parameters (raw user-passed values) for hashing to ensure reproducibility. Views with errors or warnings are excluded from the identifier; empty views are included. - """ - if not isinstance(mmif_file, (str, Path)): - raise ValueError( - "MMIF file path must be a string or a Path object." - ) - with open(mmif_file, "r") as f: - mmif_str = f.read() - - data = Mmif(mmif_str) + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object + :param return_param_dicts: If True, also return the parameter dictionaries + :return: Workflow identifier string, or tuple of (identifier, param_dicts) if return_param_dicts=True + """ + data = _read_mmif_from_path(mmif_input) segments = [] - # First prefix is source information, sorted by document type - sources = Counter(doc.at_type.shortname for doc in data.documents) - segments.append('-'.join([f'{k}-{sources[k]}' for k in sorted(sources.keys())])) - # Group views into runs grouped_apps = group_views_by_app(data.views) + param_dicts = [] for app_execution in grouped_apps: # Use the first view in the run as representative for metadata first_view = app_execution[0] @@ -120,6 +150,7 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: param_dict = first_view.metadata.parameters except (KeyError, AttributeError): param_dict = {} + param_dicts.append(param_dict) param_hash = generate_param_hash(param_dict) @@ -128,10 +159,58 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: version_str = app_version if app_version else 
"unversioned" segments.append(f"{name_str}/{version_str}/{param_hash}") + if return_param_dicts: + return '/'.join(segments), param_dicts return '/'.join(segments) -def _get_profile_data(view) -> dict: +## single MMIF summarization + +class SingleMmifStats(BaseModel): + """ + Aggregated statistics for a single MMIF file. + """ + model_config = ConfigDict(populate_by_name=True) + + app_count: int = Field(..., alias="appCount", description="Total number of app executions identified.") + error_views: List[str] = Field(default_factory=list, alias="errorViews", description="List of view IDs that contain errors.") + warning_views: List[str] = Field(default_factory=list, alias="warningViews", description="List of view IDs that contain warnings.") + empty_views: List[str] = Field(default_factory=list, alias="emptyViews", description="List of view IDs that contain no annotations.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the file.") + +class AppProfiling(BaseModel): + """ + Profiling data for a single app execution. + """ + model_config = ConfigDict(populate_by_name=True) + + running_time_ms: Optional[int] = Field(default=None, alias="runningTimeMS", description="Execution time in milliseconds.") + +class AppExecution(BaseModel): + """ + Represents a single execution of an app, which may produce multiple views. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + view_ids: List[str] = Field(..., alias="viewIds", description="List of view IDs generated by this execution.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Configuration parameters used for this execution.") + app_profiling: AppProfiling = Field(default_factory=lambda: AppProfiling(), alias="appProfiling", description="Profiling data for this execution.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Counts of annotations produced, grouped by type.") + + +class SingleMmifDesc(BaseModel): + """ + Description of a workflow extracted from a single MMIF file. + """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow structure.") + stats: SingleMmifStats = Field(..., description="Statistics about the views and annotations.") + apps: List[AppExecution] = Field(..., description="Sequence of app executions in the workflow.") + + +def _get_profile_data(view) -> AppProfiling: """ Extract profiling data from a view's metadata. @@ -150,18 +229,18 @@ def _get_profile_data(view) -> dict: running_time_str = profiling.get("runningTime") if running_time_str is None: - return {} + return AppProfiling(runningTimeMS=None) # the format is datetime.timedelta string, e.g. 
'0:00:02.345678' # need to convert to milliseconds integer time_obj = datetime.datetime.strptime(running_time_str, "%H:%M:%S.%f").time() milliseconds = (time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second) * 1000 + time_obj.microsecond // 1000 - return {"runningTimeMS": milliseconds} + return AppProfiling(runningTimeMS=milliseconds) -def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: +def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: """ - Reads a MMIF file and extracts the workflow specification from it. + Reads a MMIF file or object and extracts the workflow specification from it. This function provides an app-centric summarization of the workflow. The conceptual hierarchy is that a **workflow** is a sequence of **apps**, @@ -170,61 +249,24 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: a single logical "app execution". .. note:: - For MMIF files generated by ``clams-python`` <= 1.3.3, all views - are independently timestamped. This means that even if multiple views - were generated by a single execution of an app, their + For MMIF files generated by apps based on ``clams-python`` <= 1.3.3, all + views are independently timestamped. This means that even if multiple + views were generated by a single execution of an app, their ``metadata.timestamp`` values will be unique. As a result, the grouping logic will treat each view as a separate app execution. The change that aligns timestamps for views from a single app execution is implemented in `clams-python PR #271 `_. - The output format is a dictionary with the following keys: - - * ``workflowId`` - A unique identifier for the workflow, based on the - sequence of app executions (app, version, parameter hashes). App - executions with errors are excluded from this identifier. App - executions with warnings are still considered successful for the purpose - of this identifier. 
- * ``stats`` - A dictionary with the following keys: - - ``appCount`` - Total number of identified app executions. - ``errorViews`` - A list of view IDs that reported errors. - ``warningViews`` - A list of view IDs that reported warnings. - ``emptyViews`` - A list of view IDs that contain no annotations. - ``annotationCountByType`` - A dictionary mapping each annotation type to its count, plus a - ``total`` key for the sum of all annotations across all app - executions. - * ``apps`` - A list of objects, where each object represents one app - execution. It includes metadata, profiling, and aggregated statistics - for all views generated by that execution. A special entry for views - that could not be assigned to an execution will be at the end of the list. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. - - :param mmif_file: Path to the MMIF file + The output is a serialized :class:`~SingleMmifDesc` object. + + .. pydantic_model:: SingleMmifDesc + :noindex: + + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object :return: A dictionary containing the workflow specification. """ - if not isinstance(mmif_file, (str, Path)): - raise ValueError( - "MMIF file path must be a string or a Path object." 
- ) - - workflow_id = generate_workflow_identifier(mmif_file) - with open(mmif_file, "r") as f: - mmif_str = f.read() - - mmif = Mmif(mmif_str) + mmif = _read_mmif_from_path(mmif_input) error_view_ids = [] warning_view_ids = [] @@ -249,17 +291,21 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: execution_view_ids = [v.id for v in group] processed_view_ids.update(execution_view_ids) - app_data = { - "app": first_view.metadata.app, - "viewIds": execution_view_ids, - "appConfiguration": first_view.metadata.get("appConfiguration", {}), - "appProfiling": _get_profile_data(first_view), - } + # Prepare annotation counts total_annotations_in_exec = sum(execution_ann_counter.values()) if total_annotations_in_exec > 0: - app_data['annotationCountByType'] = dict(execution_ann_counter) - app_data['annotationCountByType']['total'] = total_annotations_in_exec - grouped_apps.append(app_data) + count_dict = dict(execution_ann_counter) + count_dict['total'] = total_annotations_in_exec + else: + count_dict = {} + + grouped_apps.append(AppExecution( + app=first_view.metadata.app, + viewIds=execution_view_ids, + appConfiguration=first_view.metadata.get("appConfiguration", {}), + appProfiling=_get_profile_data(first_view), + annotationCountByType=count_dict + )) # Handle unassigned and problematic views all_view_ids = set(v.id for v in mmif.views) @@ -279,19 +325,23 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: app_count = len(grouped_apps) if unassigned_view_ids: - grouped_apps.append({ - "app": "http://apps.clams.ai/non-existing-app/v1", - "viewIds": sorted(list(unassigned_view_ids)) - }) + grouped_apps.append(AppExecution( + app="http://apps.clams.ai/non-existing-app/v1", + viewIds=sorted(list(unassigned_view_ids)), + appConfiguration={}, + appProfiling=AppProfiling(runningTimeMS=None), + annotationCountByType={} + )) # aggregate total annotation counts total_annotations_by_type = Counter() for execution in grouped_apps: # Only aggregate from 
actual apps, not the special unassigned entry - if execution.get('app') != "http://apps.clams.ai/non-existing-app/v1": - if 'annotationCountByType' in execution: - exec_counts = execution['annotationCountByType'].copy() - del exec_counts['total'] + if execution.app != "http://apps.clams.ai/non-existing-app/v1": + if execution.annotation_count_by_type: + exec_counts = execution.annotation_count_by_type.copy() + if 'total' in exec_counts: + del exec_counts['total'] total_annotations_by_type.update(Counter(exec_counts)) final_total_annotations = sum(total_annotations_by_type.values()) @@ -299,17 +349,79 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: if final_total_annotations > 0: final_annotation_counts['total'] = final_total_annotations - return { - "workflowId": workflow_id, - "stats": { - "appCount": app_count, - "errorViews": error_view_ids, - "warningViews": warning_view_ids, - "emptyViews": empty_view_ids, - "annotationCountByType": final_annotation_counts - }, - "apps": grouped_apps - } + return SingleMmifDesc( + workflowId=generate_workflow_identifier(mmif, return_param_dicts=False), + stats=SingleMmifStats( + appCount=app_count, + errorViews=error_view_ids, + warningViews=warning_view_ids, + emptyViews=empty_view_ids, + annotationCountByType=final_annotation_counts + ), + apps=grouped_apps + ).model_dump(by_alias=True) + + +## MMIF collection summarization + +class AppProfilingStats(BaseModel): + """ + Aggregated profiling statistics for an app across a workflow. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + avg_running_time_ms: Optional[float] = Field(default=None, alias="avgRunningTimeMS", description="Average execution time in milliseconds.") + min_running_time_ms: Optional[float] = Field(default=None, alias="minRunningTimeMS", description="Minimum execution time in milliseconds.") + max_running_time_ms: Optional[float] = Field(default=None, alias="maxRunningTimeMS", description="Maximum execution time in milliseconds.") + stdev_running_time_ms: Optional[float] = Field(default=None, alias="stdevRunningTimeMS", description="Standard deviation of execution time.") + + + + +class WorkflowAppExecution(BaseModel): + """ + Aggregated information about an app's usage within a specific workflow across multiple files. + """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Representative configuration (usually from the first occurrence).") + app_profiling: AppProfilingStats = Field(default_factory=lambda: AppProfilingStats(), alias="appProfiling", description="Aggregated profiling statistics.") + + +class WorkflowCollectionEntry(BaseModel): + """ + Summary of a unique workflow found within a collection. + """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow.") + mmifs: List[str] = Field(..., description="List of filenames belonging to this workflow.") + mmif_count: int = Field(..., alias="mmifCount", description="Number of MMIF files matching this workflow.") + apps: List[WorkflowAppExecution] = Field(..., description="Sequence of apps in this workflow with aggregated stats.") + +class MmifCountByStatus(BaseModel): + """ + Breakdown of MMIF files in a collection by their processing status. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + total: int = Field(..., description="Total number of MMIF files found.") + successful: int = Field(..., description="Number of files processed without errors.") + with_errors: int = Field(..., alias="withErrors", description="Number of files containing error views.") + with_warnings: int = Field(..., alias="withWarnings", description="Number of files containing warning views.") + invalid: int = Field(..., description="Number of files that failed to parse as valid MMIF.") + + +class CollectionMmifDesc(BaseModel): + """ + Summary of a collection of MMIF files. + """ + model_config = ConfigDict(populate_by_name=True) + + mmif_count_by_status: MmifCountByStatus = Field(..., alias="mmifCountByStatus", description="Counts of MMIF files by status.") + workflows: List[WorkflowCollectionEntry] = Field(..., description="List of unique workflows identified in the collection.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the entire collection.") def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: @@ -319,139 +431,115 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: This function provides an overview of a collection of MMIF files, aggregating statistics across multiple files. - The output format is a dictionary with the following keys: - - * ``mmifCountByStatus`` - A dictionary summarizing the processing status of all MMIF files in the - collection. It includes: - - ``total`` - Total number of MMIF files found. - ``successful`` - Number of MMIF files processed without errors (may contain warnings). - ``withErrors`` - Number of MMIF files containing app executions that reported errors. - ``withWarnings`` - Number of MMIF files containing app executions that reported warnings. - ``invalid`` - Number of files that failed to be parsed as valid MMIF. 
- * ``workflows`` - A list of "workflow" objects found in the "successful" MMIF files (files - with errors are excluded), where each object contains: - - ``workflowId`` - The unique identifier for the workflow. - ``apps`` - A list of app objects, each with ``app`` (name+ver identifier), - ``appConfiguration``, and ``appProfiling`` statistics (avg, min, max, - stdev running times) aggregated per workflow. - ``mmifs`` - A list of MMIF file basenames belonging to this workflow. - ``mmifCount`` - The number of MMIF files in this workflow. - * ``annotationCountByType`` - A dictionary aggregating annotation counts across the entire collection. - It includes a ``total`` key for the grand total, plus integer counts for - each individual annotation type. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. + The output is a serialized :class:`~CollectionMmifDesc` object. + + .. pydantic_model:: CollectionMmifDesc + :noindex: :param mmif_dir: Path to the directory containing MMIF files. :return: A dictionary containing the summarized collection specification. 
""" import statistics - from collections import defaultdict, Counter + from collections import Counter mmif_files = list(Path(mmif_dir).glob('*.mmif')) - status_summary = defaultdict(int) - status_summary['total'] = len(mmif_files) - status_summary['successful'] = 0 - status_summary['withErrors'] = 0 - status_summary['withWarnings'] = 0 - status_summary['invalid'] = 0 + status_summary = MmifCountByStatus( + total=len(mmif_files), + successful=0, + withErrors=0, + withWarnings=0, + invalid=0 + ) aggregated_counts = Counter() - workflows_data = defaultdict(lambda: { - 'mmifs': [], - 'apps': defaultdict(lambda: { - 'appConfiguration': None, # Store the first config here - 'execution_times': [] - }) - }) + # Structure: {workflow_id: {'mmifs': [...], 'apps': {app_uri: {'appConfiguration': ..., 'execution_times': [...]}}}} + workflows_data: Dict[str, Dict] = {} for mmif_file in mmif_files: try: - single_report = describe_single_mmif(mmif_file) - except Exception as e: - status_summary['invalid'] += 1 + single_report = SingleMmifDesc.model_validate(describe_single_mmif(mmif_file)) + except Exception: + status_summary.invalid += 1 continue - if single_report['stats']['errorViews']: - status_summary['withErrors'] += 1 + if single_report.stats.error_views: + status_summary.with_errors += 1 continue # Exclude from all other stats # If we get here, the MMIF has no errors and is considered "successful" - status_summary['successful'] += 1 - if single_report['stats']['warningViews']: - status_summary['withWarnings'] += 1 - - wf_id = single_report['workflowId'] + status_summary.successful += 1 + if single_report.stats.warning_views: + status_summary.with_warnings += 1 + + wf_id = single_report.workflow_id + # Initialize workflow entry if not exists + if wf_id not in workflows_data: + workflows_data[wf_id] = {'mmifs': [], 'apps': {}} workflows_data[wf_id]['mmifs'].append(Path(mmif_file).name) # Aggregate annotation counts for successful mmifs - report_counts = 
single_report['stats'].get('annotationCountByType', {}) + report_counts = single_report.stats.annotation_count_by_type.copy() if 'total' in report_counts: del report_counts['total'] # don't add the sub-total to the main counter aggregated_counts.update(report_counts) - for app_exec in single_report.get('apps', []): - app_uri = app_exec.get('app') + for app_exec in single_report.apps: + app_uri = app_exec.app # skip the special "unassigned" app if app_uri and app_uri != "http://apps.clams.ai/non-existing-app/v1": - running_time = app_exec.get('appProfiling', {}).get('runningTimeMS') + # Initialize app entry if not exists + if app_uri not in workflows_data[wf_id]['apps']: + workflows_data[wf_id]['apps'][app_uri] = { + 'appConfiguration': None, + 'execution_times': [] + } + + running_time = app_exec.app_profiling.running_time_ms if running_time is not None: workflows_data[wf_id]['apps'][app_uri]['execution_times'].append(running_time) # Store the first non-empty app configuration we find for this app in this workflow if workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] is None: - config = app_exec.get('appConfiguration', {}) + config = app_exec.app_configuration if config: workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] = config # Process collected data into the final output format final_workflows_list = [] for wf_id, wf_data in sorted(workflows_data.items()): - workflow_object = { - 'workflowId': wf_id, - 'mmifs': sorted(wf_data['mmifs']), - 'mmifCount': len(wf_data['mmifs']), - 'apps': [] - } + workflow_apps = [] for app_uri, app_data in sorted(wf_data['apps'].items()): times = app_data['execution_times'] if times: - profiling_stats = { - 'avgRunningTimeMS': statistics.mean(times), - 'minRunningTimeMS': min(times), - 'maxRunningTimeMS': max(times), - 'stdevRunningTimeMS': statistics.stdev(times) if len(times) > 1 else 0 - } + profiling_stats = AppProfilingStats( + avgRunningTimeMS=statistics.mean(times), + minRunningTimeMS=min(times), + 
maxRunningTimeMS=max(times), + stdevRunningTimeMS=statistics.stdev(times) if len(times) > 1 else 0 + ) else: - profiling_stats = {} - - app_object = { - 'app': app_uri, - 'appConfiguration': app_data['appConfiguration'] or {}, # Default to empty dict - 'appProfiling': profiling_stats - } - workflow_object['apps'].append(app_object) - - final_workflows_list.append(workflow_object) + profiling_stats = AppProfilingStats( + avgRunningTimeMS=None, + minRunningTimeMS=None, + maxRunningTimeMS=None, + stdevRunningTimeMS=None + ) + + workflow_apps.append(WorkflowAppExecution( + app=app_uri, + appConfiguration=app_data['appConfiguration'] or {}, + appProfiling=profiling_stats + )) + + final_workflows_list.append(WorkflowCollectionEntry( + workflowId=wf_id, + mmifs=sorted(wf_data['mmifs']), + mmifCount=len(wf_data['mmifs']), + apps=workflow_apps + )) # Finalize annotation counts final_annotation_counts = dict(aggregated_counts) @@ -459,8 +547,8 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: if grand_total > 0: final_annotation_counts['total'] = grand_total - return { - 'mmifCountByStatus': dict(status_summary), - 'workflows': final_workflows_list, - 'annotationCountByType': final_annotation_counts - } + return CollectionMmifDesc( + mmifCountByStatus=status_summary, + workflows=final_workflows_list, + annotationCountByType=final_annotation_counts + ).model_dump(by_alias=True) diff --git a/mmif_docloc_http/__init__.py b/mmif_docloc_http/__init__.py index 9bdf9f22..1d954474 100644 --- a/mmif_docloc_http/__init__.py +++ b/mmif_docloc_http/__init__.py @@ -1,16 +1,30 @@ +""" +MMIF document location helper module for ``http`` and ``https`` schemes. + +If you want to write your own docloc scheme handler, please use the source +code of this module as a reference. See the :ref:`plug-in section ` +for more information. 
+""" + import urllib.request import urllib.error +_cache = {} + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] try: if docloc.startswith('http://') or docloc.startswith('https://'): - return urllib.request.urlretrieve(docloc)[0] + path = urllib.request.urlretrieve(docloc)[0] + _cache[docloc] = path + return path else: raise ValueError(f'cannot handle document location scheme: {docloc}') except urllib.error.URLError as e: raise e - - + + def help(): return "location must be a URL string." diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..07055628 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = mmif tests +python_files = test_*.py *_test.py diff --git a/requirements.txt b/requirements.txt index a97c214e..c3e9d722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ orderly-set==5.3.* # 5.4 drops py38 support jsonschema +pydantic>=2.0 diff --git a/tests/mmif_examples.py b/tests/mmif_examples.py index b19f9d9a..8a8f4c6f 100644 --- a/tests/mmif_examples.py +++ b/tests/mmif_examples.py @@ -55,7 +55,7 @@ def _load_from_url_or_git(url): old_mmif_w_short_id_url = f"https://raw.githubusercontent.com/clamsproject/mmif/1.0.5/specifications/samples/everything/raw.json" EVERYTHING_JSON = _load_from_url_or_git(everything_file_url) OLD_SHORTID_JSON = _load_from_url_or_git(old_mmif_w_short_id_url) -SWT_1_0_JSON = open('tests/samples/1.0/swt.mmif').read() +SWT_1_0_JSON = (Path(__file__).resolve().parent / 'samples' / '1.0' / 'swt.mmif').read_text() # for keys and values in chain all typevers in mmif.vocabulary.*_types modules # merge into a single dict diff --git a/tests/test_serialize.py b/tests/test_serialize.py index b0836c5a..f5b0846f 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -269,6 +269,21 @@ def test_document_location_helpers_http(self): # round_trip = Document(new_doc.serialize()) self.assertEqual(Document(new_doc.serialize()).serialize(), new_doc.serialize()) + def 
test_document_location_http_caching(self): + import mmif_docloc_http + mmif_docloc_http._cache.clear() + test_url = "https://example.com/" + self.assertNotIn(test_url, mmif_docloc_http._cache) + new_doc = Document() + new_doc.id = "d1" + new_doc.location = test_url + new_doc.location_path() + self.assertIn(test_url, mmif_docloc_http._cache) + # second call should use cache (same path returned) + cached_path = mmif_docloc_http._cache[test_url] + second_path = new_doc.location_path() + self.assertEqual(cached_path, second_path) + def test_get_documents_locations(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) self.assertEqual(1, len(mmif_obj.get_documents_locations(DocumentTypes.VideoDocument))) @@ -593,6 +608,25 @@ def test_get_label(self): a = v.new_annotation(AnnotationTypes.BoundingBox) _ = a._get_label() + def test_timestamp_uses_utc_with_z_suffix(self): + """Test that timestamps are in UTC with 'Z' suffix to avoid ambiguity""" + from datetime import timezone + mmif_obj = Mmif(validate=False) + + new_view = mmif_obj.new_view() + new_view.metadata.app = "http://test.app" + + # Verify the timestamp is timezone-aware and uses UTC + self.assertIsNotNone(new_view.metadata.timestamp) + self.assertIsNotNone(new_view.metadata.timestamp.tzinfo) + self.assertEqual(new_view.metadata.timestamp.tzinfo, timezone.utc) + + # Verify serialization uses 'Z' suffix instead of '+00:00' + serialized = json.loads(mmif_obj.serialize()) + ts = serialized['views'][0]['metadata']['timestamp'] + self.assertTrue(ts.endswith('Z')) + self.assertNotIn('+00:00', ts) + def test_get_anchor_point(self): mmif = Mmif(validate=False) v1 = mmif.new_view() diff --git a/tests/test_utils.py b/tests/test_utils.py index 0c261fe7..1d903b10 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,17 +1,26 @@ +import json +import os import pathlib -import unittest import tempfile -import json +import unittest +from pathlib import Path +from unittest import mock import pytest - -from mmif import 
Mmif, Document, AnnotationTypes +from hypothesis import given +from hypothesis import strategies as st + +from mmif import ( + AnnotationTypes, + Document, + Mmif +) from mmif.utils import sequence_helper as sqh from mmif.utils import text_document_helper as tdh from mmif.utils import timeunit_helper as tuh from mmif.utils import video_document_helper as vdh -from tests.mmif_examples import * -from hypothesis import given, strategies as st +from mmif.utils import workflow_helper as wfh +from tests import mmif_examples class TestTimeunitHelper(unittest.TestCase): @@ -135,6 +144,86 @@ def test_extract_frames_as_images(self): self.assertEqual(4, len(frame_list)) self.assertEqual(3, len(new_target_images)) + def test_sample_all(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps]) + + frame_nums = vdh._sample_all(self.mmif_obj, parent_ann) + self.assertEqual(10, len(frame_nums)) + self.assertEqual([i * 100 for i in range(10)], frame_nums) + + # start/end fallback (no targets) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + start=0, end=10, timeUnit='frame', + document=self.video_doc.id) + frame_nums2 = vdh._sample_all(self.mmif_obj, parent_ann2) + self.assertEqual(list(range(10)), frame_nums2) + + def test_sample_representatives(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + reps = [tps[2].id, tps[5].id, tps[8].id] + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps], + representatives=reps) + + # should use representatives + frame_nums = vdh._sample_representatives( + self.mmif_obj, parent_ann) + 
self.assertEqual(3, len(frame_nums)) + self.assertEqual([200, 500, 800], frame_nums) + + # without representatives, should return empty (skip) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps]) + frame_nums2 = vdh._sample_representatives( + self.mmif_obj, parent_ann2) + self.assertEqual([], frame_nums2) + + def test_sample_single(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + reps = [tps[2].id, tps[5].id, tps[8].id] + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps], + representatives=reps) + + # should pick middle representative (index 1 of 3 = tps[5]) + frame_nums = vdh._sample_single( + self.mmif_obj, parent_ann) + self.assertEqual([500], frame_nums) + + # start/end fallback (no representatives) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + start=100, end=500, timeUnit='frame', + document=self.video_doc.id) + frame_nums2 = vdh._sample_single( + self.mmif_obj, parent_ann2) + self.assertEqual([300], frame_nums2) + class TestSequenceHelper(unittest.TestCase): @@ -205,7 +294,7 @@ def test_width_based_smoothing(self): class TestTextDocHelper(unittest.TestCase): - mmif_obj = Mmif(MMIF_EXAMPLES['everything']) + mmif_obj = Mmif(mmif_examples.MMIF_EXAMPLES['everything']) @pytest.mark.skip("The only valid test cases come from kaldi app which annotates wrong property") def test_slice_text(self): @@ -232,8 +321,6 @@ def setUp(self) -> None: def create_temp_mmif_file(self, mmif_obj): """Helper to create a temporary MMIF file.""" - import tempfile - import json tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) if isinstance(mmif_obj, Mmif): content_to_write = mmif_obj.serialize(pretty=False) @@ -244,24 +331,20 @@ def create_temp_mmif_file(self, mmif_obj): return tmp.name def 
test_split_appname_appversion(self): - from mmif.utils.workflow_helper import _split_appname_appversion - app_name, app_version = _split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") + app_name, app_version = wfh._split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") self.assertEqual(app_name, "test-app") self.assertEqual(app_version, "v1.0.0") def test_generate_param_hash(self): - from mmif.utils.workflow_helper import generate_param_hash params = {"param1": "value1", "param2": 42} - hash1 = generate_param_hash(params) - hash2 = generate_param_hash(params) + hash1 = wfh.generate_param_hash(params) + hash2 = wfh.generate_param_hash(params) self.assertEqual(hash1, hash2) params_reversed = {"param2": 42, "param1": "value1"} - hash3 = generate_param_hash(params_reversed) + hash3 = wfh.generate_param_hash(params_reversed) self.assertEqual(hash1, hash3) def test_generate_workflow_identifier_grouped(self): - from mmif.vocabulary import AnnotationTypes - from mmif.utils import workflow_helper view1 = self.basic_mmif.new_view() view1.metadata.app = "http://apps.clams.ai/app1/v1.0.0" view1.metadata.timestamp = "2024-01-01T12:00:00Z" @@ -274,14 +357,210 @@ def test_generate_workflow_identifier_grouped(self): tmp_file = self.create_temp_mmif_file(self.basic_mmif) import os try: - workflow_id = workflow_helper.generate_workflow_identifier(tmp_file) + workflow_id = wfh.generate_workflow_identifier(tmp_file) segments = workflow_id.split('/') - self.assertEqual(len(segments), 7) - self.assertIn('app1', segments[1]) - self.assertIn('app2', segments[4]) + self.assertEqual(len(segments), 6) + self.assertIn('app1', segments[0]) + self.assertIn('app2', segments[3]) finally: os.unlink(tmp_file) + def test_generate_workflow_identifier_with_mmif_object(self): + """Test that generate_workflow_identifier accepts Mmif objects directly.""" + import os + + # Test with Mmif object directly + workflow_id_from_obj = wfh.generate_workflow_identifier(self.basic_mmif) + + # 
Test with file path - should produce the same result + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + workflow_id_from_file = wfh.generate_workflow_identifier(tmp_file) + self.assertEqual(workflow_id_from_obj, workflow_id_from_file) + finally: + os.unlink(tmp_file) + + def test_read_mmif_from_path(self): + """Test the _read_mmif_from_path helper function.""" + + # Test with Mmif object - should return as-is + result = wfh._read_mmif_from_path(self.basic_mmif) + self.assertIs(result, self.basic_mmif) + + # Test with file path string + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result_from_str = wfh._read_mmif_from_path(tmp_file) + self.assertIsInstance(result_from_str, Mmif) + self.assertEqual(result_from_str.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) + + # Test with Path object + result_from_path = wfh._read_mmif_from_path(Path(tmp_file)) + self.assertIsInstance(result_from_path, Mmif) + self.assertEqual(result_from_path.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) + finally: + os.unlink(tmp_file) + + # Test with invalid input + with pytest.raises(ValueError): + wfh._read_mmif_from_path(12345) + + def test_describe_single_mmif_with_mmif_object(self): + """Test that describe_single_mmif accepts Mmif objects directly.""" + import os + + # Test with Mmif object directly + result_from_obj = wfh.describe_single_mmif(self.basic_mmif) + + # Test with file path - should produce the same result + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result_from_file = wfh.describe_single_mmif(tmp_file) + self.assertEqual(result_from_obj, result_from_file) + + # Validate that the output conforms to the SingleMmifDesc Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.SingleMmifDesc.model_validate(result_from_obj) + # Can assert on the validated object's attributes if needed + self.assertIsNotNone(validated.workflow_id) + 
self.assertIsNotNone(validated.stats) + self.assertIsNotNone(validated.apps) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_empty(self): + """Test describe_single_mmif with an empty MMIF (no views).""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(validated.stats.annotation_count_by_type, {}) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app(self): + """Test describe_single_mmif with a single app execution.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.appProfiling = {"runningTime": "0:00:01.234"} + view.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.app, view.metadata.app) + self.assertEqual(app_exec.view_ids, [view.id]) + self.assertEqual(app_exec.app_profiling.running_time_ms, 1234) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app_two_views(self): + """Test describe_single_mmif with one app execution producing two views.""" + view1 = self.basic_mmif.new_view() + view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view1.metadata.timestamp = "2024-01-01T12:00:00Z" + view1.new_annotation(AnnotationTypes.TimeFrame) + view2 = self.basic_mmif.new_view() + view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view2.metadata.timestamp = "2024-01-01T12:00:00Z" + 
view2.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.view_ids, [view1.id, view2.id]) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_error_view(self): + """Test describe_single_mmif with a view containing an error.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.error = {"message": "Something went wrong"} + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(len(validated.stats.error_views), 1) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_with_unassigned_views(self): + """Test describe_single_mmif with views that cannot be grouped.""" + import unittest.mock + raw_mmif = json.loads(self.basic_mmif.serialize()) + raw_mmif['views'].append({'id': 'v1', 'metadata': {'app': 'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) + tmp_file = self.create_temp_mmif_file(raw_mmif) + try: + with unittest.mock.patch('jsonschema.validators.validate'): + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = 
wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 2) + special_entry = validated.apps[-1] + self.assertEqual(special_entry.app, 'http://apps.clams.ai/non-existing-app/v1') + self.assertEqual(len(special_entry.view_ids), 2) + self.assertIn('v2', special_entry.view_ids) + self.assertIn('v3', special_entry.view_ids) + finally: + os.unlink(tmp_file) + + def test_describe_collection_empty(self): + """Test describe_mmif_collection with an empty directory.""" + dummy_dir = 'dummy_mmif_collection' + os.makedirs(dummy_dir, exist_ok=True) + try: + output = wfh.describe_mmif_collection(dummy_dir) + # Validate using Pydantic model + validated = wfh.CollectionMmifDesc.model_validate(output) + self.assertEqual(validated.mmif_count_by_status.total, 0) + self.assertEqual(len(validated.workflows), 0) + finally: + os.rmdir(dummy_dir) + + def test_describe_collection_with_files(self): + """Test describe_mmif_collection with MMIF files.""" + dummy_dir = 'dummy_mmif_collection_with_files' + os.makedirs(dummy_dir, exist_ok=True) + try: + # Create two MMIF files in the directory + for i in range(2): + tmp_file = os.path.join(dummy_dir, f'{i}.mmif') + with open(tmp_file, 'w') as f: + f.write(self.basic_mmif.serialize()) + + output = wfh.describe_mmif_collection(dummy_dir) + + # Validate structure using Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.CollectionMmifDesc.model_validate(output) + + # Verify counts using validated object attributes + self.assertEqual(validated.mmif_count_by_status.total, 2) + self.assertIsInstance(validated.workflows, list) + finally: + import shutil + shutil.rmtree(dummy_dir) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index fa0f8906..dd33fec2 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -1,3 +1,4 @@ +import argparse import 
contextlib import io import json @@ -6,17 +7,124 @@ import unittest.mock import mmif -from mmif.utils.cli import rewind -from mmif.utils.cli import source -from mmif.utils.cli import describe - from mmif.serialize import Mmif -from mmif.vocabulary import DocumentTypes, AnnotationTypes +from mmif.utils.cli import describe, rewind, source, summarize +from mmif.vocabulary import AnnotationTypes + +BASIC_MMIF_STRING = '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' + + +class BaseCliTestCase(unittest.TestCase): + """Base class for CLI module tests with common utilities.""" + + cli_module = None # Override in subclass + + def setUp(self): + """Set up common test fixtures.""" + if self.cli_module: + self.parser = self.cli_module.prep_argparser() + self.basic_mmif = Mmif(BASIC_MMIF_STRING) + self.maxDiff = None + + @staticmethod + def create_temp_mmif_file(mmif_obj): + """Create a temporary MMIF file for testing. + + Args: + mmif_obj: Either a Mmif object or a dict/string to serialize + + Returns: + str: Path to the temporary file (caller must unlink) + """ + tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) + if isinstance(mmif_obj, Mmif): + content = mmif_obj.serialize(pretty=False) + else: + content = json.dumps(mmif_obj) if isinstance(mmif_obj, dict) else mmif_obj + tmp.write(content) + tmp.close() + return tmp.name + + def run_cli_capture_stdout(self, args_namespace): + """Run CLI module and capture stdout as parsed JSON. 
+ + Args: + args_namespace: Namespace object with CLI arguments + + Returns: + dict: Parsed JSON output from stdout + """ + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + self.cli_module.main(args_namespace) + return json.loads(stdout.getvalue()) + + +class IOTestMixin: + """Mixin providing common I/O tests for CLI modules. + + Requires the test class to have: + - cli_module attribute + - basic_mmif attribute + - create_temp_mmif_file method + - run_cli_capture_stdout method + - expected_output_keys attribute (list of keys to check in output) + """ + + def test_file_input_stdout_output(self): + """Test reading from file and outputting to stdout.""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + args = argparse.Namespace( + MMIF_FILE=tmp_file, + output=None, + pretty=False, + help_schema=None # For describe module + ) + output = self.run_cli_capture_stdout(args) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_file) + + def test_file_input_file_output(self): + """Test reading from file and outputting to file.""" + tmp_input = self.create_temp_mmif_file(self.basic_mmif) + tmp_output = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) + tmp_output.close() + try: + args = self.parser.parse_args([tmp_input, '-o', tmp_output.name]) + self.cli_module.main(args) + with open(tmp_output.name, 'r') as f: + output = json.load(f) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_input) + os.unlink(tmp_output.name) + + def test_stdin_input_stdout_output(self): + """Test reading from stdin and outputting to stdout.""" + mmif_str = self.basic_mmif.serialize() + with unittest.mock.patch('sys.stdin', io.StringIO(mmif_str)), \ + unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace( + MMIF_FILE=None, + output=None, + 
pretty=False, + help_schema=None # For describe module + ) + self.cli_module.main(args) + output = json.loads(stdout.getvalue()) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) class TestCli(unittest.TestCase): def setUp(self) -> None: - self.parser, _ = mmif.prep_argparser_and_subcmds() + self.parser, _, _ = mmif.prep_argparser_and_subcmds() def test_primary_cli(self): stdout = io.StringIO() @@ -50,9 +158,8 @@ def generate_source_mmif(self): # to suppress output (otherwise, set to stdout by default) args = self.parser.parse_args(self.get_params()) - with open(os.devnull, 'w') as devnull: - args.output = devnull - return source.main(args) + args.output = os.devnull + return source.main(args) def test_accept_file_paths(self): self.docs.append("video:/a/b/c.mp4") @@ -120,24 +227,15 @@ def test_generate_mixed_scheme(self): class TestRewind(unittest.TestCase): def setUp(self): + empty_mmif_str = ('{"metadata": {"mmif": ' + '"http://mmif.clams.ai/1.0.0"}, "documents": [], ' + '"views": []}') # mmif we add views to - self.mmif_one = Mmif( - { - "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, - "documents": [], - "views": [], - } - ) + self.mmif_one = Mmif(empty_mmif_str) # baseline empty mmif for comparison - self.empty_mmif = Mmif( - { - "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, - "documents": [], - "views": [], - } - ) - + self.empty_mmif = Mmif(empty_mmif_str) + @staticmethod def add_dummy_view(mmif: Mmif, appname: str, timestamp: str = None): v = mmif.new_view() @@ -185,122 +283,86 @@ def test_app_rewind(self): self.assertIn('dummy_app_two', remaining_apps) -class TestDescribe(unittest.TestCase): +class TestDescribe(BaseCliTestCase, IOTestMixin): """Test suite for the describe CLI module.""" + + cli_module = describe + expected_output_keys = ['workflowId', 'stats', 'apps'] - def setUp(self): - """Create test MMIF structures.""" - self.parser = describe.prep_argparser() - self.maxDiff = None - 
self.basic_mmif = Mmif( - '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' - ) - - def create_temp_mmif_file(self, mmif_obj): - """Helper to create a temporary MMIF file.""" - tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) - if isinstance(mmif_obj, Mmif): - content_to_write = mmif_obj.serialize(pretty=False) - else: - content_to_write = json.dumps(mmif_obj) - tmp.write(content_to_write) - tmp.close() - return tmp.name - - def test_describe_single_mmif_empty(self): + def test_help_schema(self): + """Test --help-schema with different options""" + from mmif.utils.workflow_helper import SingleMmifDesc, CollectionMmifDesc + + # Test mmif-file + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace(help_schema=['mmif-file'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + # Verify SingleMmifDesc schema keys are present + self.assertIn("workflowId", output) + self.assertIn("stats", output) + self.assertIn("apps", output) + + # Test mmif-dir + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace(help_schema=['mmif-dir'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + # Verify CollectionMmifDesc schema keys are present + self.assertIn("mmifCountByStatus", output) + self.assertIn("workflows", output) + + def test_describe_main_directory(self): + """Test describe.main with a directory input""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Create two mmif files + with open(os.path.join(tmp_dir, '1.mmif'), 'w') as f: 
+ f.write(self.basic_mmif.serialize()) + with open(os.path.join(tmp_dir, '2.mmif'), 'w') as f: + f.write(self.basic_mmif.serialize()) + + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + # MMIF_FILE argument expects a string path + args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schema=None) + describe.main(args) + output_json = json.loads(stdout.getvalue()) + # Just verify valid JSON output was produced + self.assertIsInstance(output_json, dict) + self.assertTrue(len(output_json) > 0) + + def test_deprecated_functions(self): + """Test backward compatibility wrapper functions""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - self.assertEqual(result["stats"]["annotationCountByType"], {}) + with self.assertWarns(DeprecationWarning): + describe.get_pipeline_specs(tmp_file) + with self.assertWarns(DeprecationWarning): + describe.generate_pipeline_identifier(tmp_file) finally: os.unlink(tmp_file) - def test_describe_single_mmif_one_app(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.appProfiling = {"runningTime": "0:00:01.234"} - view.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["app"], view.metadata.app) - self.assertEqual(app_exec["viewIds"], [view.id]) - self.assertEqual(app_exec["appProfiling"]["runningTimeMS"], 1234) - finally: - os.unlink(tmp_file) - def test_describe_single_mmif_one_app_two_views(self): - view1 = self.basic_mmif.new_view() 
- view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view1.metadata.timestamp = "2024-01-01T12:00:00Z" - view1.new_annotation(AnnotationTypes.TimeFrame) - view2 = self.basic_mmif.new_view() - view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view2.metadata.timestamp = "2024-01-01T12:00:00Z" - view2.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["viewIds"], [view1.id, view2.id]) - finally: - os.unlink(tmp_file) +class TestSummarize(BaseCliTestCase, IOTestMixin): + """Test suite for the summarize CLI module.""" + + cli_module = summarize + expected_output_keys = ['mmif_version', 'documents', 'views'] - def test_describe_single_mmif_error_view(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.error = {"message": "Something went wrong"} + def test_summarize_validates_content(self): + """Test that summarize produces expected content.""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - self.assertEqual(len(result["stats"]["errorViews"]), 1) + output = self.run_cli_capture_stdout( + argparse.Namespace(MMIF_FILE=tmp_file, output=None, pretty=False) + ) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") finally: os.unlink(tmp_file) - @unittest.mock.patch('jsonschema.validators.validate') - def test_describe_single_mmif_with_unassigned_views(self, mock_validate): - raw_mmif = json.loads(self.basic_mmif.serialize()) - raw_mmif['views'].append({'id': 'v1', 
'metadata': {'app': 'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) - tmp_file = self.create_temp_mmif_file(raw_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result['stats']['appCount'], 1) - self.assertEqual(len(result['apps']), 2) - special_entry = result['apps'][-1] - self.assertEqual(special_entry['app'], 'http://apps.clams.ai/non-existing-app/v1') - self.assertEqual(len(special_entry['viewIds']), 2) - self.assertIn('v2', special_entry['viewIds']) - self.assertIn('v3', special_entry['viewIds']) - finally: - os.unlink(tmp_file) - - def test_describe_collection_empty(self): - dummy_dir = 'dummy_mmif_collection' - os.makedirs(dummy_dir, exist_ok=True) - try: - output = mmif.utils.workflow_helper.describe_mmif_collection(dummy_dir) - expected = { - 'mmifCountByStatus': {'total': 0, 'successful': 0, 'withErrors': 0, 'withWarnings': 0, 'invalid': 0}, - 'workflows': [], - 'annotationCountByType': {} - } - self.assertEqual(output, expected) - finally: - os.rmdir(dummy_dir) - if __name__ == '__main__': unittest.main()