diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 6be5812e..781bec97 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,12 +1,27 @@ -name: "πŸ“¦ Publish (docs, PyPI)" +name: "πŸ“¦ Publish (PyPI + docs)" -on: - push: - tags: +on: + push: + tags: - '[0-9]+.[0-9]+.[0-9]+' jobs: - package-and-upload: - name: "πŸ€™ Call SDK publish workflow" + publish-pypi: + name: "πŸ“¦ Build and upload to PyPI" uses: clamsproject/.github/.github/workflows/sdk-publish.yml@main secrets: inherit + + publish-docs: + name: "πŸ“– Build and publish docs" + needs: publish-pypi + uses: clamsproject/clamsproject.github.io/.github/workflows/sdk-docs.yml@main + with: + source_repo: clamsproject/mmif-python + source_ref: ${{ github.ref_name }} + project_name: mmif-python + version: ${{ github.ref_name }} + build_command: 'python3 build-tools/docs.py --build-ver ${{ github.ref_name }} --output-dir docs' + docs_output_dir: 'docs/${{ github.ref_name }}' + python_version: '3.11' + update_latest: true + secrets: inherit diff --git a/.gitignore b/.gitignore index 013ab917..7588b933 100644 --- a/.gitignore +++ b/.gitignore @@ -79,5 +79,10 @@ mmif/vocabulary # Documentation build artifacts documentation/cli_help.rst -documentation/whatsnew.rst +documentation/whatsnew.md +documentation/autodoc docs-test + +# environments +.venv* +venv* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0c7f166b..e544de48 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,72 @@ # Contributing to mmif-python +## Git Workflow + +We follow a Gitflow-inspired branching model to maintain a stable `main` branch and a dynamic `develop` branch. + +1. **Branch Roles**: + - `main`: Reserved for stable, production-ready releases. + - `develop`: The primary branch for ongoing development, feature integration, and bug fixes. This serves as the "staging" area for the next release. +2. 
**Issue Tracking**: Every contribution (bug fix or feature) must first be reported as a [GitHub Issue](https://github.com/clamsproject/mmif-python/issues). Issues should clearly define goals and, preferably, include an implementation plan. +3. **Branch Naming**: Create a dedicated working branch for each issue. Branches must be named using the format `NUM-short-description`, where `NUM` is the issue number (e.g., `113-fix-file-loading`). +4. **Pull Requests (PRs)**: + - Once work is complete, open a PR targeting the `develop` branch. + - **Communication**: High-level discussion and planning should occur in the issue thread. The PR conversation is strictly for code review and implementation-specific feedback. +5. **Releases**: + - When `develop` is ready for a new release, open a PR from `develop` to `main` using the "release" PR template. + - After merging the release candidate into `main`, manually tag the commit with the version number. This tag triggers the automated CI/CD pipeline for publishing. +6. **Branch Protection**: Both `main` and `develop` are protected branches. Direct pushes are disabled; all changes must be introduced via Pull Requests. + +## CLI Scripts + +The `mmif` command-line interface supports subcommands (e.g., `mmif source`, `mmif describe`). These are implemented as Python modules in `mmif/utils/cli/`. + +### Adding a New CLI Script + +To add a new CLI subcommand, create a Python module in `mmif/utils/cli/` with these three required functions: + +1. **`prep_argparser(**kwargs)`** - Define and return an `argparse.ArgumentParser` instance for your subcommand. When called during discovery, the main CLI will pass `add_help=False` to this function to avoid duplicate help flags. + +2. **`describe_argparser()`** - Return a tuple of two strings: + - A one-line description (shown in `mmif --help`) + - A more verbose description (shown in `mmif <subcommand> --help`) + +3. **`main(args)`** - Execute the subcommand logic with the parsed arguments. 
+ +### Standard I/O Argument Pattern + +To ensure a consistent user experience and avoid resource leaks, all CLI subcommands should adhere to the following I/O argument patterns using the `mmif.utils.cli.open_cli_io_arg` context manager (which replaces the deprecated `argparse.FileType`): + +1. **Input**: Use a positional argument (usually named `MMIF_FILE`) that supports both file paths and STDIN. + - In `prep_argparser`, use `nargs='?'`, `type=str`, and `default=None`. + - In `main`, use `with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file:`. +2. **Output**: Use the `-o`/`--output` flag for the output destination. + - In `prep_argparser`, use `type=str` and `default=None`. + - In `main`, use `with open_cli_io_arg(args.output, 'w', default_stdout=True) as output_file:`. +3. **Formatting**: Use the `-p`/`--pretty` flag as a boolean switch (`action='store_true'`) to toggle between compact and pretty-printed JSON/MMIF output. + +> [!NOTE] +> CLI modules should typically act as thin wrappers. It is recommended to implement the core utility logic in other packages (e.g., `mmif.utils`) and import it into the CLI module. See existing modules like `summarize.py` (which imports from `mmif.utils.summarizer`) or `describe.py` for examples. + +### How CLI Discovery Works + +The CLI system automatically discovers subcommands at runtime. The entry point is configured in the build script (currently `setup.py`) as follows: + +```python +entry_points={ + 'console_scripts': [ + 'mmif = mmif.__init__:cli', + ], +}, +``` + +The `cli()` function in `mmif/__init__.py` handles discovery and delegation. It uses `pkgutil.walk_packages` to find all modules within the top-level of the `mmif.utils.cli` package. For the discovery logic to work, a "cli module" should implement the requirements outlined above. + +This means adding a properly structured module within the CLI package is all that's needed—the module name will automatically be registered as a subcommand. 
No modifications to `setup.py` or other configuration files are required. + +> [!NOTE] +> Any "client" code (not shell CLI) that wants to use a module in the `cli` package should be able to directly `from mmif.utils.cli import a_module`. However, for historical reasons, some CLI modules are manually imported in `mmif/__init__.py` (e.g., `source.py`) for backward compatibility for clients predating the discovery system. + ## Documentation The documentation for `mmif-python` is built using Sphinx and published to the [CLAMS documentation hub](https://github.com/clamsproject/website-test). @@ -9,12 +76,38 @@ The documentation for `mmif-python` is built using Sphinx and published to the [ To build the documentation for the current checkout: ```bash -make doc -# OR python3 build-tools/docs.py ``` -The output will be in `documentation/_build/html`. +The output will be in `docs-test`. For more options, run `python build-tools/docs.py --help`. + +> [!NOTE] +> Since the documentation build process relies on a working `mmif` package, one must "build" the package first before building the documentation. This can be done by running +> ```bash +> rm VERSION* # remove existing VERSION file if exists +> make devversion # creates a dummy VERSION file +> pip install -r requirements.dev # install dev dependencies +> python setup.py sdist # build the package (will download auto-generated subpackages like `mmif.res` and `mmif.ver`) + +> [!NOTE] +> running `build-tools/docs.py` in "local testing" mode will overwrite any existing VERSION file with a dummy version. + +### API Documentation (autodoc) + +As of 2026 (since the next version of 1.2.1), API documentation is **automatically generated** using `sphinx-apidoc`. When you run the documentation build: + +1. The `run_apidoc()` function in `documentation/conf.py` runs automatically +2. It scans packages listed in `apidoc_package_names` (currently `mmif` and `mmif_docloc_http`) +3. RST files are generated in `documentation/autodoc/` +4. 
These files are **not tracked in git** - they're regenerated on each build + +**When you add a new module or subpackage**, it will be automatically documented on the next build. No manual updates required. + +**To add a new top-level package** (like `mmif_docloc_http`), add it to `apidoc_package_names` in `documentation/conf.py`. + +**To exclude a subpackage** from documentation (like `mmif.res` or `mmif.ver`), add it to `apidoc_exclude_paths`. + +**Module docstrings** in `__init__.py` files are used as package descriptions in the documentation. Keep them concise and informative. ### Building Documentation for Old Versions diff --git a/Makefile b/Makefile index ec63ccf1..bac1919e 100644 --- a/Makefile +++ b/Makefile @@ -36,17 +36,12 @@ publish: distclean version package test $(generatedcode): dist/$(sdistname)*.tar.gz docs: - @echo "WARNING: The 'docs' target is deprecated and will be removed." - @echo "The 'docs' directory is no longer used. Documentation is now hosted in the central CLAMS documentation hub." - @echo "Use 'make doc' for local builds or 'make doc-version' for specific versions." - @echo "Nothing is done." + @echo "The 'docs' target is deprecated and will be removed." + @echo "Documentation is now managed by 'build-tools/docs.py'." + @echo "Please run 'python3 build-tools/docs.py --help' for usage." 
-doc: # for single version sphinx - builds current source - python3 build-tools/docs.py - -doc-version: # interactive build for specific version - @read -p "Enter version/tag to build (e.g., v1.0.0): " ver; \ - [ -n "$$ver" ] && python3 build-tools/docs.py --build-ver $$ver +doc: docs +doc-version: docs package: VERSION dist/$(sdistname)*.tar.gz @@ -85,15 +80,15 @@ version: VERSION; cat VERSION # since the GH api will return tags in chronological order, we can just grab the last one without sorting AUTH_ARG := $(if $(GITHUB_TOKEN),-H "Authorization: token $(GITHUB_TOKEN)") -VERSION.dev: devver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif-python/git/refs/tags" | grep '"ref":' | sed -E 's/.+refs\/tags\/([0-9.]+)",/\1/g' | tail -n 1) -VERSION.dev: specver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif/git/refs/tags" | grep '"ref":' | grep -v 'py-' | sed -E 's/.+refs\/tags\/(spec-)?([0-9.]+)",/\2/g' | tail -n 1) +VERSION.dev: devver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif-python/git/refs/tags" | grep '"ref":' | sed -E 's/.+refs\/tags\/([0-9.]+)",/\1/g' | sort -V | tail -n 1) +VERSION.dev: specver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif/git/refs/tags" | grep '"ref":' | grep -v 'py-' | sed -E 's/.+refs\/tags\/(spec-)?([0-9.]+)",/\2/g' | sort -V | tail -n 1) VERSION.dev: @echo DEVVER: $(devver) @echo SPECVER: $(specver) @if [ $(call macro,$(devver)) = $(call macro,$(specver)) ] && [ $(call micro,$(devver)) = $(call micro,$(specver)) ] ; \ then \ if [[ $(devver) == *.dev* ]]; then echo $(call increase_dev,$(devver)) ; else echo $(call add_dev,$(call increase_patch, $(devver))); fi \ - else echo $(call add_dev,$(specver)) ; fi \ + else if [[ $(devver) == *.dev* ]]; then echo $(call increase_dev,$(devver)) ; else echo $(call add_dev,$(call increase_patch, $(devver))); fi ; fi \ > VERSION.dev VERSION: version 
:= $(shell git tag | sort -t. -k 1,1nr -k 2,2nr -k 3,3nr -k 4,4nr | head -n 1) diff --git a/README.md b/README.md index 1cd1070a..a2e57601 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,20 @@ ## MultiMedia Interchange Format -[MMIF](https://mmif.clams.ai) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications in [CLAMS project](https://clams.ai). + +[MMIF](https://mmif.clams.ai) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the [CLAMS project](https://clams.ai). ## mmif-python -`mmif-python` is a Python implementation of the MMIF data format. -`mmif-python` provides various helper classes and functions to handle MMIF JSON in Python, -including ; -1. de-/serialization of MMIF internal data structures to/from JSON +`mmif-python` is a Python implementation of the MMIF data format. It provides various helper classes and functions to handle MMIF JSON in Python, including: + +1. serialization and de-serialization of MMIF internal data structures to/from JSON 2. validation of MMIF JSON 3. handling of CLAMS vocabulary types -4. navigation of MMIF object via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type))`) +4. navigation of MMIF objects via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type)`) ## For more ... + * [Version history and patch notes](https://github.com/clamsproject/mmif-python/blob/main/CHANGELOG.md) -* [MMIF Python API documentation](https://clamsproject.github.io/mmif-python) +* [MMIF Python API documentation](https://clamsproject.github.io/mmif-python/latest) * [MMIF JSON specification and schema](https://clamsproject.github.io/mmif) - -## For devs ... 
-* Build documentation: `python build-tools/docs.py --help` +* [Contributing guide](CONTRIBUTING.md) diff --git a/build-tools/docs.py b/build-tools/docs.py index ee5d4550..eaea47e3 100644 --- a/build-tools/docs.py +++ b/build-tools/docs.py @@ -40,6 +40,19 @@ def run_sphinx_build(self, *args, cwd=None, check=True): return run_command([self.sphinx_build, *args], cwd=cwd, check=check) +def get_dummy_version(): + """Returns a dummy version based on current git branch and dirty status. + Falls back to 'unknown' if not in a git repository.""" + try: + branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], + stderr=subprocess.DEVNULL, text=True).strip() + dirty = subprocess.run(["git", "diff", "--quiet"], + stderr=subprocess.DEVNULL, check=False).returncode != 0 + return f"{branch}{'+dirty' if dirty else ''}" + except (subprocess.CalledProcessError, FileNotFoundError): + return "unknown" + + def build_docs_local(source_dir: Path, output_dir: Path): """ Builds documentation for the provided source directory. @@ -47,6 +60,18 @@ def build_docs_local(source_dir: Path, output_dir: Path): """ print("--- Running in Local Build Mode ---") + # Warning for user as VERSION file is critical + if sys.stdin.isatty(): + import select + print("\nWARNING: The 'VERSION' file will be overwritten with a dummy version for this local build.") + print("Pausing for 3 seconds (press Enter to continue immediately)...") + select.select([sys.stdin], [], [], 3) + + # Overwrite VERSION file with dummy version for local builds + version = get_dummy_version() + print(f"Generating dummy VERSION for local build: {version}") + (source_dir / "VERSION").write_text(version) + # 1. Generate source code and install in editable mode. 
print("\n--- Step 1: Generating source code and installing in editable mode ---") try: diff --git a/build-tools/requirements.docs.txt b/build-tools/requirements.docs.txt index 8d9ee33d..db2d03d8 100644 --- a/build-tools/requirements.docs.txt +++ b/build-tools/requirements.docs.txt @@ -1,3 +1,4 @@ -sphinx>=7.0,<8.0 +sphinx furo m2r2 +autodoc-pydantic diff --git a/documentation/autodoc/mmif.serialize.rst b/documentation/autodoc/mmif.serialize.rst deleted file mode 100644 index e58e0c24..00000000 --- a/documentation/autodoc/mmif.serialize.rst +++ /dev/null @@ -1,37 +0,0 @@ -mmif.serialize package -====================== - -Core package to provide serialization and deserialization of MMIF format. - -``model`` module ---------------------------- - -.. automodule:: mmif.serialize.model - :members: - :undoc-members: - :show-inheritance: - -``mmif`` module --------------------------- - -.. automodule:: mmif.serialize.mmif - :members: - :undoc-members: - :show-inheritance: - -``view`` module --------------------------- - -.. automodule:: mmif.serialize.view - :members: - :undoc-members: - :show-inheritance: - -``annotation`` module --------------------------------- - -.. automodule:: mmif.serialize.annotation - :members: - :undoc-members: - :show-inheritance: - diff --git a/documentation/autodoc/mmif.utils.rst b/documentation/autodoc/mmif.utils.rst deleted file mode 100644 index 8bd90cfd..00000000 --- a/documentation/autodoc/mmif.utils.rst +++ /dev/null @@ -1,49 +0,0 @@ -mmif.utils package -================== - -Package containing utility modules for handling different types of source -documents, and general implementation of common data structures and -algorithms. - -Submodules ----------- - -``video_document_helper`` module --------------------------------- - -.. automodule:: mmif.utils.video_document_helper - :members: - :undoc-members: - :show-inheritance: - -``text_document_helper`` module -------------------------------- - -.. 
automodule:: mmif.utils.text_document_helper - :members: - :undoc-members: - :show-inheritance: - -``timeunit_helper`` module -------------------------------- - -.. automodule:: mmif.utils.timeunit_helper - :members: - :undoc-members: - :show-inheritance: - -``sequence_helper`` module --------------------------- - -.. automodule:: mmif.utils.sequence_helper - :members: - :undoc-members: - :show-inheritance: - -``workflow_helper`` module --------------------------- - -.. automodule:: mmif.utils.workflow_helper - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/documentation/autodoc/mmif.vocabulary.rst b/documentation/autodoc/mmif.vocabulary.rst deleted file mode 100644 index 0eb985b5..00000000 --- a/documentation/autodoc/mmif.vocabulary.rst +++ /dev/null @@ -1,28 +0,0 @@ -mmif.vocabulary package -======================= - -Package contains Enum-like classes for CLAMS vocabulary. - -.. autoclass:: mmif.vocabulary.ThingTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.ThingType - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: mmif.vocabulary.ClamsTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.AnnotationTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.DocumentTypesBase - :show-inheritance: - -.. autoclass:: mmif.vocabulary.AnnotationTypes - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: mmif.vocabulary.DocumentTypes - :members: - :undoc-members: - :show-inheritance: diff --git a/documentation/autodoc/mmif_docloc_http.rst b/documentation/autodoc/mmif_docloc_http.rst deleted file mode 100644 index b76c8df5..00000000 --- a/documentation/autodoc/mmif_docloc_http.rst +++ /dev/null @@ -1,11 +0,0 @@ -mmif_docloc_http package -======================== - -MMIF document location helper module for `http` and `https` schemes. -If you want to write your own docloc scheme handler, please use the source code of this module as a reference. 
-See this :ref:`plug-in section ` for more information. - -.. automodule:: mmif_docloc_http - :members: - :undoc-members: - :show-inheritance: diff --git a/documentation/cli.rst b/documentation/cli.rst index 8a2f6836..481df49d 100644 --- a/documentation/cli.rst +++ b/documentation/cli.rst @@ -1,16 +1,16 @@ .. _cli: -``mmif`` shell command -====================== +The ``mmif`` shell command +========================== ``mmif-python`` comes with a command line interface (CLI) that allows you to handle MMIF files. Many of these commands are designed to handle MMIF files in the context of CLAMS workflows. -The CLI is installed as ``mmif`` shell command. To see the available commands, run +The CLI scripts are installed as subcommands of the ``mmif`` shell command. Run the following to see the available commands or the MMIF version: -.. code-block:: bash +.. include:: cli_help.rst - mmif --help +Please take a look at the individual command documentation for more details on each command: -The following documentation is automatically generated from the CLI help messages. +.. code-block:: text -.. include:: cli_help.rst + $ mmif --help diff --git a/documentation/conf.py b/documentation/conf.py index 2b8a027d..81b989ba 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -6,8 +6,12 @@ import inspect import textwrap import os +import re import sys from pathlib import Path +from sphinx.util import logging + +logger = logging.getLogger(__name__) # -- Path setup -------------------------------------------------------------- # Add project root to sys.path so that autodoc can find the mmif package. 
@@ -17,19 +21,33 @@ # At this point, `pip install -e .` should have been run, so mmif is importable import mmif +# apidoc settings +apidoc_package_names = ['mmif', 'mmif_docloc_http'] +apidoc_exclude_paths = [ + proj_root_dir / 'mmif' / 'res', + proj_root_dir / 'mmif' / 'ver', +] +# this is used by sphinx.ext.autodoc +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'show-inheritance': True, +} +autodoc_member_order = 'bysource' + + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'mmif-python' blob_base_url = f'https://github.com/clamsproject/{project}/blob' -copyright = f'{datetime.date.today().year}, Brandeis LLC' author = 'Brandeis LLC' +copyright = f'{datetime.date.today().year}, {author}' try: version = open(proj_root_dir / 'VERSION').read().strip() except FileNotFoundError: - print("WARNING: VERSION file not found, using 'dev' as version.") + logger.warning("VERSION file not found, using 'dev' as version.") version = 'dev' -release = version # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -38,8 +56,16 @@ 'sphinx.ext.autodoc', 'sphinx.ext.linkcode', 'm2r2', + 'sphinxcontrib.autodoc_pydantic', ] +autodoc_pydantic_model_show_json = True +autodoc_pydantic_model_show_field_summary = True +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_field_list_validators = False + templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # dynamically generated files @@ -64,7 +90,6 @@ "source_repository": "https://github.com/clamsproject/mmif-python", "source_branch": "main", # Default branch for "Edit on GitHub" links "source_directory": 
"documentation/", - # CLAMS brand colors "light_css_variables": { "color-brand-primary": "#008AFF", @@ -142,7 +167,7 @@ def update_target_versions(app): return # Insert new version - print(f"Updating target-versions.csv: {current_ver} -> {spec_ver}") + logger.info(f"Updating target-versions.csv: {current_ver} -> {spec_ver}") lines.insert(1, f'{current_ver},"{spec_ver}"\n') with open(csv_path, 'w') as f: @@ -150,53 +175,45 @@ def update_target_versions(app): def generate_cli_rst(app): - from mmif import prep_argparser_and_subcmds, find_all_modules + from mmif import prep_argparser_and_subcmds # Generate main help os.environ['COLUMNS'] = '100' - parser, subparsers = prep_argparser_and_subcmds() + parser, _, _ = prep_argparser_and_subcmds() help_text = parser.format_help() content = [] - content.append('Main Command\n') - content.append('------------\n\n') content.append('.. code-block:: text\n\n') + content.append(' $ mmif --help\n') content.append(textwrap.indent(help_text, ' ')) content.append('\n\n') - # Generate subcommand help - for cli_module in find_all_modules('mmif.utils.cli'): - cli_module_name = cli_module.__name__.rsplit('.')[-1] - subparser = cli_module.prep_argparser(prog=f'mmif {cli_module_name}') - sub_help = subparser.format_help() - - content.append(f'{cli_module_name}\n') - content.append('-' * len(cli_module_name) + '\n\n') - content.append('.. code-block:: text\n\n') - content.append(textwrap.indent(sub_help, ' ')) - content.append('\n\n') + # No longer generate subcommand help with open(proj_root_dir / 'documentation' / 'cli_help.rst', 'w') as f: f.write(''.join(content)) def generate_whatsnew_rst(app): + """ + Create the documentation/whatsnew.md file by pulling out the changes for the + current version from the changelog file. 
+ """ + changelog_path = proj_root_dir / 'CHANGELOG.md' output_path = proj_root_dir / 'documentation' / 'whatsnew.md' if not changelog_path.exists(): - print(f"WARNING: CHANGELOG.md not found at {changelog_path}") + logger.warning(f"CHANGELOG.md not found at {changelog_path}") with open(output_path, 'w') as f: f.write("") return - import re - content = [] found_version = False version_header_re = re.compile(r'^## releasing\s+([^\s]+)\s*(\(.*\))?') - print(f"DEBUG: Looking for version '{version}' in CHANGELOG.md") + logger.debug(f"Looking for version '{version}' in CHANGELOG.md") with open(changelog_path, 'r') as f: lines = f.readlines() @@ -216,9 +233,9 @@ def generate_whatsnew_rst(app): content.append(line) if not found_version: - print(f"NOTE: No changelog entry found for version {version}") + logger.info(f"No changelog entry found for version {version}") with open(output_path, 'w') as f: - f.write("") + f.write(f"### nothing new in {version}\nDid you locally build for testing?") else: # Dump matched markdown content directly to whatsnew.md with open(output_path, 'w') as f: @@ -226,10 +243,44 @@ def generate_whatsnew_rst(app): f.writelines(content) +def run_apidoc(app): + """ + Run sphinx-apidoc to auto-generate RST files for all modules. + This ensures new modules are automatically documented without manual updates. + """ + from sphinx.ext.apidoc import main as apidoc_main + + docs_dir = Path(__file__).parent + output_dir = docs_dir / 'autodoc' + + exclude_paths = map(str, apidoc_exclude_paths) + + # Run sphinx-apidoc for each package specified in package_names + # apidoc_main() accepts argv-style arguments (without the program name) + for package_name in apidoc_package_names: + package_dir = proj_root_dir / package_name + if not package_dir.exists(): + logger.warning(f"Package directory {package_dir} does not exist. 
" + f"Skipping apidoc for {package_name}.") + continue + + args = [ + '-o', str(output_dir), + str(package_dir), + *exclude_paths, + '--force', # Overwrite existing files + '--module-first', # Put module docs before submodule docs + '--no-toc', # Don't create modules.rst, will be overwriting each other's + ] + logger.info(f"Running sphinx-apidoc with args: {args}") + apidoc_main(args) + + def setup(app): try: + app.connect('builder-inited', run_apidoc) app.connect('builder-inited', update_target_versions) app.connect('builder-inited', generate_cli_rst) app.connect('builder-inited', generate_whatsnew_rst) except ImportError: - print("WARNING: 'mmif' package not found. Skipping dynamic generation of parts of documentation.") + logger.warning("'mmif' package not found. Skipping dynamic generation of parts of documentation.") diff --git a/documentation/index.rst b/documentation/index.rst index ddbf0691..05d93a48 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -1,7 +1,7 @@ -Welcome to mmif-python's documentation! -======================================= +MMIF Python SDK +=============== -.. mdinclude:: ../README.md +This is the documentation for the mmif-python package, a Python implementation for the MultiMedia Interchange Format (MMIF). MMIF is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the CLAMS project. For descriptions of the CLAMS project and the MMIF format see https://clams.ai and https://mmif.clams.ai. The GitHub repository for the package is at https://github.com/clamsproject/mmif-python. ---- @@ -15,19 +15,20 @@ Welcome to mmif-python's documentation! introduction cli + summarizer plugins target-versions .. 
toctree:: :maxdepth: 2 - :caption: API documentation: + :caption: API Documentation - modules + autodoc/mmif + autodoc/mmif_docloc_http -Indices and tables -================== +Indices +------- * :ref:`genindex` * :ref:`modindex` -* :ref:`search` diff --git a/documentation/introduction.rst b/documentation/introduction.rst index 95508f3c..c9f63e98 100644 --- a/documentation/introduction.rst +++ b/documentation/introduction.rst @@ -11,10 +11,12 @@ MultiMedia Interchange Format (MMIF) is a JSON(-LD)-based data format designed f This documentation focuses on Python implementation of the MMIF. To learn more about the data format specification, please visit the `MMIF website `_. ``mmif-python`` is a public, open source implementation of the MMIF data format. ``mmif-python`` supports serialization/deserialization of MMIF objects from/to Python objects, as well as many navigation and manipulation helpers for MMIF objects. + Prerequisites ------------- -* `Python `_: the latest ``mmif-python`` requires Python 3.8 or newer. We have no plan to support `Python 2.7 `_. +* `Python `_: the latest ``mmif-python`` requires Python 3.10 or newer. + Installation --------------- @@ -25,20 +27,20 @@ Package ``mmif-python`` is distributed via the official PyPI. Users are supposed pip install mmif-python -This will install a package `mmif` to local python. +This will install a package `mmif` to your local python library. The MMIF format and specification is evolving over time, and ``mmif-python`` package will be updated along with the changes in MMIF format. -.. note:: MMIF format is not always backward-compatible. To find out more about relations between MMIF specification versions and ``mmif-python`` versions, please take time to read our decision on the subject `here `_. If you need to know which python SDK supports which specification version, see :ref:`target-versions` page. +.. note:: The MMIF format is not always backward-compatible. 
To find out more about relations between MMIF specification versions and ``mmif-python`` versions, please take time to read our decision on the subject `here `_. If you need to know which python SDK supports which specification version, see :ref:`target-versions` page. + MMIF Serialization --------------------------- -:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. For subcomponents of the MMIF (view objects, annotation objects, metadata for each object) are all subclass of :class:`mmif.serialize.model.MmifObject`, including the :class:`mmif.serialize.mmif.Mmif`. To start with an existing MMIF :class:`str`, simple initiate a new ``Mmif`` object with the file. +:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. Subcomponents of the MMIF object (views, annotation objects and metadata for each object) and the MMIF object itself are all subclasses of :class:`mmif.serialize.model.MmifObject`. To start with an existing MMIF :class:`str`, simply initiate a new ``Mmif`` object with that string. .. code-block:: python - import mmif from mmif import Mmif mmif_str = """{ @@ -64,13 +66,14 @@ MMIF Serialization } ], "views": []}""" + mmif_obj = Mmif(mmif_str) Few notes; -#. MMIF does not carry the primary source files in it. -#. MMIF encode the specification version at the top. As not all MMIF versions are backward-compatible, a version ``mmif-python`` implementation of the MMIF might not be able to load an unsupported version of MMIF string. +#. MMIF objects do not carry the primary source files in it (although there are exceptions for text documents). +#. MMIF objects specify the MMIF version at the top. As not all MMIF versions are backward-compatible, a version of the ``mmif-python`` implementation might not be able to load an unsupported MMIF versions. When serializing back to :class:`str`, call :meth:`mmif.serialize.model.MmifObject.serialize` on the object. 
@@ -81,11 +84,12 @@ To get subcomponents, you can use various getters implemented in subclasses. For from mmif.vocabulary.document_types import DocumentTypes for video in mmif_obj.Mmif.get_documents_by_type(DocumentTypes.VideoDocument): - with open(video.location_path(), 'b') as in_video: - # do something with the video file + with open(video.location_path(), 'b') as in_video: + # do something with the video file + +For a full list of available helper methods, please refer to the API documentation pages (See left sidebar). -For a full list of available helper methods, please refer to :ref:`the API documentation `. MMIF usage in CLAMS Workflows ----------------------------- diff --git a/documentation/modules.rst b/documentation/modules.rst deleted file mode 100644 index 4bb9307d..00000000 --- a/documentation/modules.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _apidoc: - -mmif package -============ - -.. toctree:: - :maxdepth: 4 - - autodoc/mmif.serialize - autodoc/mmif.vocabulary - autodoc/mmif.utils - -mmif_docloc_http package -======================== - -.. toctree:: - :maxdepth: 2 - - autodoc/mmif_docloc_http - diff --git a/documentation/plugins.rst b/documentation/plugins.rst index 1af39426..50af8876 100644 --- a/documentation/plugins.rst +++ b/documentation/plugins.rst @@ -1,8 +1,7 @@ .. _plugins: -Developing plugins for MMIF Python SDK -====================================== - +Developing plugins for the MMIF Python SDK +========================================== Overview -------- @@ -50,7 +49,7 @@ Here's a minimal example codebase that you refer to when you develop a ``docloc` β”œβ”€β”€ pyproject.toml └── setup.cfg - $ cat pyproject.toml + $ cat pyproject.toml [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" @@ -80,10 +79,41 @@ And the plugin code. 
def help(): return "location format: `.video`" +Built-in Document Location Scheme Plugins +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +At the moment, the ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. This plugin implements caching as described above, so repeated access to the same URL will not trigger multiple downloads. +Take a look at the :mod:`mmif_docloc_http` module for details. +Caching for Remote File Access +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Bulit-in Document Location Scheme Plugins -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +When developing plugins that resolve remote document locations (e.g., ``http``, ``s3``, or custom schemes), it is highly recommended to implement caching to avoid repeated network requests or file downloads. Since ``mmif-python`` may call the ``resolve`` function multiple times for the same document location during processing, caching can significantly improve performance. + +A simple and effective approach is to use a module-level dictionary as a cache. Because Python modules are singletons (loaded once and cached in ``sys.modules``), this cache persists for the entire lifetime of the Python process, across multiple MMIF files and Document objects. + +Here's an example of how to implement caching in a plugin: + +.. code-block:: python + + # mmif_docloc_myscheme/__init__.py + + _cache = {} + + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] + + # ... your resolution logic here ... + resolved_path = do_actual_resolution(docloc) + + _cache[docloc] = resolved_path + return resolved_path + +This pattern ensures that: + +* The first call to ``resolve`` performs the actual resolution (download, API call, etc.) 
+* Subsequent calls for the same location return the cached result immediately +* The cache is shared across all MMIF objects processed within the same Python process -At the moment, ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. -Take a look at :mod:`mmif_docloc_http` module for details. +See :mod:`mmif_docloc_http` for a concrete example of this caching strategy in action. diff --git a/documentation/summarizer.rst b/documentation/summarizer.rst new file mode 100644 index 00000000..eaef8970 --- /dev/null +++ b/documentation/summarizer.rst @@ -0,0 +1,32 @@ +.. _summarizer: + + +MMIF Summarizer +=============== + +The Summarizer is a MMIF consumer that creates a JSON summary from a MMIF file. It +makes some simplifying assumptions, including: + +- There is one video in the MMIF documents list. All start and end properties + are pointing to that video. +- The time unit is assumed to be milliseconds. + + +The summarizer is accessible via the ``mmif`` command line script. To run the +summarizer over a MMIF file and write the JSON summary to OUTFILE: + +.. code-block:: bash + + mmif summarize -i INFILE -o OUTFILE + +In all cases, the summarizer summarizes only the information that is there, it +does not fix any mistakes and in general it does not add any information that is +not explicitly or implicitly in the MMIF file. In rare cases some information is +added, for example if an ASR tool does not group tokens in sentence-like objects +then the summarizer will do that, but then only by creating token groups of the +same length. + +The summary includes the MMIF version, the list of documents, a summary of the +metadata of all views (identifier, CLAMS app, timestamp, total number of +annotations and number of annotations per type, it does not show parameters and +application configuration), time frames, transcript, captions and entities. 
\ No newline at end of file diff --git a/documentation/target-versions.rst b/documentation/target-versions.rst index 34216d3e..9d89717d 100644 --- a/documentation/target-versions.rst +++ b/documentation/target-versions.rst @@ -3,7 +3,7 @@ Target MMIF Versions ==================== -This article provides targeting MMIF specification versions of different versions of ``mmif-python`` SDK. +This page lists targeting MMIF specification versions for different versions of the ``mmif-python`` SDK. .. csv-table:: Target Specification Versions :file: target-versions.csv diff --git a/mmif/__init__.py b/mmif/__init__.py index 6fde82fe..513ecd86 100644 --- a/mmif/__init__.py +++ b/mmif/__init__.py @@ -34,28 +34,28 @@ def find_all_modules(pkgname): def prep_argparser_and_subcmds(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(prog='mmif') parser.add_argument( '-v', '--version', action='version', version=version_template.format(__version__, __specver__) ) subparsers = parser.add_subparsers(title='sub-command', dest='subcmd') - return parser, subparsers - - -def cli(): - parser, subparsers = prep_argparser_and_subcmds() - cli_modules = {} + subcmds = {} for cli_module in find_all_modules('mmif.utils.cli'): cli_module_name = cli_module.__name__.rsplit('.')[-1] - cli_modules[cli_module_name] = cli_module + subcmds[cli_module_name] = cli_module subcmd_parser = cli_module.prep_argparser(add_help=False) subparsers.add_parser(cli_module_name, parents=[subcmd_parser], help=cli_module.describe_argparser()[0], description=cli_module.describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, ) + return parser, subparsers, subcmds + + +def cli(): + parser, subparsers, cli_modules = prep_argparser_and_subcmds() if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) diff --git a/mmif/serialize/__init__.py b/mmif/serialize/__init__.py index 18523bac..06964253 100644 --- a/mmif/serialize/__init__.py +++ b/mmif/serialize/__init__.py @@ -1,3 
+1,7 @@ +""" +Core package to provide serialization and deserialization of MMIF format. +""" + from .annotation import * from .annotation import __all__ as anno_all from .mmif import * diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 6527f482..b7f002cd 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -374,7 +374,7 @@ def add_property(self, name: str, With the former method, the SDK will record the added property as a `Annotation` annotation object, separate from the original `Document` - object. See :meth:`.Mmif.generate_capital_annotations()` for more. + object. See :meth:`mmif.serialize.mmif.Mmif.generate_capital_annotations` for more. A few notes to keep in mind: @@ -442,7 +442,7 @@ def get(self, prop_name, default=None): See Also -------- add_property : Add a new property to the document - Mmif.generate_capital_annotations : How pending properties are serialized + mmif.serialize.mmif.Mmif.generate_capital_annotations : How pending properties are serialized """ if prop_name == 'id': # because all three dicts have `id` key as required field, we need diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 9e94496d..245c96aa 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -14,8 +14,8 @@ import math import warnings from collections import defaultdict -from datetime import datetime -from typing import List, Union, Optional, Dict, cast, Iterator +from datetime import datetime, timezone +from typing import Any, List, Union, Optional, Dict, cast, Iterator import jsonschema.validators @@ -24,7 +24,7 @@ from mmif.serialize.annotation import Annotation, Document from mmif.serialize.model import MmifObject, DataList from mmif.serialize.view import View -from mmif.vocabulary import AnnotationTypes, DocumentTypes +from mmif.vocabulary import AnnotationTypes, DocumentTypesBase __all__ = ['Mmif'] @@ -433,7 +433,7 @@ def new_view(self) -> View: """ new_view = View() new_view.id = 
self.new_view_id() - new_view.metadata.timestamp = datetime.now() + new_view.metadata.timestamp = datetime.now(timezone.utc) self.add_view(new_view) return new_view @@ -487,11 +487,11 @@ def get_documents_in_view(self, vid: Optional[str] = None) -> List[Document]: else: return [] - def get_documents_by_type(self, doc_type: Union[str, DocumentTypes]) -> List[Document]: + def get_documents_by_type(self, doc_type: DocumentTypesBase) -> List[Document]: """ Method to get all documents where the type matches a particular document type, which should be one of the CLAMS document types. - :param doc_type: the type of documents to search for, must be one of ``Document`` type defined in the CLAMS vocabulary. + :param doc_type: the type of documents to search for, must be one of ``Document`` types defined in the CLAMS vocabulary. :return: a list of documents matching the requested type, or an empty list if none found. """ docs = [] @@ -530,7 +530,7 @@ def get_documents_by_property(self, prop_key: str, prop_value: str) -> List[Docu docs.extend([document for document in self.documents if document[prop_key] == prop_value]) return docs - def get_documents_locations(self, m_type: Union[DocumentTypes, str], path_only=False) -> List[Union[str, None]]: + def get_documents_locations(self, m_type: Union[DocumentTypesBase, str], path_only=False) -> List[Union[str, None]]: """ This method returns the file paths of documents of given type. Only top-level documents have locations, so we only check them. @@ -545,7 +545,7 @@ def get_documents_locations(self, m_type: Union[DocumentTypes, str], path_only=F else: return [doc.location for doc in docs] - def get_document_location(self, m_type: Union[DocumentTypes, str], path_only=False) -> Optional[str]: + def get_document_location(self, m_type: Union[DocumentTypesBase, str], path_only=False) -> Optional[str]: """ Method to get the location of *first* document of given type. 
diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 1bec7b29..95fdc28c 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -402,7 +402,10 @@ def default(self, obj: 'MmifObject'): if hasattr(obj, '_serialize'): return obj._serialize() elif hasattr(obj, 'isoformat'): # for datetime objects - return obj.isoformat() + s = obj.isoformat() + if s.endswith('+00:00'): + s = s[:-6] + 'Z' + return s elif hasattr(obj, '__str__'): return str(obj) else: diff --git a/mmif/utils/__init__.py b/mmif/utils/__init__.py index e69de29b..fe8aea5d 100644 --- a/mmif/utils/__init__.py +++ b/mmif/utils/__init__.py @@ -0,0 +1,4 @@ +""" +Package containing utility modules for handling different types of source +documents, and general implementation of common data structures and algorithms. +""" diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 24855994..f24248f2 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -1,4 +1,226 @@ -from mmif.utils.cli import describe +""" +Package containing CLI modules. +""" + +import contextlib +import io +import os +import sys +from typing import Iterator, Optional, TextIO, Type, Union, cast, get_args, get_origin + +from pydantic import BaseModel + + +@contextlib.contextmanager +def open_cli_io_arg( + path_or_dash: Optional[str], + mode: str = "r", + encoding: Optional[str] = None, + errors: Optional[str] = None, + default_stdin: bool = False, +) -> Iterator[TextIO]: + """ + Context manager for opening files with stdin/stdout support. + + This function is intended for plain text streams (e.g. JSON/MMIF) and does + not support binary modes (e.g., 'rb', 'wb'). + + This is a native replacement for argparse.FileType which is deprecated as + of Python 3.14 due to resource leak issues. Unlike FileType, this defers + file opening until actually needed and ensures proper cleanup via context + manager. 
+ + Handles the common CLI pattern where: + + - '-' means stdin (read mode) or stdout (write mode) + - None means "argument not provided"; when default_stdin=True, it falls back + to stdin/stdout + - Regular paths open actual files with proper resource management + + :param path_or_dash: File path, '-' for stdin/stdout, or None for no argument + :param mode: File mode ('r' for reading, 'w' for writing). Binary modes are + not supported. + :param encoding: Optional file encoding + :param errors: Optional error handling strategy for encoding + :param default_stdin: If True and path_or_dash is None, default to stdin + (mode 'r') or stdout (mode 'w') + :returns: Context manager yielding text-mode file handle + :rtype: Iterator[TextIO] + + Example usage:: + + # Read from file or stdin + with open_cli_io_arg(args.input, 'r', default_stdin=True) as f: + content = f.read() + + # Write to file or stdout + with open_cli_io_arg(args.output, 'w', default_stdin=True) as f: + f.write(content) + """ + # Valid text modes for file operations + _READ_FLAGS = frozenset({"r", "+"}) + _WRITE_FLAGS = frozenset({"w", "a", "x", "+"}) + + if "b" in mode: + raise ValueError( + f"Binary mode '{mode}' is not supported. " + "Use text modes ('r', 'w', 'a', 'x') instead." 
+ ) + + needs_read = bool(set(mode) & _READ_FLAGS) + needs_write = bool(set(mode) & _WRITE_FLAGS) + + should_use_stdio = path_or_dash == "-" or (path_or_dash is None and default_stdin) + + file_handle: Optional[TextIO] = None + should_close = False + + try: + if should_use_stdio: + if needs_read and needs_write: + raise ValueError( + f"Mode '{mode}' not supported with stdin/stdout " + "(use read or write only)" + ) + + if needs_read: + # Check for missing input when stdin is a terminal + if path_or_dash is None and default_stdin and sys.stdin.isatty(): + raise SystemExit("error: No input provided.") + file_handle = sys.stdin + + elif needs_write: + file_handle = sys.stdout + + else: + raise ValueError( + f"Mode '{mode}' not supported with stdin/stdout (use 'r' or 'w')" + ) + + elif isinstance(path_or_dash, str): + if needs_read and not os.path.exists(path_or_dash): + raise FileNotFoundError(f"Input path does not exist: {path_or_dash}") + file_handle = cast( + TextIO, io.open(path_or_dash, mode, encoding=encoding, errors=errors) + ) + should_close = True + + elif path_or_dash is None: + # None without default_stdin means no file specified + raise ValueError( + "No file path provided. Use '-' for stdin/stdout or set default_stdin=True." + ) + else: + raise TypeError( + f"Invalid type for path_or_dash: {type(path_or_dash).__name__}. " + "Expected str or None." 
+ ) + + if file_handle is not None: + yield file_handle + + finally: + if should_close and file_handle is not None: + file_handle.close() + + +def generate_model_summary(model: Type[BaseModel], indent: int = 0) -> str: + lines = [] + prefix = " " * indent + + # model_fields is a dictionary of FieldInfo objects + for name, field in model.model_fields.items(): + # Get the alias if available, otherwise use the field name + field_name = field.alias if field.alias else name + + # Get type annotation + type_annotation = field.annotation + + def format_type(t) -> str: + origin = get_origin(t) + args = get_args(t) + + # Handle Optional (Union[T, None]) + if origin is Union and type(None) in args: + non_none_args = [arg for arg in args if arg is not type(None)] + if len(non_none_args) == 1: + return f"{format_type(non_none_args[0])}, optional" + + # Handle List + if origin is list: + if args: + return f"[{format_type(args[0])}]" + return "[]" + + # Handle Dict + if origin is dict: + return "obj" + + # Handle Pydantic Models (Custom Classes) + if isinstance(t, type) and issubclass(t, BaseModel): + return "obj" + + # Handle basic types and cleanup + t_str = str(t) + if t_str.startswith(" 1 + and isinstance(args[1], type) + and issubclass(args[1], BaseModel) + ): + nested_model = args[1] + + if nested_model: + lines.append(generate_model_summary(nested_model, indent + 4)) + + return "\n".join(lines) + + +# keep imports of CLI modules for historical reasons +# keep them here in the bottom to avoid circular imports from mmif.utils.cli import rewind from mmif.utils.cli import source - diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index eaf35856..b8c79ced 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -3,12 +3,18 @@ import sys import textwrap from pathlib import Path -from typing import Union +from typing import Union, cast + +from mmif.utils.cli import open_cli_io_arg, generate_model_summary -from mmif.utils.workflow_helper 
import generate_workflow_identifier, describe_single_mmif, \ - describe_mmif_collection # gen_param_hash is imported for backward compatibility -from mmif.utils.workflow_helper import generate_param_hash +from mmif.utils.workflow_helper import ( + CollectionMmifDesc, + SingleMmifDesc, + describe_mmif_collection, + describe_single_mmif, + generate_workflow_identifier, +) def get_pipeline_specs(mmif_file: Union[str, Path]): @@ -22,41 +28,30 @@ def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: import warnings warnings.warn("generate_pipeline_identifier is deprecated, use generate_workflow_identifier instead", DeprecationWarning) - return generate_workflow_identifier(mmif_file) + return cast(str, generate_workflow_identifier(mmif_file)) def describe_argparser(): - """ - Returns two strings: one-line description of the argparser, and - additional material, which will be shown in `clams --help` and - `clams --help`, respectively. - """ oneliner = ( - 'provides CLI to describe the workflow specification from a MMIF ' - 'file or a collection of MMIF files.' + 'Describe the workflow specification from a MMIF file or a ' + 'collection of MMIF files.' ) - # get and clean docstrings - single_doc = describe_single_mmif.__doc__.split(':param')[0] - single_doc = textwrap.dedent(single_doc).strip() - collection_doc = describe_mmif_collection.__doc__.split(':param')[0] - collection_doc = textwrap.dedent(collection_doc).strip() - additional = textwrap.dedent(f""" This command extracts workflow information from a single MMIF file or - summarizes a directory of MMIF files. + a directory of MMIF files. The output is serialized as JSON. + + Output Schemas: + + 1. Single MMIF File (mmif-file): +{generate_model_summary(SingleMmifDesc, indent=4)} + + 2. 
MMIF Collection (mmif-dir): +{generate_model_summary(CollectionMmifDesc, indent=4)} - ========================== - For a single MMIF file - ========================== - {single_doc} - - =============================== - For a directory of MMIF files - =============================== - {collection_doc} + Use `--help-schema` to inspect the full JSON schema for a specific output type. """) - return oneliner, oneliner + '\n\n' + additional.strip() + return oneliner, additional def prep_argparser(**kwargs): @@ -65,17 +60,17 @@ def prep_argparser(**kwargs): formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs ) + parser.add_argument( "MMIF_FILE", nargs="?", type=str, - default=None if sys.stdin.isatty() else sys.stdin, + default=None, help='input MMIF file, a directory of MMIF files, or STDIN if `-` or not provided.' ) parser.add_argument( "-o", "--output", - type=argparse.FileType("w"), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.' ) parser.add_argument( @@ -83,33 +78,43 @@ def prep_argparser(**kwargs): action="store_true", help="Pretty-print JSON output" ) + parser.add_argument( + "--help-schema", + nargs=1, + choices=["mmif-file", "mmif-dir"], + metavar="SCHEMA_NAME", + help="Print the JSON schema for the output. Options: mmif-file, mmif-dir." + ) return parser def main(args): """ - Main entry point for the describe CLI command. - - Reads a MMIF file and outputs a JSON summary containing: - - workflow_id: unique identifier for the source and app sequence - - stats: view counts, annotation counts (total/per-view/per-type), - and lists of error/warning/empty view IDs - - views: map of view IDs to app configurations and profiling data - - :param args: Parsed command-line arguments + Main block for the describe CLI command. + This function basically works as a wrapper around + :func:`describe_single_mmif` (for single file input) or + :func:`describe_mmif_collection` (for directory input). 
""" + if hasattr(args, 'help_schema') and args.help_schema is not None: + schema_name = args.help_schema[0] + if schema_name == 'mmif-file': + model_cls = SingleMmifDesc + elif schema_name == 'mmif-dir': + model_cls = CollectionMmifDesc + + schema = model_cls.model_json_schema() + print(json.dumps(schema, indent=2)) + sys.exit(0) + output = {} # if input is a directory - if isinstance(args.MMIF_FILE, str) and Path(args.MMIF_FILE).is_dir(): + if Path(str(args.MMIF_FILE)).is_dir(): output = describe_mmif_collection(args.MMIF_FILE) # if input is a file or stdin else: # Read MMIF content - if hasattr(args.MMIF_FILE, 'read'): - mmif_content = args.MMIF_FILE.read() - else: - with open(args.MMIF_FILE, 'r') as f: - mmif_content = f.read() + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_content = input_file.read() # For file input, we need to handle the path # If input is from stdin, create a temp file @@ -127,11 +132,10 @@ def main(args): tmp_path.unlink() if output: - if args.pretty: - json.dump(output, args.output, indent=2) - else: - json.dump(output, args.output) - args.output.write('\n') + # Convert Pydantic models to dicts + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + json.dump(output, output_file, indent=2 if args.pretty else None) + output_file.write('\n') if __name__ == "__main__": diff --git a/mmif/utils/cli/rewind.py b/mmif/utils/cli/rewind.py index 1e038180..8dccc63f 100644 --- a/mmif/utils/cli/rewind.py +++ b/mmif/utils/cli/rewind.py @@ -3,6 +3,7 @@ import textwrap import mmif +from mmif.utils.cli import open_cli_io_arg from mmif.utils.workflow_helper import group_views_by_app @@ -55,10 +56,6 @@ def rewind_mmif(mmif_obj: mmif.Mmif, choice: int, choice_is_viewnum: bool = True def describe_argparser(): - """ - returns two strings: one-line description of the argparser, and addition material, - which will be shown in `clams --help` and `clams --help`, respectively. 
- """ oneliner = 'provides CLI to rewind a MMIF from a CLAMS workflow.' additional = textwrap.dedent(""" MMIF rewinder rewinds a MMIF by deleting the last N views. @@ -70,12 +67,10 @@ def prep_argparser(**kwargs): parser = argparse.ArgumentParser(description=describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) parser.add_argument("MMIF_FILE", - nargs="?", type=argparse.FileType("r"), - default=None if sys.stdin.isatty() else sys.stdin, + nargs="?", type=str, default=None, help='input MMIF file path, or STDIN if `-` or not provided.') parser.add_argument("-o", "--output", - type=argparse.FileType("w"), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.') parser.add_argument("-p", '--pretty', action='store_true', help="Pretty-print rewound MMIF") @@ -88,7 +83,8 @@ def prep_argparser(**kwargs): def main(args): - mmif_obj = mmif.Mmif(args.MMIF_FILE.read()) + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_obj = mmif.Mmif(input_file.read()) if args.number == 0: # If user doesn't know how many views to rewind, give them choices. choice = prompt_user(mmif_obj) @@ -97,7 +93,8 @@ def main(args): if not isinstance(choice, int) or choice <= 0: raise ValueError(f"Only can rewind by a positive number of views. 
Got {choice}.") - args.output.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + output_file.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) if __name__ == "__main__": diff --git a/mmif/utils/cli/source.py b/mmif/utils/cli/source.py index 3abd2e1f..6c3b4b86 100644 --- a/mmif/utils/cli/source.py +++ b/mmif/utils/cli/source.py @@ -9,6 +9,7 @@ from mmif import Mmif, Document, DocumentTypes, __specver__ from mmif.serialize.mmif import MmifMetadata +from mmif.utils.cli import open_cli_io_arg __all__ = ['WorkflowSource'] @@ -214,10 +215,6 @@ def generate_source_mmif_from_file(documents, prefix=None, scheme='file', **igno def describe_argparser(): - """ - returns two strings: one-line description of the argparser, and addition material, - which will be shown in `clams --help` and `clams --help`, respectively. - """ oneliner = 'provides CLI to create a "source" MMIF json.' additional = textwrap.dedent(""" A source MMIF is a MMIF with a list of source documents but empty views. @@ -258,8 +255,7 @@ def prep_argparser(**kwargs): ) parser.add_argument( '-o', '--output', - type=argparse.FileType('w'), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.' ) scheme_help = 'A scheme to associate with the document location URI. When not given, the default scheme is `file://`.' 
@@ -279,7 +275,8 @@ def prep_argparser(**kwargs): def main(args): mmif = generate_source_mmif_from_file(windows_path=False, **vars(args)) - args.output.write(mmif) + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + output_file.write(mmif) return mmif if __name__ == '__main__': diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py new file mode 100644 index 00000000..17fe3d5d --- /dev/null +++ b/mmif/utils/cli/summarize.py @@ -0,0 +1,65 @@ +import argparse +import json +import pathlib +import tempfile + +from mmif.utils.cli import open_cli_io_arg +from mmif.utils.summarizer.summary import Summary + + +def describe_argparser() -> tuple: + oneliner = 'Create a JSON Summary for a MMIF file.' + additional = 'The output is serialized as JSON and includes various statistics and summaries of the MMIF content.' + return oneliner, oneliner + '\n\n' + additional + + +def prep_argparser(**kwargs): + """ + Create the ArgumentParser instance for the summarizer. + """ + parser = argparse.ArgumentParser(description=describe_argparser()[1], + formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) + parser.add_argument("MMIF_FILE", + nargs="?", type=str, default=None, + help='input MMIF file path, or STDIN if `-` or not provided.') + parser.add_argument("-o", "--output", + type=str, default=None, + help='output file path, or STDOUT if not provided.') + parser.add_argument("-p", "--pretty", action="store_true", + help="Pretty-print JSON output") + return parser + + +def main(args: argparse.Namespace): + """ + The main summarizer command. 
+ """ + # If a real file path is provided (not None and not '-'), pass it directly to Summary + if args.MMIF_FILE is not None and args.MMIF_FILE != "-": + mmif_summary = Summary(pathlib.Path(args.MMIF_FILE)) + output = mmif_summary.to_dict() + else: + # Fallback: read from stdin (or default input), write to a temporary file, and summarize that + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_content = input_file.read() + tmp_path = None + try: + with tempfile.NamedTemporaryFile( + mode='w', suffix='.mmif', delete=False + ) as tmp: + tmp_path = pathlib.Path(tmp.name) + tmp.write(mmif_content) + mmif_summary = Summary(tmp_path) + output = mmif_summary.to_dict() + finally: + if tmp_path and tmp_path.exists(): + tmp_path.unlink() + + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + json.dump(output, output_file, indent=2 if args.pretty else None) + + +if __name__ == "__main__": + parser = prep_argparser() + args = parser.parse_args() + main(args) diff --git a/mmif/utils/summarizer/__init__.py b/mmif/utils/summarizer/__init__.py new file mode 100644 index 00000000..bbbd9cb8 --- /dev/null +++ b/mmif/utils/summarizer/__init__.py @@ -0,0 +1,28 @@ +""" +Package containing the code to generate a summary from a MMIF file. 
+""" + + +import argparse + +from mmif.utils.summarizer.summary import Summary + + +def argparser(): + parser = argparse.ArgumentParser(description='Create a JSON Summary for a MMIF file') + parser.add_argument('-i', metavar='MMIF_FILE', help='input MMIF file', required=True) + parser.add_argument('-o', metavar='JSON_FILE', help='output JSON summary file', required=True) + return parser + + +def pp_args(args): + for a, v in args.__dict__.items(): + print(f'{a:12s} --> {v}') + + +def main(): + parser = argparser() + args = parser.parse_args() + #pp_args(args) + mmif_summary = Summary(args.i) + mmif_summary.report(outfile=args.o) diff --git a/mmif/utils/summarizer/config.py b/mmif/utils/summarizer/config.py new file mode 100644 index 00000000..f972bd97 --- /dev/null +++ b/mmif/utils/summarizer/config.py @@ -0,0 +1,69 @@ + +from mmif.vocabulary import DocumentTypes +from mmif.vocabulary import AnnotationTypes + + +# The name of CLAMS applications, used to select views and to determine whether +# the summarizer is appropriate for the app version. +# TODO: this now requires an exhaustive listing of all allowed apps and their +# versions, we need a more maintainable system. + +KALDI = [ + # The first two use MMIF 0.4 and should probably be retired + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.2', + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.3', + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/v3'] + +WHISPER = [ + 'http://apps.clams.ai/whisper-wrapper/v7', + 'http://apps.clams.ai/whisper-wrapper/v8', + 'http://apps.clams.ai/whisper-wrapper/v8-3-g737e280'] + +CAPTIONER = [ + 'http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97', + 'http://apps.clams.ai/smolvlm2-captioner'] + +NER = [ + 'http://apps.clams.ai/spacy-wrapper/v1.1', + 'http://apps.clams.ai/spacy-wrapper/v2.1'] + +SEGMENTER = 'http://apps.clams.ai/audio-segmenter' + + +# When a named entity occurs 20 times we do not want to generate 20 instances of +# it. 
If the start of the next entity occurs within the below number of +# milliseconds after the end of the previous, then it is just added to the +# previous one. Taking one minute as the default so two mentions in a minute end +# up being the same instance. This setting can be changed with the 'granularity' +# parameter. +# TODO: this seems broken + +GRANULARITY = 1000 + + +# Properties used for the summary for various tags + +DOC_PROPS = ('id', 'type', 'location') +VIEW_PROPS = ('id', 'timestamp', 'app') +TF_PROPS = ('id', 'start', 'end', 'frameType') +E_PROPS = ('id', 'group', 'cat', 'tag', 'video-start', 'video-end', 'coordinates') + + +# Names of types + +TEXT_DOCUMENT = DocumentTypes.TextDocument.shortname +VIDEO_DOCUMENT = DocumentTypes.VideoDocument.shortname +TIME_FRAME = AnnotationTypes.TimeFrame.shortname +BOUNDING_BOX = AnnotationTypes.BoundingBox.shortname +ALIGNMENT = AnnotationTypes.Alignment.shortname + +ANNOTATION = 'Annotation' +TOKEN = 'Token' +SENTENCE = 'Sentence' +PARAGRAPH = 'Paragraph' +NAMED_ENTITY = 'NamedEntity' +NOUN_CHUNK = 'NounChunk' +VERB_CHUNK = 'VerbChunk' + +TIME_BASED_INTERVALS = {TIME_FRAME} +SPAN_BASED_INTERVALS = {TOKEN, SENTENCE, PARAGRAPH, NAMED_ENTITY, NOUN_CHUNK, VERB_CHUNK} diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py new file mode 100644 index 00000000..b5ea40a2 --- /dev/null +++ b/mmif/utils/summarizer/graph.py @@ -0,0 +1,256 @@ +import sys, json +from collections import defaultdict +from operator import itemgetter +from pathlib import Path +import argparse + +from typing import Any +from mmif import Mmif + +from mmif.utils.summarizer import config +from mmif.utils.summarizer.utils import compose_id, normalize_id +from mmif.utils.summarizer.nodes import Node, Nodes, EntityNode, TimeFrameNode + + +class Graph(object): + + """ + Graph implementation for a MMIF document. Each node contains an annotation + or document. Alignments are stored separately. 
Edges between nodes are created + from the alignments and added to the Node.targets property. The first edge added + to Node.targets is the document that the Node points to (if there is one). + + The goal for the graph is to store all useful annotation and to have simple ways + to trace nodes all the way up to the primary data. + + :var mmif: the MMIF document that we are creating a graph for + :var documents: list of the top-level documents + :var nodes: dictionary of nodes, indexed on node identifier + :var alignments: list of pairs + :var token_idx: an instance of TokenIndex + + """ + + def __init__(self, mmif: Any): + # TODO: the type hint should really be "MMif | str", but pytype did not + # like that. + self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) + self.documents = [] + self.nodes = {} + self.alignments = [] + self._init_nodes() + self._init_edges() + # Third pass to add links between text elements, in particular from + # entities to tokens, adding lists of tokens to entities. + tokens = self.get_nodes(config.TOKEN) + entities = self.get_nodes(config.NAMED_ENTITY) + self.token_idx = TokenIndex(tokens) + #self.token_idx.pp() + for e in entities: + #print('>>>', e, e.anchors) + e.tokens = self.token_idx.get_tokens_for_node(e) + + def _init_nodes(self): + # The top-level documents are added as nodes, but they are also put in + # the documents list. + for doc in self.mmif.documents: + self.add_node(None, doc) + self.documents.append(doc) + # First pass over all annotations and documents in all views and save + # them in the graph. 
+ doc_ids = [d.id for d in self.documents] + for view in self.mmif.views: + for annotation in view.annotations: + normalize_id(doc_ids, view, annotation) + if annotation.at_type.shortname == config.ALIGNMENT: + # alignments are not added as nodes, but we do keep them around + self.alignments.append((view, annotation)) + else: + self.add_node(view, annotation) + + def _init_edges(self): + # Second pass over the alignments so we create edges. + for view, alignment in self.alignments: + self.add_edge(view, alignment) + + def __str__(self): + return "" % len(self.nodes) + + def add_node(self, view, annotation): + """Add an annotation as a node to the graph.""" + node = Nodes.new(self, view, annotation) + self.nodes[node.identifier] = node + + def add_edge(self, view, alignment): + source_id = alignment.properties['source'] + target_id = alignment.properties['target'] + #print(alignment.id, source_id, target_id) + source = self.get_node(source_id) + target = self.get_node(target_id) + if source is None or target is None: + print('WARNING: could not add edge ', + 'because the source and/or target does not extst') + else: + # make sure the direction goes from token or textdoc to annotation + if target.annotation.at_type.shortname in (config.TOKEN, config.TEXT_DOCUMENT): + source, target = target, source + source.targets.append(target) + source.add_anchors_from_alignment(target) + target.add_anchors_from_alignment(source) + + def get_node(self, node_id: str) -> Node | None: + """Return the Node instance from the node index.""" + return self.nodes.get(node_id) + + # def get_nodes(self, short_at_type: str, view_id : str = None): + # replaced the above because the code coverage is picky on type hints + def get_nodes(self, short_at_type: str, view_id=None): + """Get all nodes for an annotation type, using the short form. 
If a view + identifier is provided then only include nodes from that view.""" + return [node for node in self.nodes.values() + if (node.at_type.shortname == short_at_type + and (view_id is None or node.view.id == view_id))] + + def statistics(self) -> defaultdict: + """ + Collect counts for node types in each view. + """ + stats = defaultdict(int) + for node in self.nodes.values(): + stats[f'{str(node.view_id):4} {node.at_type.shortname}'] += 1 + return stats + + def trim(self, start: int, end: int): + """ + :meta private: + + Trim the graph and keep only those nodes that are included in the graph + between two timepoints (both in milliseconds). This assumes that all nodes + are anchored on the time in the audio or video stream. At the moment it + keeps all nodes that are not explicitly anchored. Private for now because + it is still useless. + """ + remove = set() + for node_id, node in self.nodes.items(): + if 'time-point' in node.anchors: + if not start <= node.anchors['time-point'] <= end: + remove.add(node_id) + if 'time-offsets' in node.anchors: + p1, p2 = node.anchors['time-offsets'] + if not (start <= p1 <= end and start <= p2 <= end): + remove.add(node_id) + new_nodes = [n for n in self.nodes.values() if not n.identifier in remove] + self.nodes = { node.identifier: node for node in new_nodes } + + def pp(self, fname=None, skip_timepoints=False): + """ + :meta private: + """ + fh = sys.stdout if fname is None else open(fname, 'w') + fh.write("%s\n" % self) + for view in self.mmif.views: + fh.write(" \n" % (view.id, str(view.metadata['app']))) + for node_id, node in self.nodes.items(): + if node.at_type.shortname == 'TimePoint': + continue + fh.write(" %-40s" % node) + targets = [str(t) for t in node.targets] + fh.write(' --> [%s]\n' % ' '.join(targets)) + + def pp_statistics(self): + """ + :meta private: + """ + stats = self.statistics() + for at_type in sorted(stats): + print(f'{at_type:20} {stats[at_type]:>5}') + + +class TokenIndex(object): + + """ + The 
tokens are indexed on the identifier on the TextDocument that they occur + in and for each text document we have a list of pairs + + .. code-block:: python + + {'v_4:td1': [ + ((0, 5), ), + ((5, 6), ), + ...] + } + + """ + + # TODO: + # - Benchmark get_tokens_for_node(). I may want to use something like this + # to determine enclosed nodes and enclosing nodes and that may blow up since + # that would be O(n^2). If it does matter, probably start using binary search + # or add an index from character offset to nodes. + # - It is also not sure whether we still need this since the new spaCy gives + # targets to tokens. + + def __init__(self, tokens): + self.tokens = {} + self.token_count = len(tokens) + for t in tokens: + tup = ((t.properties['start'], t.properties['end']), t) + self.tokens.setdefault(t.document.identifier, []).append(tup) + # Make sure the tokens for each document are ordered. + for document, token_list in self.tokens.items(): + self.tokens[document] = sorted(token_list, key=itemgetter(0)) + # In some cases there are two tokens with identical offset (for example + # with tokenization from both Kaldi and spaCy, not sure what to do with + # these, but should probably be more careful on what views to access + + def __len__(self): + return self.token_count + + def __str__(self): + return f'' + + def get_tokens_for_node(self, node: Node): + """Return all tokens included in the span of a node.""" + doc = node.document.identifier + try: + start = node.properties['start'] + end = node.properties['end'] + except KeyError: + start, end = node.anchors['text-offsets'] + tokens = [] + for (t_start, t_end), token in self.tokens.get(doc, []): + if t_start >= start and t_end <= end: + tokens.append(token) + return tokens + + def pp(self, fname=None): + fh = sys.stdout if fname is None else open(fname, 'w') + for document in self.tokens: + fh.write("\n[%s] -->\n" % document) + for t in self.tokens[document]: + fh.write(' %s %s\n' % (t[0], t[1])) + + + +if __name__ == 
'__main__': + + graph = Graph(open(sys.argv[1]).read()) + print(graph) + #graph.pp() + #graph.nodes['v_7:st12'].pp() + #graph.nodes['v_2:s1'].pp() + #graph.nodes['v_4:tf1'].pp() + exit() + for node in graph.nodes.values(): + print(node.at_type.shortname, node.identifier, node.anchors) + + +''' + +Printing some graphs: + +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-1-full -p -a -v +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-2-no-view-links -p -a +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-3-no-anchor-to-doc -p + +''' diff --git a/mmif/utils/summarizer/nodes.py b/mmif/utils/summarizer/nodes.py new file mode 100644 index 00000000..53201022 --- /dev/null +++ b/mmif/utils/summarizer/nodes.py @@ -0,0 +1,370 @@ +import json + +from typing import Any + +from mmif.utils.summarizer import config + + + +class Node(object): + + def __init__(self, graph, view, annotation): + self.graph = graph + self.view = view + self.view_id = None if self.view is None else self.view.id + self.annotation = annotation + # copy some information from the Annotation + self.at_type = annotation.at_type + self.identifier = annotation.id + self.properties = json.loads(str(annotation.properties)) + # get the document from the view or the properties + self.document = self._get_document() + # The targets property contains a list of annotations or documents that + # the node content points to. This includes the document the annotation + # points to as well as the alignment from a token or text document to a + # bounding box or time frame (which is added later). + # TODO: the above does not seem to be true since there is no evidence of + # data from alignments being added. 
+ self.targets = [] if self.document is None else [self.document] + self.anchors = {} + self.add_local_anchors() + self.add_anchors_from_targets() + + def __str__(self): + anchor = '' + if self.at_type.shortname == config.TOKEN: + anchor = " %s:%s '%s'" % (self.properties['start'], + self.properties['end'], + self.properties.get('text','').replace('\n', '\\n')) + return "<%s %s%s>" % (self.at_type.shortname, self.identifier, anchor) + + def add_local_anchors(self): + """Get the anchors that you can get from the annotation itself, which + includes the start and end offsets, the coordinates, the timePoint of + a BoundingBox and any annotation with targets.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'start' in props and 'end' in props: + # TimeFrame is the only non-character based interval so this simple + # if-then-else should work + if attype == config.TIME_FRAME: + self.anchors['text-offsets'] = (props['start'], props['end']) + else: + self.anchors['time-offsets'] = (props['start'], props['end']) + if 'coordinates' in props: + self.anchors['coordinates'] = props['coordinates'] + if 'timePoint' in props: + self.anchors['time-point'] = props['timePoint'] + if 'targets' in props: + self.anchors['targets'] = props['targets'] + + def add_anchors_from_targets(self): + """Get start and end offsets or timePoints from the targets and add them to + the anchors, but only if there were no anchors on the node already. 
This has + two cases: one for TimeFrames and one for text intervals.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'targets' in props: + try: + t1 = self.graph.nodes[props['targets'][0]] + t2 = self.graph.nodes[props['targets'][-1]] + if attype == config.TIME_FRAME: + if not 'time-offsets' in props: + self.anchors['time-offsets'] = ( + t1.properties['timePoint'], t2.properties['timePoint']) + else: + if not 'text-offsets' in props: + self.anchors['text-offsets'] = ( + t1.properties['start'], t2.properties['end']) + except IndexError: + print(f'WARNING: Unexpected empty target list for {self.identifier}') + + def add_anchors_from_alignment(self, target: Any, debug=False): + if target is None: + return + source_attype = self.at_type.shortname + target_attype = target.at_type.shortname + if debug: + print('\n@ DEBUG SOURCE->TARGET ', source_attype, target_attype) + print('@ DEBUG SOURCE.PROPS ', list(self.properties.keys())) + print('@ DEBUG TARGET.PROPS ', list(target.properties.keys())) + print('@ DEBUG TARGET.ANCHORS ', target.anchors) + # If a TextDocument is aligned to a BoundingBox then we grab the coordinates + # TODO: how are we getting the time point? + if source_attype == 'TextDocument' and target_attype == 'BoundingBox': + if 'coordinates' in target.properties: + self.anchors['coordinates'] = target.properties['coordinates'] + #print(source_attype, self.anchors) + elif source_attype == 'BoundingBox' and target_attype == 'TextDocument': + pass + # If a TextDocument is aligned to a TimeFrame then we copy time anchors + # but also targets and representatives, the latter because some alignments + # are not precise + elif source_attype == 'TextDocument' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + if 'time-offsets' in target.anchors: + # TODO: is this ever used? 
+ self.anchors['time-offsets'] = target.anchors['time-offsets'] + if 'targets' in target.properties: + self.anchors['targets'] = target.properties['targets'] + if 'representatives' in target.properties: + self.anchors['representatives'] = target.properties['representatives'] + #print('-', source_attype, self.anchors, self, target) + elif source_attype == 'TimeFrame' and target_attype == 'TextDocument': + pass + # Simply copy the time point + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + self.anchors['time-point'] = target.anchors['time-point'] + if debug: + print('+ ADDED SOURCE.ANCHORS ', self.anchors) + # For Token-TimeFrame alignments all we need are the start and end time points + elif source_attype == 'Token' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + #print(source_attype, self.anchors) + elif source_attype == 'TimeFrame' and target_attype == 'Token': + pass + # TODO: check whether some action is needed for the next options + elif source_attype == 'TextDocument' and target_attype == 'VideoDocument': + pass + elif source_attype == 'VideoDocument' and target_attype == 'TextDocument': + pass + elif source_attype == 'BoundingBox' and target_attype == 'TimePoint': + pass + elif source_attype =='TimePoint' and target_attype == 'BoundingBox': + pass + elif source_attype == 'BoundingBox' and target_attype in ('Token', 'Sentence', 'Paragraph'): + pass + elif source_attype in ('Token', 'Sentence', 'Paragraph') and target_attype == 'BoundingBox': + pass + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + pass + elif source_attype == 'TimePoint' and target_attype == 'TextDocument': + pass + else: + print('-', source_attype, target_attype) + #if debug: + # print('DEBUG', self.anchors) + + def _get_document(self): + """Return the document or annotation node that the 
annotation/document in + the node refers to via the document property. This could be a local property + or a metadata property if there is no such local property. Return None + if neither of those exist.""" + # try the local property + docid = self.properties.get('document') + if docid is not None: + # print('>>>', docid, self.graph.get_node(docid)) + return self.graph.get_node(docid) + # try the metadata property + if self.view is not None: + try: + metadata = self.view.metadata.contains[self.at_type] + docid = metadata['document'] + return self.graph.get_node(docid) + except KeyError: + return None + return None + + def summary(self): + """The default summary is just the identfier, this should typically be + overriden by sub classes.""" + return { 'id': self.identifier } + + def has_label(self): + """Only TimeFrameNodes can have labels so this returns False.""" + return False + + def pp(self, close=True): + print('-' * 80) + print(self) + print(f' document = {self.document}') + for prop in self.properties: + print(f' {prop} = {self.properties[prop]}') + print(' targets = ') + for target in self.targets: + print(' ', target) + print(' anchors = ') + for anchor in self.anchors: + print(f' {anchor} -> {self.anchors[anchor]}') + if close: + print('-' * 80) + + +class TimeFrameNode(Node): + + def __str__(self): + frame_type = ' ' + self.frame_type() if self.has_label() else '' + return ('' + % (self.identifier, self.start(), self.end(), frame_type)) + + def start(self): + return self.properties.get('start', -1) + + def end(self): + return self.properties.get('end', -1) + + def frame_type(self): + # TODO: rename this, uses old property since replaced by "label"" + # NOTE: this is still aloowing for the old property though + return self.properties.get('label') or self.properties.get('frameType') + + def has_label(self): + return self.frame_type() is not None + + def representatives(self) -> list: + """Return a list of the representative TimePoints.""" + # TODO: why 
could I not get this from the anchors? + rep_ids = self.properties.get('representatives', []) + reps = [self.graph.get_node(rep_id) for rep_id in rep_ids] + return reps + + def summary(self): + """The summary of a time frame just contains the identifier, start, end + and frame type.""" + return { 'id': self.identifier, + 'start': self.properties['start'], + 'end': self.properties['end'], + 'frameType': self.properties.get('frameType') } + + +class EntityNode(Node): + + def __init__(self, graph, view, annotation): + super().__init__(graph, view, annotation) + self.tokens = [] + self._paths = None + self._anchor = None + + def __str__(self): + try: + start = self.properties['start'] + end = self.properties['end'] + except KeyError: + start, end = self.anchors['text-offsets'] + return ("" + % (self.identifier, start, end, self.properties['text'])) + + def start_in_video(self): + #print('+++', self.document.properties) + try: + return self.document.anchors['time-point'] + except KeyError: + return -1 + #return self.anchor()['video-start'] + + def end_in_video(self): + return self.anchor().get('video-end') + + ''' + Commented this out because the type checking in the code coverage tests requires + the default vaue for the close parameter to be the same as on Node.pp(). 
+ + def pp(self, close=False): + super().pp(close=close) + try: + for i, p in enumerate(self.paths_to_docs()): + print(' %s' % ' '.join([str(n) for n in p[1:]])) + except ValueError: + print(' WARNING: error in path_to_docs in NamedEntityNode.pp()') + print('-' * 80) + ''' + + def summary(self): + """The summary for entities needs to include where in the video or image + the entity occurs, it is not enough to just give the text document.""" + # TODO: in the old days this used an anchor() method which was fragile + # TODO: revamping it now + return { + 'id': self.identifier, + 'group': self.properties['group'], + 'cat': self.properties['category'], + 'document': self.document.identifier, + # Entities in a TextDocument that is a full transcript without any + # alignments do not have a TimePoint + #'time-point': self.document.anchors.get('time-point'), + #'text-offsets': self.anchors.get('text-offsets'), + 'time-point': self.document.anchors.get('time-point', -1), + 'text-offsets': self.anchors.get('text-offsets', (-1 ,-1)), + #'document': self._get_document_plus_span(), + #'video-start': anchor.get('video-start'), + #'video-end': anchor.get('video-end'), + #'coordinates': self._coordinates_as_string(anchor) + } + + def anchor(self) -> dict: + """The anchor is the position in the video that the entity is linked to. + This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: deal with the case where the primary document is not a video + self.paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in paths: + # print('... 
[') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + return {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + return {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + else: + return {} + + def anchor2(self): + """The anchor is the position in the video that the entity is linked to. + This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: with this version you get an error that the paths variable does + # not exist yet, must get a clearer picture on how to build a graph + # where nodes have paths to anchors + # TODO: deal with the case where the primary document is not a video + if self._anchor is None: + self._paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in self._paths: + # print('... [') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + self._anchor = {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + self._anchor = {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + return self._anchor + + def find_boundingbox_or_timeframe(self): + return self.paths[-1][-2] + + @staticmethod + def _coordinates_as_string(anchor): + if 'coordinates' not in anchor: + return None + return ','.join(["%s:%s" % (pair[0], pair[1]) + for pair in anchor['coordinates']]) + + +class Nodes(object): + + """Factory class for Node creation. 
Use Node for creation unless a special + class was registered for the kind of annotation we have.""" + + node_classes = { config.NAMED_ENTITY: EntityNode, + config.TIME_FRAME: TimeFrameNode } + + @classmethod + def new(cls, graph, view, annotation): + node_class = cls.node_classes.get(annotation.at_type.shortname, Node) + return node_class(graph, view, annotation) + diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py new file mode 100644 index 00000000..28339fad --- /dev/null +++ b/mmif/utils/summarizer/summary.py @@ -0,0 +1,657 @@ +""" + +Main classes for the summarizer. + +""" + +# TODO: +# - For the time unit we should really update get_start(), get_end() and other methods. + + +import json +import logging +import os +import pathlib +from collections import defaultdict + +from mmif.serialize import Mmif +from mmif.utils.summarizer import config +from mmif.utils.summarizer.graph import Graph +from mmif.utils.summarizer.utils import CharacterList +from mmif.utils.summarizer.utils import get_transcript_view, get_captions_view +from mmif.utils.summarizer.utils import timestamp +from mmif.vocabulary import DocumentTypes + +logger = logging.getLogger(__name__) + + +class SummaryException(Exception): + pass + + +class Summary(object): + + """Implements the summary of a MMIF file. 
+ + :var fname: name of the input mmif file + :var mmif: instance of mmif.serialize.Mmif + :var graph: instance of graph.Graph + :var documents: instance of Documents + :var views: instance of Views + :var transcript: instance of Transcript + :var timeframes: instance of TimeFrames + :var entities: instance of Entities + :var captions: instance of Captions + + """ + + def __init__(self, mmif_file): + self.fname = mmif_file + #self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) + self.mmif = Mmif(pathlib.Path(mmif_file).read_text()) + self.warnings: list[str] = [] + self.graph = Graph(self.mmif) + self.mmif_version = self.mmif.metadata['mmif'] + self.documents = Documents(self) + self.annotations = Annotations(self) + self.document = Document(self) + self.views = Views(self) + self.timeframes = TimeFrames(self) + self.timeframe_stats = TimeFrameStats(self) + self.transcript = Transcript(self) + self.captions = Captions(self) + self.entities = Entities(self) + self.validate() + self.print_warnings() + + def add_warning(self, warning: str): + self.warnings.append(warning) + + def validate(self): + """Minimal validation of the input. 
Mostly a placeholder
self.data = defaultdict(list) + # summary.graph.get_nodes(config.ANNOTATION, view_id=view.id) + for anno in summary.graph.get_nodes(config.ANNOTATION): + self.data[anno.view.id].append(anno.properties) + + def get(self, item): + return self.data.get(item, []) + + def get_all_annotations(self): + annotations = [] + for annos in self.data.values(): + annotations.extend(annos) + return annotations + + +class Document(object): + + """Collects some document-level information, including MMIF version, size of + the MMIF file and some information from the SWT document annotation.""" + + def __init__(self, summary): + self.data = { + 'mmif_version': summary.mmif_version, + 'size': os.path.getsize(summary.fname) } + annotations = summary.annotations.get_all_annotations() + if annotations: + # TODO: this if fragile because it assumes that the annotation we want + # (which is the one from SWT) is always the first + doc_level_annotation = annotations[0] + if 'fps' in doc_level_annotation: + self.data['fps'] = doc_level_annotation['fps'] + if 'frameCount' in doc_level_annotation: + self.data['frames'] = doc_level_annotation['frameCount'] + if 'duration' in doc_level_annotation: + duration = doc_level_annotation['duration'] + # both in milliseconds and as a timestamp + self.data['duration_ms'] = duration + self.data['duration_ts'] = timestamp(duration) + + +class Views(object): + + """Contains a list of view summaries, which are dictionaries with just + the id, app and timestamp properties.""" + + def __init__(self, summary): + self.summary = summary + self.data = [self.get_view_summary(view) for view in summary.mmif.views] + + def __getitem__(self, i): + return self.data[i] + + def __len__(self): + return len(self.data) + + #@staticmethod + def get_view_summary(self, view): + annotation_types = defaultdict(int) + for annotation in view.annotations: + annotation_types[annotation.at_type.shortname] += 1 + basic_info = { + 'id': view.id, + 'app': view.metadata.app, + 'timestamp': 
view.metadata.timestamp, + 'contains': [str(k) for k in view.metadata.contains.keys()], + 'annotation_count': len(view.annotations), + 'annotation_types': dict(annotation_types), + 'parameters': view.metadata.parameters, + 'appConfiguration': view.metadata.appConfiguration } + if view.metadata.warnings: + basic_info['warnings'] = view.metadata.warnings + if view.metadata.error: + basic_info['error'] = view.metadata.error + return basic_info + + def pp(self): + print('\nViews -> ') + for v in self.data: + print(' %s' % v['app']) + + +class Transcript(object): + + """The transcript contains the string value from the first text document in the + last ASR view. It issues a warning if there is more than one text document in + the view.""" + + def __init__(self, summary): + self.summary = summary + self.data = [] + view = get_transcript_view(summary.mmif.views) + if view is not None: + documents = view.get_documents() + if len(documents) > 1: + summary.add_warning(f'More than one TextDocument in ASR view {view.id}') + t_nodes = summary.graph.get_nodes(config.TOKEN, view_id=view.id) + s_nodes = summary.graph.get_nodes(config.SENTENCE, view_id=view.id) + if not t_nodes: + return + if s_nodes: + # Whisper has Sentence nodes + sentences = self.collect_targets(s_nodes) + sentence_ids = [n.identifier for n in s_nodes] + else: + # But Kaldi does not + sentences = self.create_sentences(t_nodes) + sentence_ids = [None] * len(sentences) + # initialize the transcripts with all blanks, most blanks will be + # overwrite with characters from the tokens + transcript = CharacterList(self.transcript_size(sentences)) + for s_id, s in zip(sentence_ids, sentences): + transcript_element = TranscriptElement(s_id, s, transcript) + self.data.append(transcript_element.as_json()) + + def __str__(self): + return str(self.data) + + @staticmethod + def transcript_size(sentences): + try: + return sentences[-1][-1].properties['end'] + except IndexError: + return 0 + + def collect_targets(self, 
s_nodes): + """For each node (in this context a sentence node), collect all target nodes + (which are tokens) and return them as a list of lists, with one list for each + node.""" + targets = [] + for node in s_nodes: + node_target_ids = node.properties['targets'] + node_targets = [self.summary.graph.get_node(stid) for stid in node_target_ids] + targets.append(node_targets) + return targets + + def create_sentences(self, t_nodes, sentence_size=12): + """If there is no sentence structure then we create it just by chopping th + input into slices of some pre-determined length.""" + # TODO: perhaps the size paramater should be set in the config file or via a + # command line option. + return [t_nodes[i:i + sentence_size] + for i in range(0, len(t_nodes), sentence_size)] + + +class TranscriptElement: + + """Utility class to handle data associated with an element from a transcript, + which is created from a sentence which is a list of Token Nodes. Initialization + has the side effect of populating the full transcript which is an instance of + CharacterList and which is also accessed here.""" + + def __init__(self, identifier: str, sentence: list, transcript: CharacterList): + for t in sentence: + # this adds the current token to the transcript + start = t.properties['start'] + end = t.properties['end'] + word = t.properties['word'] + transcript.set_chars(word, start, end) + self.id = identifier + self.start = sentence[0].anchors['time-offsets'][0] + self.end = sentence[-1].anchors['time-offsets'][1] + self.start_offset = sentence[0].properties['start'] + self.end_offset = sentence[-1].properties['end'] + self.text = transcript.getvalue(self.start_offset, self.end_offset) + + def __str__(self): + text = self.text if len(self.text) <= 50 else self.text[:50] + '...' 
+ return f'' + + def as_json(self): + json_obj = { + "start-time": self.start, + "end-time": self.end, + "text": self.text } + if self.id is not None: + json_obj["id"] = self.id + return json_obj + + +class Nodes(object): + + """ + Abstract class to store instances of subclasses of graph.Node. The + initialization methods of subclasses of Nodes can guard what nodes will + be allowed in, for example, as of July 2022 the TimeFrames class only + allowed time frames that had a frame type (thereby blocking the many + timeframes from Kaldi). + + :var summary: an instance of Summary + :var graph: an instance of graph.Graph, taken from the summary + :var nodes: list of instances of subclasses of graph.Node + + """ + + def __init__(self, summary): + self.summary = summary + self.graph = summary.graph + self.nodes = [] + + def __getitem__(self, i): + return self.nodes[i] + + def __len__(self): + return len(self.nodes) + + def add(self, node): + self.nodes.append(node) + + def get_nodes(self, **props): + """Return all the nodes that match the given properties.""" + def prop_check(p, v, props_given): + return v == props_given.get(p) if p in props_given else False + return [n for n in self + if all([prop_check(p, v, n.annotation.properties) + for p, v in props.items()])] + + +class TimeFrames(Nodes): + + """For now, we take only the TimeFrames that have a frame type, which rules out + all the frames we got from Kaldi.""" + + def __init__(self, summary): + super().__init__(summary) + # a dictionary mapping app names to lists of timeframe summaries + self.data = defaultdict(list) + for tf_node in self.graph.get_nodes(config.TIME_FRAME): + if tf_node.has_label(): + self.add(tf_node) + self._collect_timeframe_summaries() + self._sort_timeframe_summaries() + + def _collect_timeframe_summaries(self): + for tf in self.nodes: + label = tf.frame_type() + try: + start, end = tf.anchors['time-offsets'] + except KeyError: + # TODO: + # - this defies the notion of using the anchors for 
this, but + # maybe in this case we should go straight to the start/end + # - this code below also raises an error if there are no start + # and end properties + start = tf.properties['start'] + end = tf.properties['end'] + representatives = tf.representatives() + rep_tps = [rep.properties['timePoint'] for rep in representatives] + score = tf.properties.get('classification', {}).get(label) + app = tf.view.metadata.app + self.data[app].append( + { 'identifier': tf.identifier, 'label': label, 'score': score, + 'start-time': start, 'end-time': end, 'representatives': rep_tps }) + + def _sort_timeframe_summaries(self): + """Sort the data on their start time, do this for all apps.""" + for app in self.data: + sort_function = lambda x: x['start-time'] + self.data[app] = list(sorted(self.data[app], key=sort_function)) + + def as_json(self): + return self.data + + def pp(self): + print('\nTimeframes -> ') + for tf in self.nodes: + summary = tf.summary() + print(' %s:%s %s' % (summary['start'], summary['end'], + summary['frameType'])) + + +class TimeFrameStats(object): + + def __init__(self, summary): + # a dictionary mapping app names to frameType->duration dictionaries, + # where the duration is cumulative over all instances + self.timeframes = summary.timeframes + self.data = {} + self._collect_durations() + self._collect_other_morsels() + + def _collect_durations(self): + timeframes = self.timeframes.data + for app in timeframes: + self.data[app] = {} + for tf in timeframes[app]: + label = tf.get('label') + if label not in self.data[app]: + self.data[app][label] = {'count': 0, 'duration': 0} + self.data[app][label]['count'] += 1 + duration = tf['end-time'] - tf['start-time'] + if label is not None: + # TODO: these gave weird values for duration + #print('---',app, label, duration) + self.data[app][label]['duration'] += duration + duration = self.data[app][label]['duration'] + count = self.data[app][label]['count'] + self.data[app][label]['average'] = duration // count + 
+ def _collect_other_morsels(self): + # First we want everything grouped by app and label + timeframes = self.timeframes.data + grouped_timeframes = defaultdict(lambda: defaultdict(list)) + for app in timeframes: + for tf in timeframes[app]: + label = tf.get('label') + grouped_timeframes[app][label].append(tf) + # The we pick the morsels for each label + for app in grouped_timeframes: + for label in grouped_timeframes[app]: + tfs = grouped_timeframes[app][label] + sort_on_start = lambda tf: tf['start-time'] + sort_on_length = lambda tf: tf['end-time'] - tf['start-time'] + first_tf = list(sorted(tfs, key=sort_on_start))[0] + longest_tf = list(sorted(tfs, key=sort_on_length, reverse=True))[0] + self.data[app][label]['first'] = first_tf['start-time'] + self.data[app][label]['longest'] = longest_tf['start-time'] + + +class Entities(Nodes): + + """ + This class collects instances of graph.EntityNode. + + :var nodes_idx: maps entity texts to lists of instances of graph.EntityNode + :var bins: an instance of Bins + + """ + + def __init__(self, summary): + super().__init__(summary) + self.nodes_idx = {} + self.bins = None + for ent in self.graph.get_nodes(config.NAMED_ENTITY): + self.add(ent) + self._create_node_index() + self._group() + + def __str__(self): + return f'' + + def _create_node_index(self): + """Put all the entities from self.nodes in self.node_idx. 
This first puts + the nodes into the dictionary indexed on text string and then sorts the + list of nodes for each string on video position.""" + for ent in self: + self.nodes_idx.setdefault(ent.properties['text'], []).append(ent) + for text, entities in self.nodes_idx.items(): + self.nodes_idx[text] = sorted(entities, + key=(lambda e: e.start_in_video())) + + def _group(self): + """Groups all the nodes on the text and sorts them on position in the video, + for the latter it will also create bins of entities that occur close to each + other in the text.""" + # create the bins, governed by the summary's granularity + self.bins = Bins(self.summary) + for text, entities in self.nodes_idx.items(): + self.bins.current_bin = None + for entity in entities: + self.bins.add_entity(text, entity) + self.bins.mark_entities() + + def _add_tags(self, tags): + for tag in tags: + tag_doc = tag.properties['document'] + tag_p1 = tag.properties['start'] + tag_p2 = tag.properties['end'] + entities = self.nodes_idx.get(tag.properties['text'], []) + for entity in entities: + props = entity.properties + doc = props['document'] + p1 = props['start'] + p2 = props['end'] + if tag_doc == doc and tag_p1 == p1 and tag_p2 == p2: + entity.properties['tag'] = tag.properties['tagName'] + + def as_json(self): + json_obj = [] + for text in self.nodes_idx: + entity = {"text": text, "instances": []} + json_obj.append(entity) + for e in self.nodes_idx[text]: + entity["instances"].append(e.summary()) # e.summary(), E_PROPS) + return json_obj + + def pp(self): + print('\nEntities -> ') + for e in self.nodes_idx: + print(' %s' % e) + for d in self.nodes_idx[e]: + props = ["%s=%s" % (p, v) for p, v in d.summary().items()] + print(' %s' % ' '.join(props)) + + def print_groups(self): + for key in sorted(self.nodes_idx): + print(key) + for e in self.nodes_idx[key]: + print(' ', e, e.start_in_video()) + + +class Captions(Nodes): + + def __init__(self, summary): + super().__init__(summary) + self.captions = [] 
+ view = get_captions_view(summary.mmif.views) + if view is not None: + for doc in self.graph.get_nodes(config.TEXT_DOCUMENT, view_id=view.id): + text = doc.properties['text']['@value'].split('[/INST]')[-1] + logger.debug('>>> DOC %s', doc) + logger.debug('>>> PROPS %s', list(doc.properties.keys())) + logger.debug('>>> TEXT %s', text.replace("\n", "")[:100]) + logger.debug('>>> ANCHORS %s', doc.anchors) + if 'time-offsets' in doc.anchors and 'representatives' in doc.anchors: + # For older LLava-style captions + # http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97 + # NOTE: probably obsolete, at least the link above is dead + tp_id = doc.anchors["representatives"][0] + tp = summary.graph.get_node(tp_id) + if tp is not None: + self.captions.append( + { 'identifier': doc.identifier, + 'time-point': tp.properties['timePoint'], + 'text': text }) + if 'time-point' in doc.anchors: + # For newer SmolVLM-style captions + # http://apps.clams.ai/smolvlm2-captioner + self.captions.append( + { 'identifier': doc.identifier, + 'time-point': doc.anchors['time-point'], + 'text': text }) + + def as_json(self): + return self.captions + #return [(ident, p1, p2, text) for ident, p1, p2, text in self.captions] + + +class Bins(object): + + def __init__(self, summary): + self.summary = summary + self.bins = {} + self.current_bin = None + self.current_text = None + + def __str__(self): + return f'' + + def __len__(self): + return len(self.bins) + + def add_entity(self, text, entity): + """Add an entity instance to the appropriate bin.""" + if self.current_bin is None: + # Add the first instance of a new entity (as defined by the text), + # since it is the first a new bin will be created. + self.current_text = text + self.current_bin = Bin(entity) + self.bins[text] = [self.current_bin] + else: + # For following entities with the same text, a new bin may be + # created depending on the positions and the granularity. 
+ p1 = self.current_bin[-1].start_in_video() + p2 = entity.start_in_video() + # p3 = entity.end_in_video() + if p2 - p1 < config.GRANULARITY: + # TODO: should add p3 here + self.current_bin.add(entity) + else: + self.current_bin = Bin(entity) + self.bins[self.current_text].append(self.current_bin) + + def mark_entities(self): + """Marks all entities with the bin that they occur in. This is done to export + the grouping done with the bins to the entities and this way the bins never need + to be touched again.""" + # TODO: maybe use the bins when we create the output + for entity_bins in self.bins.values(): + for i, e_bin in enumerate(entity_bins): + for entity in e_bin: + entity.properties['group'] = i + + def print_bins(self): + for text in self.bins: + print(text) + text_bins = self.bins[text] + for i, text_bin in enumerate(text_bins): + text_bin.print_nodes(i) + print() + + +class Bin(object): + + def __init__(self, node): + # TODO: we are not using these yet, but a bin should have a begin and + # end in the video which should be derived from the start and end of + # entities in the video. The way we put things in bins now is a bit + # fragile since it depends on the start or end of the last element. + self.start = 0 + self.end = 0 + self.nodes = [node] + + def __getitem__(self, i): + return self.nodes[i] + + def add(self, node): + self.nodes.append(node) + + def print_nodes(self, i): + for node in self.nodes: + print(' ', i, node) diff --git a/mmif/utils/summarizer/utils.py b/mmif/utils/summarizer/utils.py new file mode 100644 index 00000000..897a3830 --- /dev/null +++ b/mmif/utils/summarizer/utils.py @@ -0,0 +1,268 @@ +""" + +Utility methods for the summarizer. 
+ +""" + +import io +from pathlib import Path +from xml.sax.saxutils import quoteattr, escape +from collections import UserList + +from mmif import View, Annotation +from mmif.utils.summarizer.config import KALDI, WHISPER, CAPTIONER, SEGMENTER +from mmif.utils.summarizer.config import TOKEN, ALIGNMENT, TIME_FRAME + + +def compose_id(view_id, anno_id): + """Composes the view identifier with the annotation identifier.""" + return anno_id if ':' in anno_id else view_id + ':' + anno_id + + +def type_name(annotation): + """Return the short name of the type.""" + return annotation.at_type.split('/')[-1] + + +def get_transcript_view(views): + """Return the last Whisper or Kaldi view that is not a warnings view.""" + # TODO: this now has a simplified idea of how to find a view, should at least + # move towards doing some regular expression matching on the WHISPER config + # setting. The same holds for other functions to get views. + for view in reversed(views): + if view.metadata.app in KALDI + WHISPER: + if view.metadata.warnings: + continue + return view + return None + + +def get_captions_view(views): + """Return the last view created by the captioner.""" + for view in reversed(views): + if view.metadata.app in CAPTIONER: + if view.metadata.warnings: + continue + return view + return None + + +def get_last_segmenter_view(views): + for view in reversed(views): + # print(f'>>> {view.metadata.app}') + if view.metadata.app.startswith(SEGMENTER): + return view + return None + + +def get_aligned_tokens(view): + """Get a list of tokens from an ASR view where for each token we add a timeframe + properties which has the start and end points of the aligned timeframe.""" + idx = AnnotationsIndex(view) + for alignment in idx.get_annotations(ALIGNMENT).values(): + token = idx[TOKEN].get(alignment.properties['target']) + frame = idx[TIME_FRAME].get(alignment.properties['source']) + if token and frame: + # add a timeframe to the token, we can do this now that we do not + # freeze MMIF 
annotations anymore + token.properties['timeframe'] = (frame.properties['start'], + frame.properties['end']) + return idx.tokens + + +def timestamp(milliseconds: int, format='hh:mm:ss'): + # sometimes the milliseconds are not a usable float + if milliseconds in (None, -1): + return 'nil' + milliseconds = int(milliseconds) + seconds = milliseconds // 1000 + minutes = seconds // 60 + hours = minutes // 60 + ms = milliseconds % 1000 + s = seconds % 60 + m = minutes % 60 + if format == 'hh:mm:ss:mmm': + return f'{hours}:{m:02d}:{s:02d}.{ms:03d}' + elif format == 'hh:mm:ss': + return f'{hours}:{m:02d}:{s:02d}' + elif format == 'mm:ss': + return f'{m:02d}:{s:02d}' + elif format == 'mm:ss:mmm': + return f'{m:02d}:{s:02d}.{ms:03d}' + else: + return f'{hours}:{m:02d}:{s:02d}.{ms:03d}' + + + +class AnnotationsIndex: + + """Creates an index on the annotations list for a view, where each annotation type + is indexed on its identifier. Tokens are special and get their own list.""" + + def __init__(self, view): + self.view = view + self.idx = {} + self.tokens = [] + for annotation in view.annotations: + shortname = annotation.at_type.shortname + if shortname == TOKEN: + self.tokens.append(annotation) + self.idx.setdefault(annotation.at_type.shortname, {}) + self.idx[shortname][annotation.properties.id] = annotation + + def __str__(self): + return f'' + + def __getitem__(self, item): + return self.idx[item] + + def get_annotations(self, at_type): + return self.idx.get(at_type, {}) + + +class CharacterList(UserList): + + """Auxiliary datastructure to help print a list of tokens. 
It allows you to + back-engineer a sentence from the text and character offsets of the tokens.""" + + def __init__(self, n: int, char=' '): + self.size = n + self.char = char + self.data = n * [char] + + def __str__(self): + return f'' + + def __len__(self): + return self.size + + def __setitem__(self, key, value): + try: + self.data[key] = value + except IndexError: + for i in range(len(self), key + 1): + self.data.append(self.char) + self.data[key] = value + + def set_chars(self, text: str, start: int, end: int): + self.data[start:end] = text + + def getvalue(self, start: int, end: int): + return ''.join(self.data[start:end]) + + +def xml_tag(tag, subtag, objs, props, indent=' ') -> str: + """Return an XML string for a list of instances of subtag, grouped under tag.""" + s = io.StringIO() + s.write(f'{indent}<{tag}>\n') + for obj in objs: + s.write(xml_empty_tag(subtag, indent + ' ', obj, props)) + s.write(f'{indent}\n') + return s.getvalue() + + +def xml_empty_tag(tag_name: str, indent: str, obj: dict, props: tuple) -> str: + """Return an XML tag to an instance of io.StringIO(). Only properties from obj + that are in the props tuple are printed.""" + pairs = [] + for prop in props: + if prop in obj: + if obj[prop] is not None: + #pairs.append("%s=%s" % (prop, xml_attribute(obj[prop]))) + pairs.append(f'{prop}={xml_attribute(obj[prop])}') + attrs = ' '.join(pairs) + return f'{indent}<{tag_name} {attrs}/>\n' + + +def write_tag(s, tagname: str, indent: str, obj: dict, props: tuple): + """Write an XML tag to an instance of io.StringIO(). 
Only properties from obj + that are in the props tuple are printed.""" + pairs = [] + for prop in props: + if prop in obj: + if obj[prop] is not None: + pairs.append("%s=%s" % (prop, xml_attribute(obj[prop]))) + s.write('%s<%s %s/>\n' + % (indent, tagname, ' '.join(pairs))) + + +def xml_attribute(attr): + """Return attr as an XML attribute.""" + return quoteattr(str(attr)) + + +def xml_data(text): + """Return text as XML data.""" + return escape(str(text)) + + +def normalize_id(doc_ids: list, view: View, annotation: Annotation): + """Change identifiers to include the view identifier if it wasn't included, + do nothing otherwise. This applies to the Annotation id, target, source, + document, targets and representatives properties. Note that timePoint is + not included because the value is an integer and not an identifier.""" + # TODO: this seems somewhat fragile + # TODO: spell out what doc_ids is for (to exclude source documents I think) + debug = False + attype = annotation.at_type.shortname + props = annotation.properties + if ':' not in annotation.id and view is not None: + if annotation.id not in doc_ids: + newid = f'{view.id}:{annotation.id}' + annotation.properties['id'] = newid + if 'document' in props: + doc_id = props['document'] + if ':' not in doc_id and view is not None: + if doc_id not in doc_ids: + props['document'] = f'{view.id}:{doc_id}' + if 'targets' in props: + new_targets = [] + for target in props['targets']: + if ':' not in target and view is not None: + if target not in doc_ids: + new_targets.append(f'{view.id}:{target}') + else: + new_targets.append(target) + props['targets'] = new_targets + if 'representatives' in props: + new_representatives = [] + for rep in props['representatives']: + if ':' not in rep and view is not None: + new_representatives.append(f'{view.id}:{rep}') + else: + new_representatives.append(rep) + props['representatives'] = new_representatives + if attype == 'Alignment': + if ':' not in props['source'] and view is not 
None: + if props['source'] not in doc_ids: + props['source'] = f'{view.id}:{props["source"]}' + if ':' not in props['target'] and view is not None: + if props['target'] not in doc_ids: + props['target'] = f'{view.id}:{props["target"]}' + if debug: + print('===', annotation) + + +def get_annotations_from_view(view, annotation_type): + """Return all annotations from a view that match the short name of the + annotation type.""" + # Note: there is method mmif.View.get_annotations() where you can give + # at_type as a parameter, but it requires a full match. + return [a for a in view.annotations + if a.at_type.shortname == annotation_type] + + +def find_matching_tokens(tokens, ne): + matching_tokens = [] + ne_start = ne.properties["start"] + ne_end = ne.properties["end"] + start_token = None + end_token = None + for token in tokens: + if token.properties['start'] == ne_start: + start_token = token + if token.properties['end'] == ne_end: + end_token = token + return start_token, end_token + + diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py index a1b9c59a..1ff6df40 100644 --- a/mmif/utils/video_document_helper.py +++ b/mmif/utils/video_document_helper.py @@ -1,5 +1,7 @@ +import contextvars import importlib import sys +from enum import Enum import math import warnings @@ -12,13 +14,22 @@ from mmif.utils.timeunit_helper import convert from mmif.vocabulary import DocumentTypes -for cv_dep in ('cv2', 'ffmpeg', 'PIL', 'wurlitzer'): +_CV_DEPS = ('cv2', 'PIL', 'wurlitzer') +_cv_import_warning = ( + 'Optional package "{}" is not found. ' + 'You might want to install Computer-Vision dependencies ' + 'by running `pip install mmif-python[cv]=={}`' +) + + +def _check_cv_dep(dep): + """Import a CV dependency, raising ImportError with a helpful message.""" try: - importlib.__import__(cv_dep) + return importlib.__import__(dep) except ImportError as e: - warnings.warn(f"Optional package \"{e.name}\" is not found. 
" - f"You might want to install Computer-Vision dependencies " - f"by running `pip install mmif-python[cv]=={mmif.__version__}`") + raise ImportError( + _cv_import_warning.format(e.name, mmif.__version__) + ) from e FPS_DOCPROP_KEY = 'fps' @@ -27,6 +38,36 @@ DURATIONUNIT_DOCPROP_KEY = 'durationTimeUnit' +class SamplingMode(Enum): + """Determines how timepoints are selected from a TimeFrame.""" + REPRESENTATIVES = "representatives" + SINGLE = "single" + ALL = "all" + + +SAMPLING_MODE_DESCRIPTIONS = { + SamplingMode.REPRESENTATIVES: ( + "uses all representative timepoints if present, " + "otherwise skips the TimeFrame." + ), + SamplingMode.SINGLE: ( + "uses the middle representative if present, otherwise " + "extracts a frame from the midpoint of the start/end " + "interval (midpoint is calculated by floor division " + "of the sum of start and end)." + ), + SamplingMode.ALL: ( + "uses all target timepoints if present, otherwise " + "extracts all frames from the time interval." + ), +} +SAMPLING_MODE_DEFAULT = SamplingMode.REPRESENTATIVES + + +_sampling_mode = contextvars.ContextVar( + 'sampling_mode', default=SamplingMode.REPRESENTATIVES) + + def capture(video_document: Document): """ Captures a video file using OpenCV and adds fps, frame count, and duration as properties to the document. 
@@ -34,7 +75,7 @@ def capture(video_document: Document): :param video_document: :py:class:`~mmif.serialize.annotation.Document` instance that holds a video document (``"@type": ".../VideoDocument/..."``) :return: `OpenCV VideoCapture `_ object """ - import cv2 # pytype: disable=import-error + cv2 = _check_cv_dep('cv2') if video_document is None or video_document.at_type != DocumentTypes.VideoDocument: raise ValueError(f'The document does not exist.') @@ -59,8 +100,8 @@ def get_framerate(video_document: Document) -> float: if video_document is None or video_document.at_type != DocumentTypes.VideoDocument: raise ValueError(f'The document does not exist.') - framerate_keys = (FPS_DOCPROP_KEY, - 'framerate', 'frameRate', 'frame_rate', 'frame-rate', + framerate_keys = (FPS_DOCPROP_KEY, + 'framerate', 'frameRate', 'frame_rate', 'frame-rate', 'framespersecond', 'framesPerSecond', 'frames_per_second', 'frames-per-second', 'framepersecond', 'framePerSecond', 'frame_per_second', 'frame-per-second') for k in framerate_keys: @@ -84,20 +125,23 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], :param record_ffmpeg_errors: if True, records and warns about FFmpeg stderr output during extraction :return: frames as a list of :py:class:`~numpy.ndarray` or :py:class:`~PIL.Image.Image` """ - import cv2 + cv2 = _check_cv_dep('cv2') + # deduplicate and sort frame numbers for extraction, then map back to original order + original_framenums = list(framenums) + unique_framenums = sorted(set(original_framenums)) if as_PIL: - from PIL import Image - frames = [] + Image = _check_cv_dep('PIL').Image + unique_frames = {} video = capture(video_document) cur_f = 0 tot_fcount = video_document.get_property(FRAMECOUNT_DOCPROP_KEY) # when the target frame is more than this frames away, fast-forward instead of reading frame by frame - # this is sanity-checked with a small number of video samples + # this is sanity-checked with a small number of video samples # 
(frame-by-frame ndarrays are compared with fast-forwarded ndarrays) - skip_threadhold = 1000 - framenumi = iter(framenums) # make sure that it's actually an iterator, in case a list is passed + skip_threadhold = 1000 + framenumi = iter(unique_framenums) next_target_f = next(framenumi, None) - from wurlitzer import pipes as cpipes + cpipes = _check_cv_dep('wurlitzer').pipes ffmpeg_errs = StringIO() with cpipes(stderr=ffmpeg_errs, stdout=sys.stdout): while True: @@ -114,18 +158,23 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], sec = convert(cur_f, 'f', 's', video_document.get_property(FPS_DOCPROP_KEY)) warnings.warn(f'Frame #{cur_f} ({sec}s) could not be read from the video {video_document.id} @ {video_document.location} .') else: - frames.append(Image.fromarray(frame[:, :, ::-1]) if as_PIL else frame) + unique_frames[cur_f] = Image.fromarray(frame[:, :, ::-1]) if as_PIL else frame next_target_f = next(framenumi, None) cur_f += 1 ffmpeg_err_str = ffmpeg_errs.getvalue() if ffmpeg_err_str and record_ffmpeg_errors: warnings.warn(f'FFmpeg output during extracting frames: {ffmpeg_err_str}') video.release() - return frames + # return frames in original input order, duplicating where needed + return [unique_frames[f] for f in original_framenums if f in unique_frames] def get_mid_framenum(mmif: Mmif, time_frame: Annotation) -> int: - warnings.warn('This function is deprecated. Use ``get_representative_framenums()`` instead.', DeprecationWarning, stacklevel=2) + """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) return _get_mid_framenum(mmif, time_frame) @@ -145,6 +194,9 @@ def _get_mid_framenum(mmif: Mmif, time_frame: Annotation) -> int: def extract_mid_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False): """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. 
+ Extracts the middle frame of a time interval annotation as a numpy ndarray. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance @@ -152,21 +204,25 @@ def extract_mid_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False): :param as_PIL: return :py:class:`~PIL.Image.Image` instead of :py:class:`~numpy.ndarray` :return: frame as a :py:class:`numpy.ndarray` or :py:class:`PIL.Image.Image` """ - warnings.warn('This function is deprecated. Use ``extract_representative_frames()`` instead.', DeprecationWarning, stacklevel=2) + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) vd = mmif[time_frame.get_property('document')] return extract_frames_as_images(vd, [get_mid_framenum(mmif, time_frame)], as_PIL=as_PIL)[0] def get_representative_framenums(mmif: Mmif, time_frame: Annotation) -> List[int]: """ - Calculates the representative frame numbers from an annotation. To pick the representative frames, it first looks - up the ``representatives`` property of the ``TimeFrame`` annotation. If it is not found, it will calculate the + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + + Calculates the representative frame numbers from an annotation. To pick the representative frames, it first looks + up the ``representatives`` property of the ``TimeFrame`` annotation. If it is not found, it will calculate the number of the middle frame. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` instance that holds a time interval annotation containing a `representatives` property (``"@type": ".../TimeFrame/..."``) :return: representative frame number as an integer """ + warnings.warn('This function is deprecated. 
Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) if 'representatives' not in time_frame.properties: return [_get_mid_framenum(mmif, time_frame)] timeunit = time_frame.get_property('timeUnit') @@ -185,9 +241,13 @@ def get_representative_framenums(mmif: Mmif, time_frame: Annotation) -> List[int def get_representative_framenum(mmif: Mmif, time_frame: Annotation) -> int: """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + A thin wrapper around :py:func:`get_representative_framenums` to return a single representative frame number. Always return the first frame number found. """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) try: return get_representative_framenums(mmif, time_frame)[0] except IndexError: @@ -196,6 +256,9 @@ def get_representative_framenum(mmif: Mmif, time_frame: Annotation) -> int: def extract_representative_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False, first_only: bool = True): """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + Extracts the representative frame of an annotation as a numpy ndarray or PIL Image. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance @@ -204,11 +267,197 @@ def extract_representative_frame(mmif: Mmif, time_frame: Annotation, as_PIL: boo :param first_only: return the first representative frame only :return: frame as a :py:class:`numpy.ndarray` or :py:class:`PIL.Image.Image` """ + warnings.warn('This function is deprecated. 
Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) video_document = mmif[time_frame.get_property('document')] rep_frame_num = [get_representative_framenum(mmif, time_frame)] if first_only else get_representative_framenums(mmif, time_frame) return extract_frames_as_images(video_document, rep_frame_num, as_PIL=as_PIL)[0] +def _tp_ids_to_framenums(mmif: Mmif, tp_ids: List[str]) -> List[int]: + """ + Converts a list of timepoint annotation IDs to frame numbers. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param tp_ids: list of timepoint annotation IDs + :return: list of frame numbers + """ + return [ + int(convert_timepoint(mmif, mmif[tp_id], 'f')) + for tp_id in tp_ids + ] + + +def _resolve_video_document(mmif: Mmif, time_frame: Annotation): + """ + Resolves the video document associated with a TimeFrame. + Checks the TimeFrame's own ``document`` property first, + then falls back to the ``document`` property of the first + target timepoint. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: :py:class:`~mmif.serialize.annotation.Document` + """ + if 'document' in time_frame.properties: + return mmif[time_frame.get_property('document')] + if 'targets' in time_frame.properties: + targets = time_frame.get_property('targets') + if targets: + tp = mmif[targets[0]] + return mmif[tp.get_property('document')] + raise ValueError( + f'Cannot resolve video document for TimeFrame ' + f'{time_frame.id}.') + + +def _timeframe_to_frame_range( + mmif: Mmif, time_frame: Annotation +) -> Tuple[int, int]: + """ + Converts a TimeFrame's start/end to frame numbers. 
+ + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame with ``start``, ``end``, + ``timeUnit``, and ``document`` properties + :return: tuple of (start_frame, end_frame) + """ + start, end = convert_timeframe(mmif, time_frame, 'f') + return int(start), int(end) + + +def _sample_all(mmif: Mmif, time_frame: Annotation) -> List[int]: + """ + Samples all frame numbers from a TimeFrame. Uses all + ``targets`` if present, otherwise generates every frame + in the start/end interval. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list of frame numbers + """ + if 'targets' in time_frame.properties: + return _tp_ids_to_framenums( + mmif, time_frame.get_property('targets')) + start, end = _timeframe_to_frame_range(mmif, time_frame) + return sample_frames(start, end) + + +def _sample_representatives( + mmif: Mmif, time_frame: Annotation +) -> List[int]: + """ + Samples frame numbers from a TimeFrame's representatives. + Returns an empty list if ``representatives`` is not present + (skips the TimeFrame). + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list of frame numbers (empty if no representatives) + """ + if 'representatives' in time_frame.properties: + reps = time_frame.get_property('representatives') + if reps: + return _tp_ids_to_framenums(mmif, reps) + return [] + + +def _sample_single(mmif: Mmif, time_frame: Annotation) -> List[int]: + """ + Samples a single frame number from a TimeFrame. Uses the + middle representative if ``representatives`` is present, + otherwise computes the midpoint of the start/end interval + via floor division. 
+ + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list containing a single frame number + """ + if 'representatives' in time_frame.properties: + reps = time_frame.get_property('representatives') + if reps: + mid = reps[len(reps) // 2] + return _tp_ids_to_framenums(mmif, [mid]) + start, end = _timeframe_to_frame_range(mmif, time_frame) + return [(start + end) // 2] + + +def extract_target_frames(mmif: Mmif, annotation: Annotation, min_timepoints: int = 0, max_timepoints: int = sys.maxsize, fraction: float = 1.0, as_PIL: bool = False): + """ + Extracts frames corresponding to the timepoints listed in the ``targets`` property of an annotation. + Selection of timepoints is based on minimum, maximum, and fraction of targets to include. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param annotation: :py:class:`~mmif.serialize.annotation.Annotation` instance containing a ``targets`` property + :param min_timepoints: minimum number of timepoints to include + :param max_timepoints: maximum number of timepoints to include + :param fraction: fraction of targets to include (ideally) + :param as_PIL: return :py:class:`~PIL.Image.Image` instead of :py:class:`~numpy.ndarray` + :return: a tuple containing (list of frames, list of selected target IDs) + """ + if 'targets' not in annotation.properties: + raise ValueError(f'Annotation {annotation.id} does not have a "targets" property.') + + targets = annotation.get_property('targets') + num_targets = len(targets) + if num_targets == 0: + return [], [] + + ideal_count = int(num_targets * fraction) + count = max(min_timepoints, ideal_count) + count = min(max_timepoints, count) + count = min(num_targets, count) + + if count == 1: + indices = [num_targets // 2] + else: + indices = [int(i * (num_targets - 1) / (count - 1)) for i in range(count)] + + selected_target_ids = [targets[i] for i in 
indices] + frame_nums = _tp_ids_to_framenums(mmif, selected_target_ids) + video_doc = _resolve_video_document(mmif, annotation) + images = extract_frames_as_images(video_doc, frame_nums, as_PIL=as_PIL) + return images, selected_target_ids + + +def extract_frames_by_mode( + mmif: Mmif, + time_frame: Annotation, + mode: Union[SamplingMode, None] = None, + as_PIL: bool = False +) -> List: + """ + Extracts frames from a TimeFrame annotation based on a + sampling mode. If ``mode`` is not specified, uses the + context-level default (set via + :py:data:`_sampling_mode` context variable). + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: TimeFrame annotation to sample from + :param mode: :py:class:`SamplingMode`, or None to use + the context default + :param as_PIL: return PIL Images instead of ndarrays + :return: list of frames (may be empty for + ``REPRESENTATIVES`` mode when no representatives exist) + """ + if mode is None: + mode = _sampling_mode.get() + if mode == SamplingMode.ALL: + frame_nums = _sample_all(mmif, time_frame) + elif mode == SamplingMode.REPRESENTATIVES: + frame_nums = _sample_representatives(mmif, time_frame) + else: + frame_nums = _sample_single(mmif, time_frame) + if not frame_nums: + return [] + video_doc = _resolve_video_document(mmif, time_frame) + return extract_frames_as_images(video_doc, frame_nums, as_PIL=as_PIL) + + def sample_frames(start_frame: int, end_frame: int, sample_rate: float = 1) -> List[int]: """ Helper function to sample frames from a time interval. 
diff --git a/mmif/utils/workflow_helper.py b/mmif/utils/workflow_helper.py index 7980eb89..bdde664a 100644 --- a/mmif/utils/workflow_helper.py +++ b/mmif/utils/workflow_helper.py @@ -1,13 +1,16 @@ import datetime import hashlib -from collections import Counter, defaultdict -from pathlib import Path -from typing import List, Any, Tuple, Optional, Union import itertools -from mmif import Mmif +from collections import Counter +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Tuple, Union, overload + +from pydantic import BaseModel, ConfigDict, Field +from mmif.serialize.mmif import Mmif, ViewsList -def group_views_by_app(views: List[Any]) -> List[List[Any]]: + +def group_views_by_app(views: ViewsList) -> List[List[Any]]: """ Groups views into app executions based on app and timestamp. @@ -73,9 +76,43 @@ def generate_param_hash(params: dict) -> str: return hashlib.md5(param_string.encode('utf-8')).hexdigest() -def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: +def _read_mmif_from_path(mmif_input: Union[str, Path, Mmif]) -> Mmif: """ - Generate a workflow identifier string from a MMIF file. + Helper function to get a Mmif object from various input types. + + :param mmif_input: Either a file path (str or Path) or an existing Mmif object + :return: Mmif object + :raises ValueError: If input is not a valid type + """ + if isinstance(mmif_input, Mmif): + return mmif_input + elif isinstance(mmif_input, (str, Path)): + with open(mmif_input, "r") as f: + mmif_str = f.read() + return Mmif(mmif_str) + else: + raise ValueError( + "MMIF input must be a string path, a Path object, or a Mmif object." + ) + + +@overload +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: Literal[True] + ) -> Tuple[str, List[dict]]: ... + + +@overload +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: Literal[False] = False + ) -> str: ... 
+ + +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: bool = False + ) -> Union[str, Tuple[str, List[dict]]]: + """ + Generate a workflow identifier string from a MMIF file or object. The identifier follows the storage directory structure format: app_name/version/param_hash/app_name2/version2/param_hash2/... @@ -83,25 +120,18 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: Uses view.metadata.parameters (raw user-passed values) for hashing to ensure reproducibility. Views with errors or warnings are excluded from the identifier; empty views are included. - """ - if not isinstance(mmif_file, (str, Path)): - raise ValueError( - "MMIF file path must be a string or a Path object." - ) - with open(mmif_file, "r") as f: - mmif_str = f.read() - - data = Mmif(mmif_str) + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object + :param return_param_dicts: If True, also return the parameter dictionaries + :return: Workflow identifier string, or tuple of (identifier, param_dicts) if return_param_dicts=True + """ + data = _read_mmif_from_path(mmif_input) segments = [] - # First prefix is source information, sorted by document type - sources = Counter(doc.at_type.shortname for doc in data.documents) - segments.append('-'.join([f'{k}-{sources[k]}' for k in sorted(sources.keys())])) - # Group views into runs grouped_apps = group_views_by_app(data.views) + param_dicts = [] for app_execution in grouped_apps: # Use the first view in the run as representative for metadata first_view = app_execution[0] @@ -120,6 +150,7 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: param_dict = first_view.metadata.parameters except (KeyError, AttributeError): param_dict = {} + param_dicts.append(param_dict) param_hash = generate_param_hash(param_dict) @@ -128,10 +159,58 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: version_str = app_version if app_version else 
"unversioned" segments.append(f"{name_str}/{version_str}/{param_hash}") + if return_param_dicts: + return '/'.join(segments), param_dicts return '/'.join(segments) -def _get_profile_data(view) -> dict: +## single MMIF summarization + +class SingleMmifStats(BaseModel): + """ + Aggregated statistics for a single MMIF file. + """ + model_config = ConfigDict(populate_by_name=True) + + app_count: int = Field(..., alias="appCount", description="Total number of app executions identified.") + error_views: List[str] = Field(default_factory=list, alias="errorViews", description="List of view IDs that contain errors.") + warning_views: List[str] = Field(default_factory=list, alias="warningViews", description="List of view IDs that contain warnings.") + empty_views: List[str] = Field(default_factory=list, alias="emptyViews", description="List of view IDs that contain no annotations.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the file.") + +class AppProfiling(BaseModel): + """ + Profiling data for a single app execution. + """ + model_config = ConfigDict(populate_by_name=True) + + running_time_ms: Optional[int] = Field(default=None, alias="runningTimeMS", description="Execution time in milliseconds.") + +class AppExecution(BaseModel): + """ + Represents a single execution of an app, which may produce multiple views. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + view_ids: List[str] = Field(..., alias="viewIds", description="List of view IDs generated by this execution.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Configuration parameters used for this execution.") + app_profiling: AppProfiling = Field(default_factory=lambda: AppProfiling(), alias="appProfiling", description="Profiling data for this execution.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Counts of annotations produced, grouped by type.") + + +class SingleMmifDesc(BaseModel): + """ + Description of a workflow extracted from a single MMIF file. + """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow structure.") + stats: SingleMmifStats = Field(..., description="Statistics about the views and annotations.") + apps: List[AppExecution] = Field(..., description="Sequence of app executions in the workflow.") + + +def _get_profile_data(view) -> AppProfiling: """ Extract profiling data from a view's metadata. @@ -150,18 +229,18 @@ def _get_profile_data(view) -> dict: running_time_str = profiling.get("runningTime") if running_time_str is None: - return {} + return AppProfiling(runningTimeMS=None) # the format is datetime.timedelta string, e.g. 
'0:00:02.345678' # need to convert to milliseconds integer time_obj = datetime.datetime.strptime(running_time_str, "%H:%M:%S.%f").time() milliseconds = (time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second) * 1000 + time_obj.microsecond // 1000 - return {"runningTimeMS": milliseconds} + return AppProfiling(runningTimeMS=milliseconds) -def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: +def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: """ - Reads a MMIF file and extracts the workflow specification from it. + Reads a MMIF file or object and extracts the workflow specification from it. This function provides an app-centric summarization of the workflow. The conceptual hierarchy is that a **workflow** is a sequence of **apps**, @@ -170,61 +249,24 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: a single logical "app execution". .. note:: - For MMIF files generated by ``clams-python`` <= 1.3.3, all views - are independently timestamped. This means that even if multiple views - were generated by a single execution of an app, their + For MMIF files generated by apps based on ``clams-python`` <= 1.3.3, all + views are independently timestamped. This means that even if multiple + views were generated by a single execution of an app, their ``metadata.timestamp`` values will be unique. As a result, the grouping logic will treat each view as a separate app execution. The change that aligns timestamps for views from a single app execution is implemented in `clams-python PR #271 `_. - The output format is a dictionary with the following keys: - - * ``workflowId`` - A unique identifier for the workflow, based on the - sequence of app executions (app, version, parameter hashes). App - executions with errors are excluded from this identifier. App - executions with warnings are still considered successful for the purpose - of this identifier. 
- * ``stats`` - A dictionary with the following keys: - - ``appCount`` - Total number of identified app executions. - ``errorViews`` - A list of view IDs that reported errors. - ``warningViews`` - A list of view IDs that reported warnings. - ``emptyViews`` - A list of view IDs that contain no annotations. - ``annotationCountByType`` - A dictionary mapping each annotation type to its count, plus a - ``total`` key for the sum of all annotations across all app - executions. - * ``apps`` - A list of objects, where each object represents one app - execution. It includes metadata, profiling, and aggregated statistics - for all views generated by that execution. A special entry for views - that could not be assigned to an execution will be at the end of the list. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. - - :param mmif_file: Path to the MMIF file + The output is a serialized :class:`~SingleMmifDesc` object. + + .. pydantic_model:: SingleMmifDesc + :noindex: + + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object :return: A dictionary containing the workflow specification. """ - if not isinstance(mmif_file, (str, Path)): - raise ValueError( - "MMIF file path must be a string or a Path object." 
- ) - - workflow_id = generate_workflow_identifier(mmif_file) - with open(mmif_file, "r") as f: - mmif_str = f.read() - - mmif = Mmif(mmif_str) + mmif = _read_mmif_from_path(mmif_input) error_view_ids = [] warning_view_ids = [] @@ -249,17 +291,21 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: execution_view_ids = [v.id for v in group] processed_view_ids.update(execution_view_ids) - app_data = { - "app": first_view.metadata.app, - "viewIds": execution_view_ids, - "appConfiguration": first_view.metadata.get("appConfiguration", {}), - "appProfiling": _get_profile_data(first_view), - } + # Prepare annotation counts total_annotations_in_exec = sum(execution_ann_counter.values()) if total_annotations_in_exec > 0: - app_data['annotationCountByType'] = dict(execution_ann_counter) - app_data['annotationCountByType']['total'] = total_annotations_in_exec - grouped_apps.append(app_data) + count_dict = dict(execution_ann_counter) + count_dict['total'] = total_annotations_in_exec + else: + count_dict = {} + + grouped_apps.append(AppExecution( + app=first_view.metadata.app, + viewIds=execution_view_ids, + appConfiguration=first_view.metadata.get("appConfiguration", {}), + appProfiling=_get_profile_data(first_view), + annotationCountByType=count_dict + )) # Handle unassigned and problematic views all_view_ids = set(v.id for v in mmif.views) @@ -279,19 +325,23 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: app_count = len(grouped_apps) if unassigned_view_ids: - grouped_apps.append({ - "app": "http://apps.clams.ai/non-existing-app/v1", - "viewIds": sorted(list(unassigned_view_ids)) - }) + grouped_apps.append(AppExecution( + app="http://apps.clams.ai/non-existing-app/v1", + viewIds=sorted(list(unassigned_view_ids)), + appConfiguration={}, + appProfiling=AppProfiling(runningTimeMS=None), + annotationCountByType={} + )) # aggregate total annotation counts total_annotations_by_type = Counter() for execution in grouped_apps: # Only aggregate from 
actual apps, not the special unassigned entry - if execution.get('app') != "http://apps.clams.ai/non-existing-app/v1": - if 'annotationCountByType' in execution: - exec_counts = execution['annotationCountByType'].copy() - del exec_counts['total'] + if execution.app != "http://apps.clams.ai/non-existing-app/v1": + if execution.annotation_count_by_type: + exec_counts = execution.annotation_count_by_type.copy() + if 'total' in exec_counts: + del exec_counts['total'] total_annotations_by_type.update(Counter(exec_counts)) final_total_annotations = sum(total_annotations_by_type.values()) @@ -299,17 +349,79 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: if final_total_annotations > 0: final_annotation_counts['total'] = final_total_annotations - return { - "workflowId": workflow_id, - "stats": { - "appCount": app_count, - "errorViews": error_view_ids, - "warningViews": warning_view_ids, - "emptyViews": empty_view_ids, - "annotationCountByType": final_annotation_counts - }, - "apps": grouped_apps - } + return SingleMmifDesc( + workflowId=generate_workflow_identifier(mmif, return_param_dicts=False), + stats=SingleMmifStats( + appCount=app_count, + errorViews=error_view_ids, + warningViews=warning_view_ids, + emptyViews=empty_view_ids, + annotationCountByType=final_annotation_counts + ), + apps=grouped_apps + ).model_dump(by_alias=True) + + +## MMIF collection summarization + +class AppProfilingStats(BaseModel): + """ + Aggregated profiling statistics for an app across a workflow. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + avg_running_time_ms: Optional[float] = Field(default=None, alias="avgRunningTimeMS", description="Average execution time in milliseconds.") + min_running_time_ms: Optional[float] = Field(default=None, alias="minRunningTimeMS", description="Minimum execution time in milliseconds.") + max_running_time_ms: Optional[float] = Field(default=None, alias="maxRunningTimeMS", description="Maximum execution time in milliseconds.") + stdev_running_time_ms: Optional[float] = Field(default=None, alias="stdevRunningTimeMS", description="Standard deviation of execution time.") + + + + +class WorkflowAppExecution(BaseModel): + """ + Aggregated information about an app's usage within a specific workflow across multiple files. + """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Representative configuration (usually from the first occurrence).") + app_profiling: AppProfilingStats = Field(default_factory=lambda: AppProfilingStats(), alias="appProfiling", description="Aggregated profiling statistics.") + + +class WorkflowCollectionEntry(BaseModel): + """ + Summary of a unique workflow found within a collection. + """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow.") + mmifs: List[str] = Field(..., description="List of filenames belonging to this workflow.") + mmif_count: int = Field(..., alias="mmifCount", description="Number of MMIF files matching this workflow.") + apps: List[WorkflowAppExecution] = Field(..., description="Sequence of apps in this workflow with aggregated stats.") + +class MmifCountByStatus(BaseModel): + """ + Breakdown of MMIF files in a collection by their processing status. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + total: int = Field(..., description="Total number of MMIF files found.") + successful: int = Field(..., description="Number of files processed without errors.") + with_errors: int = Field(..., alias="withErrors", description="Number of files containing error views.") + with_warnings: int = Field(..., alias="withWarnings", description="Number of files containing warning views.") + invalid: int = Field(..., description="Number of files that failed to parse as valid MMIF.") + + +class CollectionMmifDesc(BaseModel): + """ + Summary of a collection of MMIF files. + """ + model_config = ConfigDict(populate_by_name=True) + + mmif_count_by_status: MmifCountByStatus = Field(..., alias="mmifCountByStatus", description="Counts of MMIF files by status.") + workflows: List[WorkflowCollectionEntry] = Field(..., description="List of unique workflows identified in the collection.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the entire collection.") def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: @@ -319,139 +431,115 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: This function provides an overview of a collection of MMIF files, aggregating statistics across multiple files. - The output format is a dictionary with the following keys: - - * ``mmifCountByStatus`` - A dictionary summarizing the processing status of all MMIF files in the - collection. It includes: - - ``total`` - Total number of MMIF files found. - ``successful`` - Number of MMIF files processed without errors (may contain warnings). - ``withErrors`` - Number of MMIF files containing app executions that reported errors. - ``withWarnings`` - Number of MMIF files containing app executions that reported warnings. - ``invalid`` - Number of files that failed to be parsed as valid MMIF. 
- * ``workflows`` - A list of "workflow" objects found in the "successful" MMIF files (files - with errors are excluded), where each object contains: - - ``workflowId`` - The unique identifier for the workflow. - ``apps`` - A list of app objects, each with ``app`` (name+ver identifier), - ``appConfiguration``, and ``appProfiling`` statistics (avg, min, max, - stdev running times) aggregated per workflow. - ``mmifs`` - A list of MMIF file basenames belonging to this workflow. - ``mmifCount`` - The number of MMIF files in this workflow. - * ``annotationCountByType`` - A dictionary aggregating annotation counts across the entire collection. - It includes a ``total`` key for the grand total, plus integer counts for - each individual annotation type. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. + The output is a serialized :class:`~CollectionMmifDesc` object. + + .. pydantic_model:: CollectionMmifDesc + :noindex: :param mmif_dir: Path to the directory containing MMIF files. :return: A dictionary containing the summarized collection specification. 
""" import statistics - from collections import defaultdict, Counter + from collections import Counter mmif_files = list(Path(mmif_dir).glob('*.mmif')) - status_summary = defaultdict(int) - status_summary['total'] = len(mmif_files) - status_summary['successful'] = 0 - status_summary['withErrors'] = 0 - status_summary['withWarnings'] = 0 - status_summary['invalid'] = 0 + status_summary = MmifCountByStatus( + total=len(mmif_files), + successful=0, + withErrors=0, + withWarnings=0, + invalid=0 + ) aggregated_counts = Counter() - workflows_data = defaultdict(lambda: { - 'mmifs': [], - 'apps': defaultdict(lambda: { - 'appConfiguration': None, # Store the first config here - 'execution_times': [] - }) - }) + # Structure: {workflow_id: {'mmifs': [...], 'apps': {app_uri: {'appConfiguration': ..., 'execution_times': [...]}}}} + workflows_data: Dict[str, Dict] = {} for mmif_file in mmif_files: try: - single_report = describe_single_mmif(mmif_file) - except Exception as e: - status_summary['invalid'] += 1 + single_report = SingleMmifDesc.model_validate(describe_single_mmif(mmif_file)) + except Exception: + status_summary.invalid += 1 continue - if single_report['stats']['errorViews']: - status_summary['withErrors'] += 1 + if single_report.stats.error_views: + status_summary.with_errors += 1 continue # Exclude from all other stats # If we get here, the MMIF has no errors and is considered "successful" - status_summary['successful'] += 1 - if single_report['stats']['warningViews']: - status_summary['withWarnings'] += 1 - - wf_id = single_report['workflowId'] + status_summary.successful += 1 + if single_report.stats.warning_views: + status_summary.with_warnings += 1 + + wf_id = single_report.workflow_id + # Initialize workflow entry if not exists + if wf_id not in workflows_data: + workflows_data[wf_id] = {'mmifs': [], 'apps': {}} workflows_data[wf_id]['mmifs'].append(Path(mmif_file).name) # Aggregate annotation counts for successful mmifs - report_counts = 
single_report['stats'].get('annotationCountByType', {}) + report_counts = single_report.stats.annotation_count_by_type.copy() if 'total' in report_counts: del report_counts['total'] # don't add the sub-total to the main counter aggregated_counts.update(report_counts) - for app_exec in single_report.get('apps', []): - app_uri = app_exec.get('app') + for app_exec in single_report.apps: + app_uri = app_exec.app # skip the special "unassigned" app if app_uri and app_uri != "http://apps.clams.ai/non-existing-app/v1": - running_time = app_exec.get('appProfiling', {}).get('runningTimeMS') + # Initialize app entry if not exists + if app_uri not in workflows_data[wf_id]['apps']: + workflows_data[wf_id]['apps'][app_uri] = { + 'appConfiguration': None, + 'execution_times': [] + } + + running_time = app_exec.app_profiling.running_time_ms if running_time is not None: workflows_data[wf_id]['apps'][app_uri]['execution_times'].append(running_time) # Store the first non-empty app configuration we find for this app in this workflow if workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] is None: - config = app_exec.get('appConfiguration', {}) + config = app_exec.app_configuration if config: workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] = config # Process collected data into the final output format final_workflows_list = [] for wf_id, wf_data in sorted(workflows_data.items()): - workflow_object = { - 'workflowId': wf_id, - 'mmifs': sorted(wf_data['mmifs']), - 'mmifCount': len(wf_data['mmifs']), - 'apps': [] - } + workflow_apps = [] for app_uri, app_data in sorted(wf_data['apps'].items()): times = app_data['execution_times'] if times: - profiling_stats = { - 'avgRunningTimeMS': statistics.mean(times), - 'minRunningTimeMS': min(times), - 'maxRunningTimeMS': max(times), - 'stdevRunningTimeMS': statistics.stdev(times) if len(times) > 1 else 0 - } + profiling_stats = AppProfilingStats( + avgRunningTimeMS=statistics.mean(times), + minRunningTimeMS=min(times), + 
maxRunningTimeMS=max(times), + stdevRunningTimeMS=statistics.stdev(times) if len(times) > 1 else 0 + ) else: - profiling_stats = {} - - app_object = { - 'app': app_uri, - 'appConfiguration': app_data['appConfiguration'] or {}, # Default to empty dict - 'appProfiling': profiling_stats - } - workflow_object['apps'].append(app_object) - - final_workflows_list.append(workflow_object) + profiling_stats = AppProfilingStats( + avgRunningTimeMS=None, + minRunningTimeMS=None, + maxRunningTimeMS=None, + stdevRunningTimeMS=None + ) + + workflow_apps.append(WorkflowAppExecution( + app=app_uri, + appConfiguration=app_data['appConfiguration'] or {}, + appProfiling=profiling_stats + )) + + final_workflows_list.append(WorkflowCollectionEntry( + workflowId=wf_id, + mmifs=sorted(wf_data['mmifs']), + mmifCount=len(wf_data['mmifs']), + apps=workflow_apps + )) # Finalize annotation counts final_annotation_counts = dict(aggregated_counts) @@ -459,8 +547,8 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: if grand_total > 0: final_annotation_counts['total'] = grand_total - return { - 'mmifCountByStatus': dict(status_summary), - 'workflows': final_workflows_list, - 'annotationCountByType': final_annotation_counts - } + return CollectionMmifDesc( + mmifCountByStatus=status_summary, + workflows=final_workflows_list, + annotationCountByType=final_annotation_counts + ).model_dump(by_alias=True) diff --git a/mmif_docloc_http/__init__.py b/mmif_docloc_http/__init__.py index 9bdf9f22..1d954474 100644 --- a/mmif_docloc_http/__init__.py +++ b/mmif_docloc_http/__init__.py @@ -1,16 +1,30 @@ +""" +MMIF document location helper module for ``http`` and ``https`` schemes. + +If you want to write your own docloc scheme handler, please use the source +code of this module as a reference. See the :ref:`plug-in section ` +for more information. 
+""" + import urllib.request import urllib.error +_cache = {} + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] try: if docloc.startswith('http://') or docloc.startswith('https://'): - return urllib.request.urlretrieve(docloc)[0] + path = urllib.request.urlretrieve(docloc)[0] + _cache[docloc] = path + return path else: raise ValueError(f'cannot handle document location scheme: {docloc}') except urllib.error.URLError as e: raise e - - + + def help(): return "location must be a URL string." diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..07055628 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = mmif tests +python_files = test_*.py *_test.py diff --git a/requirements.txt b/requirements.txt index a97c214e..c3e9d722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ orderly-set==5.3.* # 5.4 drops py38 support jsonschema +pydantic>=2.0 diff --git a/tests/mmif_examples.py b/tests/mmif_examples.py index b19f9d9a..8a8f4c6f 100644 --- a/tests/mmif_examples.py +++ b/tests/mmif_examples.py @@ -55,7 +55,7 @@ def _load_from_url_or_git(url): old_mmif_w_short_id_url = f"https://raw.githubusercontent.com/clamsproject/mmif/1.0.5/specifications/samples/everything/raw.json" EVERYTHING_JSON = _load_from_url_or_git(everything_file_url) OLD_SHORTID_JSON = _load_from_url_or_git(old_mmif_w_short_id_url) -SWT_1_0_JSON = open('tests/samples/1.0/swt.mmif').read() +SWT_1_0_JSON = (Path(__file__).resolve().parent / 'samples' / '1.0' / 'swt.mmif').read_text() # for keys and values in chain all typevers in mmif.vocabulary.*_types modules # merge into a single dict diff --git a/tests/test_serialize.py b/tests/test_serialize.py index b0836c5a..f5b0846f 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -269,6 +269,21 @@ def test_document_location_helpers_http(self): # round_trip = Document(new_doc.serialize()) self.assertEqual(Document(new_doc.serialize()).serialize(), new_doc.serialize()) + def 
test_document_location_http_caching(self): + import mmif_docloc_http + mmif_docloc_http._cache.clear() + test_url = "https://example.com/" + self.assertNotIn(test_url, mmif_docloc_http._cache) + new_doc = Document() + new_doc.id = "d1" + new_doc.location = test_url + new_doc.location_path() + self.assertIn(test_url, mmif_docloc_http._cache) + # second call should use cache (same path returned) + cached_path = mmif_docloc_http._cache[test_url] + second_path = new_doc.location_path() + self.assertEqual(cached_path, second_path) + def test_get_documents_locations(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) self.assertEqual(1, len(mmif_obj.get_documents_locations(DocumentTypes.VideoDocument))) @@ -593,6 +608,25 @@ def test_get_label(self): a = v.new_annotation(AnnotationTypes.BoundingBox) _ = a._get_label() + def test_timestamp_uses_utc_with_z_suffix(self): + """Test that timestamps are in UTC with 'Z' suffix to avoid ambiguity""" + from datetime import timezone + mmif_obj = Mmif(validate=False) + + new_view = mmif_obj.new_view() + new_view.metadata.app = "http://test.app" + + # Verify the timestamp is timezone-aware and uses UTC + self.assertIsNotNone(new_view.metadata.timestamp) + self.assertIsNotNone(new_view.metadata.timestamp.tzinfo) + self.assertEqual(new_view.metadata.timestamp.tzinfo, timezone.utc) + + # Verify serialization uses 'Z' suffix instead of '+00:00' + serialized = json.loads(mmif_obj.serialize()) + ts = serialized['views'][0]['metadata']['timestamp'] + self.assertTrue(ts.endswith('Z')) + self.assertNotIn('+00:00', ts) + def test_get_anchor_point(self): mmif = Mmif(validate=False) v1 = mmif.new_view() diff --git a/tests/test_utils.py b/tests/test_utils.py index 0c261fe7..1d903b10 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,17 +1,26 @@ +import json +import os import pathlib -import unittest import tempfile -import json +import unittest +from pathlib import Path +from unittest import mock import pytest - -from mmif import 
Mmif, Document, AnnotationTypes +from hypothesis import given +from hypothesis import strategies as st + +from mmif import ( + AnnotationTypes, + Document, + Mmif +) from mmif.utils import sequence_helper as sqh from mmif.utils import text_document_helper as tdh from mmif.utils import timeunit_helper as tuh from mmif.utils import video_document_helper as vdh -from tests.mmif_examples import * -from hypothesis import given, strategies as st +from mmif.utils import workflow_helper as wfh +from tests import mmif_examples class TestTimeunitHelper(unittest.TestCase): @@ -135,6 +144,86 @@ def test_extract_frames_as_images(self): self.assertEqual(4, len(frame_list)) self.assertEqual(3, len(new_target_images)) + def test_sample_all(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps]) + + frame_nums = vdh._sample_all(self.mmif_obj, parent_ann) + self.assertEqual(10, len(frame_nums)) + self.assertEqual([i * 100 for i in range(10)], frame_nums) + + # start/end fallback (no targets) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + start=0, end=10, timeUnit='frame', + document=self.video_doc.id) + frame_nums2 = vdh._sample_all(self.mmif_obj, parent_ann2) + self.assertEqual(list(range(10)), frame_nums2) + + def test_sample_representatives(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + reps = [tps[2].id, tps[5].id, tps[8].id] + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps], + representatives=reps) + + # should use representatives + frame_nums = vdh._sample_representatives( + self.mmif_obj, parent_ann) + 
self.assertEqual(3, len(frame_nums)) + self.assertEqual([200, 500, 800], frame_nums) + + # without representatives, should return empty (skip) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps]) + frame_nums2 = vdh._sample_representatives( + self.mmif_obj, parent_ann2) + self.assertEqual([], frame_nums2) + + def test_sample_single(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + reps = [tps[2].id, tps[5].id, tps[8].id] + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps], + representatives=reps) + + # should pick middle representative (index 1 of 3 = tps[5]) + frame_nums = vdh._sample_single( + self.mmif_obj, parent_ann) + self.assertEqual([500], frame_nums) + + # start/end fallback (no representatives) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + start=100, end=500, timeUnit='frame', + document=self.video_doc.id) + frame_nums2 = vdh._sample_single( + self.mmif_obj, parent_ann2) + self.assertEqual([300], frame_nums2) + class TestSequenceHelper(unittest.TestCase): @@ -205,7 +294,7 @@ def test_width_based_smoothing(self): class TestTextDocHelper(unittest.TestCase): - mmif_obj = Mmif(MMIF_EXAMPLES['everything']) + mmif_obj = Mmif(mmif_examples.MMIF_EXAMPLES['everything']) @pytest.mark.skip("The only valid test cases come from kaldi app which annotates wrong property") def test_slice_text(self): @@ -232,8 +321,6 @@ def setUp(self) -> None: def create_temp_mmif_file(self, mmif_obj): """Helper to create a temporary MMIF file.""" - import tempfile - import json tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) if isinstance(mmif_obj, Mmif): content_to_write = mmif_obj.serialize(pretty=False) @@ -244,24 +331,20 @@ def create_temp_mmif_file(self, mmif_obj): return tmp.name def 
test_split_appname_appversion(self): - from mmif.utils.workflow_helper import _split_appname_appversion - app_name, app_version = _split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") + app_name, app_version = wfh._split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") self.assertEqual(app_name, "test-app") self.assertEqual(app_version, "v1.0.0") def test_generate_param_hash(self): - from mmif.utils.workflow_helper import generate_param_hash params = {"param1": "value1", "param2": 42} - hash1 = generate_param_hash(params) - hash2 = generate_param_hash(params) + hash1 = wfh.generate_param_hash(params) + hash2 = wfh.generate_param_hash(params) self.assertEqual(hash1, hash2) params_reversed = {"param2": 42, "param1": "value1"} - hash3 = generate_param_hash(params_reversed) + hash3 = wfh.generate_param_hash(params_reversed) self.assertEqual(hash1, hash3) def test_generate_workflow_identifier_grouped(self): - from mmif.vocabulary import AnnotationTypes - from mmif.utils import workflow_helper view1 = self.basic_mmif.new_view() view1.metadata.app = "http://apps.clams.ai/app1/v1.0.0" view1.metadata.timestamp = "2024-01-01T12:00:00Z" @@ -274,14 +357,210 @@ def test_generate_workflow_identifier_grouped(self): tmp_file = self.create_temp_mmif_file(self.basic_mmif) import os try: - workflow_id = workflow_helper.generate_workflow_identifier(tmp_file) + workflow_id = wfh.generate_workflow_identifier(tmp_file) segments = workflow_id.split('/') - self.assertEqual(len(segments), 7) - self.assertIn('app1', segments[1]) - self.assertIn('app2', segments[4]) + self.assertEqual(len(segments), 6) + self.assertIn('app1', segments[0]) + self.assertIn('app2', segments[3]) finally: os.unlink(tmp_file) + def test_generate_workflow_identifier_with_mmif_object(self): + """Test that generate_workflow_identifier accepts Mmif objects directly.""" + import os + + # Test with Mmif object directly + workflow_id_from_obj = wfh.generate_workflow_identifier(self.basic_mmif) + + # 
Test with file path - should produce the same result + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + workflow_id_from_file = wfh.generate_workflow_identifier(tmp_file) + self.assertEqual(workflow_id_from_obj, workflow_id_from_file) + finally: + os.unlink(tmp_file) + + def test_read_mmif_from_path(self): + """Test the _read_mmif_from_path helper function.""" + + # Test with Mmif object - should return as-is + result = wfh._read_mmif_from_path(self.basic_mmif) + self.assertIs(result, self.basic_mmif) + + # Test with file path string + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result_from_str = wfh._read_mmif_from_path(tmp_file) + self.assertIsInstance(result_from_str, Mmif) + self.assertEqual(result_from_str.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) + + # Test with Path object + result_from_path = wfh._read_mmif_from_path(Path(tmp_file)) + self.assertIsInstance(result_from_path, Mmif) + self.assertEqual(result_from_path.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) + finally: + os.unlink(tmp_file) + + # Test with invalid input + with pytest.raises(ValueError): + wfh._read_mmif_from_path(12345) + + def test_describe_single_mmif_with_mmif_object(self): + """Test that describe_single_mmif accepts Mmif objects directly.""" + import os + + # Test with Mmif object directly + result_from_obj = wfh.describe_single_mmif(self.basic_mmif) + + # Test with file path - should produce the same result + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result_from_file = wfh.describe_single_mmif(tmp_file) + self.assertEqual(result_from_obj, result_from_file) + + # Validate that the output conforms to the SingleMmifDesc Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.SingleMmifDesc.model_validate(result_from_obj) + # Can assert on the validated object's attributes if needed + self.assertIsNotNone(validated.workflow_id) + 
self.assertIsNotNone(validated.stats) + self.assertIsNotNone(validated.apps) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_empty(self): + """Test describe_single_mmif with an empty MMIF (no views).""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(validated.stats.annotation_count_by_type, {}) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app(self): + """Test describe_single_mmif with a single app execution.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.appProfiling = {"runningTime": "0:00:01.234"} + view.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.app, view.metadata.app) + self.assertEqual(app_exec.view_ids, [view.id]) + self.assertEqual(app_exec.app_profiling.running_time_ms, 1234) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app_two_views(self): + """Test describe_single_mmif with one app execution producing two views.""" + view1 = self.basic_mmif.new_view() + view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view1.metadata.timestamp = "2024-01-01T12:00:00Z" + view1.new_annotation(AnnotationTypes.TimeFrame) + view2 = self.basic_mmif.new_view() + view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view2.metadata.timestamp = "2024-01-01T12:00:00Z" + 
view2.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.view_ids, [view1.id, view2.id]) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_error_view(self): + """Test describe_single_mmif with a view containing an error.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.error = {"message": "Something went wrong"} + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(len(validated.stats.error_views), 1) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_with_unassigned_views(self): + """Test describe_single_mmif with views that cannot be grouped.""" + import unittest.mock + raw_mmif = json.loads(self.basic_mmif.serialize()) + raw_mmif['views'].append({'id': 'v1', 'metadata': {'app': 'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) + tmp_file = self.create_temp_mmif_file(raw_mmif) + try: + with unittest.mock.patch('jsonschema.validators.validate'): + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = 
wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 2) + special_entry = validated.apps[-1] + self.assertEqual(special_entry.app, 'http://apps.clams.ai/non-existing-app/v1') + self.assertEqual(len(special_entry.view_ids), 2) + self.assertIn('v2', special_entry.view_ids) + self.assertIn('v3', special_entry.view_ids) + finally: + os.unlink(tmp_file) + + def test_describe_collection_empty(self): + """Test describe_mmif_collection with an empty directory.""" + dummy_dir = 'dummy_mmif_collection' + os.makedirs(dummy_dir, exist_ok=True) + try: + output = wfh.describe_mmif_collection(dummy_dir) + # Validate using Pydantic model + validated = wfh.CollectionMmifDesc.model_validate(output) + self.assertEqual(validated.mmif_count_by_status.total, 0) + self.assertEqual(len(validated.workflows), 0) + finally: + os.rmdir(dummy_dir) + + def test_describe_collection_with_files(self): + """Test describe_mmif_collection with MMIF files.""" + dummy_dir = 'dummy_mmif_collection_with_files' + os.makedirs(dummy_dir, exist_ok=True) + try: + # Create two MMIF files in the directory + for i in range(2): + tmp_file = os.path.join(dummy_dir, f'{i}.mmif') + with open(tmp_file, 'w') as f: + f.write(self.basic_mmif.serialize()) + + output = wfh.describe_mmif_collection(dummy_dir) + + # Validate structure using Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.CollectionMmifDesc.model_validate(output) + + # Verify counts using validated object attributes + self.assertEqual(validated.mmif_count_by_status.total, 2) + self.assertIsInstance(validated.workflows, list) + finally: + import shutil + shutil.rmtree(dummy_dir) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index fa0f8906..dd33fec2 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -1,3 +1,4 @@ +import argparse import 
contextlib import io import json @@ -6,17 +7,124 @@ import unittest.mock import mmif -from mmif.utils.cli import rewind -from mmif.utils.cli import source -from mmif.utils.cli import describe - from mmif.serialize import Mmif -from mmif.vocabulary import DocumentTypes, AnnotationTypes +from mmif.utils.cli import describe, rewind, source, summarize +from mmif.vocabulary import AnnotationTypes + +BASIC_MMIF_STRING = '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' + + +class BaseCliTestCase(unittest.TestCase): + """Base class for CLI module tests with common utilities.""" + + cli_module = None # Override in subclass + + def setUp(self): + """Set up common test fixtures.""" + if self.cli_module: + self.parser = self.cli_module.prep_argparser() + self.basic_mmif = Mmif(BASIC_MMIF_STRING) + self.maxDiff = None + + @staticmethod + def create_temp_mmif_file(mmif_obj): + """Create a temporary MMIF file for testing. + + Args: + mmif_obj: Either a Mmif object or a dict/string to serialize + + Returns: + str: Path to the temporary file (caller must unlink) + """ + tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) + if isinstance(mmif_obj, Mmif): + content = mmif_obj.serialize(pretty=False) + else: + content = json.dumps(mmif_obj) if isinstance(mmif_obj, dict) else mmif_obj + tmp.write(content) + tmp.close() + return tmp.name + + def run_cli_capture_stdout(self, args_namespace): + """Run CLI module and capture stdout as parsed JSON. 
+ + Args: + args_namespace: Namespace object with CLI arguments + + Returns: + dict: Parsed JSON output from stdout + """ + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + self.cli_module.main(args_namespace) + return json.loads(stdout.getvalue()) + + +class IOTestMixin: + """Mixin providing common I/O tests for CLI modules. + + Requires the test class to have: + - cli_module attribute + - basic_mmif attribute + - create_temp_mmif_file method + - run_cli_capture_stdout method + - expected_output_keys attribute (list of keys to check in output) + """ + + def test_file_input_stdout_output(self): + """Test reading from file and outputting to stdout.""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + args = argparse.Namespace( + MMIF_FILE=tmp_file, + output=None, + pretty=False, + help_schema=None # For describe module + ) + output = self.run_cli_capture_stdout(args) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_file) + + def test_file_input_file_output(self): + """Test reading from file and outputting to file.""" + tmp_input = self.create_temp_mmif_file(self.basic_mmif) + tmp_output = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) + tmp_output.close() + try: + args = self.parser.parse_args([tmp_input, '-o', tmp_output.name]) + self.cli_module.main(args) + with open(tmp_output.name, 'r') as f: + output = json.load(f) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_input) + os.unlink(tmp_output.name) + + def test_stdin_input_stdout_output(self): + """Test reading from stdin and outputting to stdout.""" + mmif_str = self.basic_mmif.serialize() + with unittest.mock.patch('sys.stdin', io.StringIO(mmif_str)), \ + unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace( + MMIF_FILE=None, + output=None, + 
pretty=False, + help_schema=None # For describe module + ) + self.cli_module.main(args) + output = json.loads(stdout.getvalue()) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) class TestCli(unittest.TestCase): def setUp(self) -> None: - self.parser, _ = mmif.prep_argparser_and_subcmds() + self.parser, _, _ = mmif.prep_argparser_and_subcmds() def test_primary_cli(self): stdout = io.StringIO() @@ -50,9 +158,8 @@ def generate_source_mmif(self): # to suppress output (otherwise, set to stdout by default) args = self.parser.parse_args(self.get_params()) - with open(os.devnull, 'w') as devnull: - args.output = devnull - return source.main(args) + args.output = os.devnull + return source.main(args) def test_accept_file_paths(self): self.docs.append("video:/a/b/c.mp4") @@ -120,24 +227,15 @@ def test_generate_mixed_scheme(self): class TestRewind(unittest.TestCase): def setUp(self): + empty_mmif_str = ('{"metadata": {"mmif": ' + '"http://mmif.clams.ai/1.0.0"}, "documents": [], ' + '"views": []}') # mmif we add views to - self.mmif_one = Mmif( - { - "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, - "documents": [], - "views": [], - } - ) + self.mmif_one = Mmif(empty_mmif_str) # baseline empty mmif for comparison - self.empty_mmif = Mmif( - { - "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, - "documents": [], - "views": [], - } - ) - + self.empty_mmif = Mmif(empty_mmif_str) + @staticmethod def add_dummy_view(mmif: Mmif, appname: str, timestamp: str = None): v = mmif.new_view() @@ -185,122 +283,86 @@ def test_app_rewind(self): self.assertIn('dummy_app_two', remaining_apps) -class TestDescribe(unittest.TestCase): +class TestDescribe(BaseCliTestCase, IOTestMixin): """Test suite for the describe CLI module.""" + + cli_module = describe + expected_output_keys = ['workflowId', 'stats', 'apps'] - def setUp(self): - """Create test MMIF structures.""" - self.parser = describe.prep_argparser() - self.maxDiff = None - 
self.basic_mmif = Mmif( - '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' - ) - - def create_temp_mmif_file(self, mmif_obj): - """Helper to create a temporary MMIF file.""" - tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) - if isinstance(mmif_obj, Mmif): - content_to_write = mmif_obj.serialize(pretty=False) - else: - content_to_write = json.dumps(mmif_obj) - tmp.write(content_to_write) - tmp.close() - return tmp.name - - def test_describe_single_mmif_empty(self): + def test_help_schema(self): + """Test --help-schema with different options""" + from mmif.utils.workflow_helper import SingleMmifDesc, CollectionMmifDesc + + # Test mmif-file + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace(help_schema=['mmif-file'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + # Verify SingleMmifDesc schema keys are present + self.assertIn("workflowId", output) + self.assertIn("stats", output) + self.assertIn("apps", output) + + # Test mmif-dir + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace(help_schema=['mmif-dir'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + # Verify CollectionMmifDesc schema keys are present + self.assertIn("mmifCountByStatus", output) + self.assertIn("workflows", output) + + def test_describe_main_directory(self): + """Test describe.main with a directory input""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Create two mmif files + with open(os.path.join(tmp_dir, '1.mmif'), 'w') as f: 
+ f.write(self.basic_mmif.serialize()) + with open(os.path.join(tmp_dir, '2.mmif'), 'w') as f: + f.write(self.basic_mmif.serialize()) + + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + # MMIF_FILE argument expects a string path + args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schema=None) + describe.main(args) + output_json = json.loads(stdout.getvalue()) + # Just verify valid JSON output was produced + self.assertIsInstance(output_json, dict) + self.assertTrue(len(output_json) > 0) + + def test_deprecated_functions(self): + """Test backward compatibility wrapper functions""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - self.assertEqual(result["stats"]["annotationCountByType"], {}) + with self.assertWarns(DeprecationWarning): + describe.get_pipeline_specs(tmp_file) + with self.assertWarns(DeprecationWarning): + describe.generate_pipeline_identifier(tmp_file) finally: os.unlink(tmp_file) - def test_describe_single_mmif_one_app(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.appProfiling = {"runningTime": "0:00:01.234"} - view.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["app"], view.metadata.app) - self.assertEqual(app_exec["viewIds"], [view.id]) - self.assertEqual(app_exec["appProfiling"]["runningTimeMS"], 1234) - finally: - os.unlink(tmp_file) - def test_describe_single_mmif_one_app_two_views(self): - view1 = self.basic_mmif.new_view() 
- view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view1.metadata.timestamp = "2024-01-01T12:00:00Z" - view1.new_annotation(AnnotationTypes.TimeFrame) - view2 = self.basic_mmif.new_view() - view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view2.metadata.timestamp = "2024-01-01T12:00:00Z" - view2.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["viewIds"], [view1.id, view2.id]) - finally: - os.unlink(tmp_file) +class TestSummarize(BaseCliTestCase, IOTestMixin): + """Test suite for the summarize CLI module.""" + + cli_module = summarize + expected_output_keys = ['mmif_version', 'documents', 'views'] - def test_describe_single_mmif_error_view(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.error = {"message": "Something went wrong"} + def test_summarize_validates_content(self): + """Test that summarize produces expected content.""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - self.assertEqual(len(result["stats"]["errorViews"]), 1) + output = self.run_cli_capture_stdout( + argparse.Namespace(MMIF_FILE=tmp_file, output=None, pretty=False) + ) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") finally: os.unlink(tmp_file) - @unittest.mock.patch('jsonschema.validators.validate') - def test_describe_single_mmif_with_unassigned_views(self, mock_validate): - raw_mmif = json.loads(self.basic_mmif.serialize()) - raw_mmif['views'].append({'id': 'v1', 
'metadata': {'app': 'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) - tmp_file = self.create_temp_mmif_file(raw_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result['stats']['appCount'], 1) - self.assertEqual(len(result['apps']), 2) - special_entry = result['apps'][-1] - self.assertEqual(special_entry['app'], 'http://apps.clams.ai/non-existing-app/v1') - self.assertEqual(len(special_entry['viewIds']), 2) - self.assertIn('v2', special_entry['viewIds']) - self.assertIn('v3', special_entry['viewIds']) - finally: - os.unlink(tmp_file) - - def test_describe_collection_empty(self): - dummy_dir = 'dummy_mmif_collection' - os.makedirs(dummy_dir, exist_ok=True) - try: - output = mmif.utils.workflow_helper.describe_mmif_collection(dummy_dir) - expected = { - 'mmifCountByStatus': {'total': 0, 'successful': 0, 'withErrors': 0, 'withWarnings': 0, 'invalid': 0}, - 'workflows': [], - 'annotationCountByType': {} - } - self.assertEqual(output, expected) - finally: - os.rmdir(dummy_dir) - if __name__ == '__main__': unittest.main()