def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str:
    """Deprecated wrapper around :func:`generate_workflow_identifier`.

    Kept only for backward compatibility; emits a ``DeprecationWarning``
    and delegates to the renamed function.

    :param mmif_file: path to a MMIF file
    :return: the workflow identifier string
    """
    import warnings
    warnings.warn(
        "generate_pipeline_identifier is deprecated, use generate_workflow_identifier instead",
        DeprecationWarning)
    identifier = generate_workflow_identifier(mmif_file)
    return cast(str, identifier)
def _read_mmif_from_path(mmif_input: Union[str, Path, Mmif]) -> Mmif:
    """
    Resolve various input types to a :class:`~mmif.serialize.mmif.Mmif` object.

    :param mmif_input: either a file path (str or Path) or an existing Mmif object
    :return: the given Mmif object unchanged, or a new one deserialized from the file
    :raises ValueError: if the input is not one of the accepted types
    """
    if isinstance(mmif_input, Mmif):
        # already deserialized; hand back the same object (no copy)
        return mmif_input
    elif isinstance(mmif_input, (str, Path)):
        # MMIF is JSON, which is UTF-8 by specification, so decode explicitly
        # instead of relying on the platform default encoding
        with open(mmif_input, "r", encoding="utf-8") as f:
            mmif_str = f.read()
        return Mmif(mmif_str)
    else:
        raise ValueError(
            "MMIF input must be a string path, a Path object, or a Mmif object."
        )
- ) - with open(mmif_file, "r") as f: - mmif_str = f.read() - - data = Mmif(mmif_str) + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object + :param return_param_dicts: If True, also return the parameter dictionaries + :return: Workflow identifier string, or tuple of (identifier, param_dicts) if return_param_dicts=True + """ + data = _read_mmif_from_path(mmif_input) segments = [] # First prefix is source information, sorted by document type @@ -102,6 +118,7 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: # Group views into runs grouped_apps = group_views_by_app(data.views) + param_dicts = [] for app_execution in grouped_apps: # Use the first view in the run as representative for metadata first_view = app_execution[0] @@ -120,6 +137,7 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: param_dict = first_view.metadata.parameters except (KeyError, AttributeError): param_dict = {} + param_dicts.append(param_dict) param_hash = generate_param_hash(param_dict) @@ -128,6 +146,8 @@ def generate_workflow_identifier(mmif_file: Union[str, Path]) -> str: version_str = app_version if app_version else "unversioned" segments.append(f"{name_str}/{version_str}/{param_hash}") + if return_param_dicts: + return '/'.join(segments), param_dicts return '/'.join(segments) @@ -159,9 +179,9 @@ def _get_profile_data(view) -> dict: return {"runningTimeMS": milliseconds} -def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: +def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: """ - Reads a MMIF file and extracts the workflow specification from it. + Reads a MMIF file or object and extracts the workflow specification from it. This function provides an app-centric summarization of the workflow. 
The conceptual hierarchy is that a **workflow** is a sequence of **apps**, @@ -212,19 +232,11 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: The docstring above is used to generate help messages for the CLI command. Do not remove the triple-dashed lines. - :param mmif_file: Path to the MMIF file + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object :return: A dictionary containing the workflow specification. """ - if not isinstance(mmif_file, (str, Path)): - raise ValueError( - "MMIF file path must be a string or a Path object." - ) - - workflow_id = generate_workflow_identifier(mmif_file) - with open(mmif_file, "r") as f: - mmif_str = f.read() - - mmif = Mmif(mmif_str) + mmif = _read_mmif_from_path(mmif_input) + workflow_id = generate_workflow_identifier(mmif) error_view_ids = [] warning_view_ids = [] diff --git a/tests/test_utils.py b/tests/test_utils.py index 0c261fe7..1fb97696 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -282,6 +282,69 @@ def test_generate_workflow_identifier_grouped(self): finally: os.unlink(tmp_file) + def test_generate_workflow_identifier_with_mmif_object(self): + """Test that generate_workflow_identifier accepts Mmif objects directly.""" + from mmif.utils import workflow_helper + import os + + # Test with Mmif object directly + workflow_id_from_obj = workflow_helper.generate_workflow_identifier(self.basic_mmif) + + # Test with file path - should produce the same result + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + workflow_id_from_file = workflow_helper.generate_workflow_identifier(tmp_file) + self.assertEqual(workflow_id_from_obj, workflow_id_from_file) + finally: + os.unlink(tmp_file) + + def test_read_mmif_from_path(self): + """Test the _read_mmif_from_path helper function.""" + from mmif.utils.workflow_helper import _read_mmif_from_path + from pathlib import Path + import os + + # Test with Mmif object - should return as-is + result = 
_read_mmif_from_path(self.basic_mmif) + self.assertIs(result, self.basic_mmif) + + # Test with file path string + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result_from_str = _read_mmif_from_path(tmp_file) + self.assertIsInstance(result_from_str, Mmif) + self.assertEqual(result_from_str.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) + + # Test with Path object + result_from_path = _read_mmif_from_path(Path(tmp_file)) + self.assertIsInstance(result_from_path, Mmif) + self.assertEqual(result_from_path.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) + finally: + os.unlink(tmp_file) + + # Test with invalid input + with pytest.raises(ValueError): + _read_mmif_from_path(12345) + + def test_describe_single_mmif_with_mmif_object(self): + """Test that describe_single_mmif accepts Mmif objects directly.""" + from mmif.utils.workflow_helper import describe_single_mmif + import os + + # Test with Mmif object directly + result_from_obj = describe_single_mmif(self.basic_mmif) + + # Test with file path - should produce the same result + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result_from_file = describe_single_mmif(tmp_file) + self.assertEqual(result_from_obj, result_from_file) + self.assertIn('workflowId', result_from_obj) + self.assertIn('stats', result_from_obj) + self.assertIn('apps', result_from_obj) + finally: + os.unlink(tmp_file) + if __name__ == '__main__': unittest.main() From 873122ef8d89dff9d1b7a7e9508ee32280acb063 Mon Sep 17 00:00:00 2001 From: kelleyl Date: Fri, 19 Dec 2025 21:01:12 +0900 Subject: [PATCH 02/48] adding select timepoints from targets --- mmif/utils/video_document_helper.py | 45 +++++++++++++++++ tests/test_utils.py | 78 +++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py index a1b9c59a..b0c3685b 100644 --- a/mmif/utils/video_document_helper.py +++ 
def extract_target_frames(mmif: Mmif, annotation: Annotation, min_timepoints: int = 0, max_timepoints: int = sys.maxsize, fraction: float = 1.0, as_PIL: bool = False):
    """
    Extracts frames corresponding to the timepoints listed in the ``targets`` property of an annotation.
    Selection of timepoints is based on minimum, maximum, and fraction of targets to include.

    :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance
    :param annotation: :py:class:`~mmif.serialize.annotation.Annotation` instance containing a ``targets`` property
    :param min_timepoints: minimum number of timepoints to include
    :param max_timepoints: maximum number of timepoints to include
    :param fraction: fraction of targets to include (ideally)
    :param as_PIL: return :py:class:`~PIL.Image.Image` instead of :py:class:`~numpy.ndarray`
    :return: a tuple containing (list of frames, list of selected target IDs)
    :raises ValueError: if the annotation has no ``targets`` property
    """
    if 'targets' not in annotation.properties:
        raise ValueError(f'Annotation {annotation.id} does not have a "targets" property.')

    targets = annotation.get_property('targets')
    num_targets = len(targets)
    if num_targets == 0:
        return [], []

    # Clamp the requested count into [min_timepoints, max_timepoints] and
    # never above the number of available targets.
    ideal_count = int(num_targets * fraction)
    count = max(min_timepoints, ideal_count)
    count = min(max_timepoints, count)
    count = min(num_targets, count)
    # BUGFIX: with a small fraction (e.g. fraction=0.05 over 10 targets) or an
    # explicit max_timepoints=0, the clamped count could be 0; the code below
    # then produced empty selections and crashed on `selected_timepoints[0]`
    # with an IndexError. Return empty results instead.
    if count <= 0:
        return [], []

    if count == 1:
        # a single sample: take the middle target
        indices = [num_targets // 2]
    else:
        # evenly spaced indices that always include the first and last target
        indices = [int(i * (num_targets - 1) / (count - 1)) for i in range(count)]

    selected_target_ids = [targets[i] for i in indices]
    selected_timepoints = [mmif[target_id] for target_id in selected_target_ids]

    # Assuming all targets use the same document as the parent annotation if it exists,
    # otherwise we'll have to check each timepoint. convert_timepoint handles document lookup.
    frame_nums = [int(convert_timepoint(mmif, tp, 'f')) for tp in selected_timepoints]

    # Get the document from the first selected timepoint to use with extract_frames_as_images
    video_doc = mmif[selected_timepoints[0].get_property('document')]
    images = extract_frames_as_images(video_doc, frame_nums, as_PIL=as_PIL)

    return images, selected_target_ids
self.assertEqual(3, len(images)) + self.assertEqual(tps[0].id, ids[0]) + self.assertEqual(tps[4].id, ids[1]) # int(1 * 9 / 2) = 4 + self.assertEqual(tps[9].id, ids[2]) + + # Test all targets if min_timepoints > num_targets + images, ids = vdh.extract_target_frames(self.mmif_obj, parent_ann, min_timepoints=20) + self.assertEqual(10, len(images)) + self.assertEqual([tp.id for tp in tps], ids) + + def test_extract_target_frames_with_sample(self): + # Load from the sample file + swt_path = pathlib.Path(__file__).parent / 'samples' / '1.0' / 'swt.mmif' + with open(swt_path) as f: + mmif_obj = Mmif(f.read()) + + # Find a timeframe with targets + tf = None + for view in mmif_obj.views: + for ann in view.annotations: + if ann.at_type == AnnotationTypes.TimeFrame and 'targets' in ann.properties: + tf = ann + break + if tf: break + + self.assertIsNotNone(tf, "Could not find a TimeFrame with targets in swt.mmif") + + # Update document location to avoid error, although we'll mock extraction + # because we don't have the original video referenced in swt.mmif + video_doc = mmif_obj[tf.get_property('document')] + video_doc.location = f"file://{pathlib.Path(__file__).parent}/black-2997fps.mp4" + video_doc.add_property('fps', 29.97) + video_doc.add_property('frameCount', 1000) + + # Test with max_timepoints=5 + # We mock extract_frames_as_images because we don't really need to decode + # frames to test the selection logic here, and we don't want to rely on CV2/FFMPEG + # being fully functional for a dummy video in this test environment if possible + with mock.patch('mmif.utils.video_document_helper.extract_frames_as_images') as mock_extract: + mock_extract.return_value = [f"img_{i}" for i in range(5)] + images, ids = vdh.extract_target_frames(mmif_obj, tf, max_timepoints=5) + + self.assertEqual(len(images), 5) + self.assertEqual(len(ids), 5) + # Verify that IDs are from the targets list + targets = tf.get_property('targets') + for id in ids: + self.assertIn(id, targets) + + # 
def describe_argparser() -> tuple:
    """
    Returns two strings: a one-line description of the argparser and additional
    material, which will be shown for `mmif --help` and `mmif summarize --help`,
    respectively. For now they return the same string. The return value should
    still be a tuple because mmif.cli() depends on it.
    """
    oneliner = 'provides a CLI to create a JSON Summary for a MMIF file'
    return oneliner, oneliner


def prep_argparser(**kwargs):
    """Build the argument parser for the ``mmif summarize`` CLI subcommand.

    :param kwargs: extra keyword arguments forwarded to ``ArgumentParser``
    :return: a configured :class:`argparse.ArgumentParser`
    """
    parser = argparse.ArgumentParser(
        description=describe_argparser()[1],
        formatter_class=argparse.RawDescriptionHelpFormatter,
        **kwargs)
    parser.add_argument("-i", metavar='MMIF_FILE', help='input MMIF file', required=True)
    parser.add_argument("-o", metavar='JSON_FILE', help='output JSON summary file', required=True)
    parser.add_argument("--full", action="store_true", help="print full report")
    parser.add_argument('--transcript', action='store_true', help='include transcript')
    parser.add_argument('--captions', action='store_true', help='include Llava captions')
    parser.add_argument('--timeframes', action='store_true', help='include all time frames')
    parser.add_argument('--entities', action='store_true', help='include entities from transcript')
    return parser


def main(args):
    """Entry point for ``mmif summarize``: build a Summary and write the report.

    :param args: parsed namespace from :func:`prep_argparser`
    """
    mmif_summary = Summary(args.i)
    mmif_summary.report(
        outfile=args.o, full=args.full,
        # NOTE(review): the --timeframes, --transcript, --captions and
        # --entities flags are parsed but not forwarded here; the keywords
        # below were commented out in the original and presumably should be
        # re-enabled once Summary.report supports them -- TODO confirm
        #timeframes=args.timeframes, transcript=args.transcript,
        #captions=args.captions, entities=args.entities
    )
transcript') + parser.add_argument('--captions', action='store_true', help='include Llava captions') + parser.add_argument('--timeframes', action='store_true', help='include all time frames') + parser.add_argument('--entities', action='store_true', help='include entities from transcript') + return parser + + +def pp_args(args): + for a, v in args.__dict__.items(): + print(f'{a:12s} --> {v}') + + +def main(): + parser = argparser() + args = parser.parse_args() + #pp_args(args) + mmif_summary = Summary(args.i) + mmif_summary.report( + outfile=args.o, full=args.full, + timeframes=args.timeframes, transcript=args.transcript, + captions=args.captions, entities=args.entities) + + +""" + +There used to be an option to process a whole directory, but I never used it and decided +that if needed it would better be done by an extra script or a separate function. + +The code for when there was a -d option is here just in case. + +if args.d: + for mmif_file in pathlib.Path(args.d).iterdir(): + if mmif_file.is_file() and mmif_file.name.endswith('.mmif'): + print(mmif_file) + json_file = str(mmif_file)[:-4] + 'json' + mmif_summary = Summary(mmif_file.read_text()) + mmif_summary.report( + outfile=json_file, full=args.full, + timeframes=args.timeframes, transcript=args.transcript, + captions=args.captions, entities=args.entities) +""" \ No newline at end of file diff --git a/mmif/utils/summarizer/config.py b/mmif/utils/summarizer/config.py new file mode 100644 index 00000000..f972bd97 --- /dev/null +++ b/mmif/utils/summarizer/config.py @@ -0,0 +1,69 @@ + +from mmif.vocabulary import DocumentTypes +from mmif.vocabulary import AnnotationTypes + + +# The name of CLAMS applications, used to select views and to determine whether +# the summarizer is appropriate for the app version. +# TODO: this now requires an exhaustive listing of all allowed apps and their +# versions, we need a more maintainable system. 
+ +KALDI = [ + # The first two use MMIF 0.4 and should probably be retired + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.2', + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.3', + 'http://apps.clams.ai/aapb-pua-kaldi-wrapper/v3'] + +WHISPER = [ + 'http://apps.clams.ai/whisper-wrapper/v7', + 'http://apps.clams.ai/whisper-wrapper/v8', + 'http://apps.clams.ai/whisper-wrapper/v8-3-g737e280'] + +CAPTIONER = [ + 'http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97', + 'http://apps.clams.ai/smolvlm2-captioner'] + +NER = [ + 'http://apps.clams.ai/spacy-wrapper/v1.1', + 'http://apps.clams.ai/spacy-wrapper/v2.1'] + +SEGMENTER = 'http://apps.clams.ai/audio-segmenter' + + +# When a named entity occurs 20 times we do not want to generate 20 instances of +# it. If the start of the next entity occurs within the below number of +# milliseconds after the end of the previous, then it is just added to the +# previous one. Taking one minute as the default so two mentions in a minute end +# up being the same instance. This setting can be changed with the 'granularity' +# parameter. 
+# TODO: this seems broken + +GRANULARITY = 1000 + + +# Properties used for the summary for various tags + +DOC_PROPS = ('id', 'type', 'location') +VIEW_PROPS = ('id', 'timestamp', 'app') +TF_PROPS = ('id', 'start', 'end', 'frameType') +E_PROPS = ('id', 'group', 'cat', 'tag', 'video-start', 'video-end', 'coordinates') + + +# Names of types + +TEXT_DOCUMENT = DocumentTypes.TextDocument.shortname +VIDEO_DOCUMENT = DocumentTypes.VideoDocument.shortname +TIME_FRAME = AnnotationTypes.TimeFrame.shortname +BOUNDING_BOX = AnnotationTypes.BoundingBox.shortname +ALIGNMENT = AnnotationTypes.Alignment.shortname + +ANNOTATION = 'Annotation' +TOKEN = 'Token' +SENTENCE = 'Sentence' +PARAGRAPH = 'Paragraph' +NAMED_ENTITY = 'NamedEntity' +NOUN_CHUNK = 'NounChunk' +VERB_CHUNK = 'VerbChunk' + +TIME_BASED_INTERVALS = {TIME_FRAME} +SPAN_BASED_INTERVALS = {TOKEN, SENTENCE, PARAGRAPH, NAMED_ENTITY, NOUN_CHUNK, VERB_CHUNK} diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py new file mode 100644 index 00000000..ae11b9be --- /dev/null +++ b/mmif/utils/summarizer/graph.py @@ -0,0 +1,596 @@ +import sys, json +from collections import defaultdict +from operator import itemgetter +from pathlib import Path +import argparse + +from mmif import Mmif + +from mmif.utils.summarizer import config +from mmif.utils.summarizer.utils import compose_id, normalize_id +#from summarizer.utils import compose_id, flatten_paths, normalize_id + + + +class Graph(object): + + """Graph implementation for a MMIF document. Each node contains an annotation + or document. Alignments are stored separately. Edges between nodes are created + from the alignments and added to the Node.targets property. The first edge added + to Node.targets is the document that the Node points to (if there is one). 
+ + The goal for the graph is to store all useful annotation and to have simple ways + to trace nodes all the way up to the primary data.""" + + def __init__(self, mmif): + self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) + self.documents = [] + self.nodes = {} + self.alignments = [] + self._init_nodes() + self._init_edges() + # Third pass to add links between text elements, in particular from + # entities to tokens, adding lists of tokens to entities. + tokens = self.get_nodes(config.TOKEN) + entities = self.get_nodes(config.NAMED_ENTITY) + self.token_idx = TokenIndex(tokens) + #self.token_idx.pp() + for e in entities: + #print('>>>', e, e.anchors) + e.tokens = self.token_idx.get_tokens_for_node(e) + + def _init_nodes(self): + # The top-level documents are added as nodes, but they are also put in + # the documents list. + for doc in self.mmif.documents: + self.add_node(None, doc) + self.documents.append(doc) + # First pass over all annotations and documents in all views and save + # them in the graph. + doc_ids = [d.id for d in self.documents] + for view in self.mmif.views: + for annotation in view.annotations: + normalize_id(doc_ids, view, annotation) + if annotation.at_type.shortname == config.ALIGNMENT: + # alignments are not added as nodes, but we do keep them around + self.alignments.append((view, annotation)) + else: + self.add_node(view, annotation) + + def _init_edges(self): + # Second pass over the alignments so we create edges. 
+ for view, alignment in self.alignments: + self.add_edge(view, alignment) + + def __str__(self): + return "" % len(self.nodes) + + def add_node(self, view, annotation): + """Add an annotation as a node to the graph.""" + node = Nodes.new(self, view, annotation) + self.nodes[node.identifier] = node + + def add_edge(self, view, alignment): + source_id = alignment.properties['source'] + target_id = alignment.properties['target'] + #print(alignment.id, source_id, target_id) + source = self.get_node(source_id) + target = self.get_node(target_id) + # make sure the direction goes from token or textdoc to annotation + if target.annotation.at_type.shortname in (config.TOKEN, config.TEXT_DOCUMENT): + source, target = target, source + source.targets.append(target) + source.add_anchors_from_alignment(target) + target.add_anchors_from_alignment(source) + + def get_node(self, node_id): + return self.nodes.get(node_id) + + def get_nodes(self, short_at_type: str, view_id : str = None): + """Get all nodes for an annotation type, using the short form. If a view + identifier is provided then only include nodes from that view.""" + return [node for node in self.nodes.values() + if (node.at_type.shortname == short_at_type + and (view_id is None or node.view.id == view_id))] + + def statistics(self): + stats = defaultdict(int) + for node in self.nodes.values(): + stats[f'{str(node.view_id):4} {node.at_type.shortname}'] += 1 + return stats + + def trim(self, start: int, end: int): + """Trim the graph and keep only those nodes that are included in the graph + between two timepoints (both in milliseconds). This assumes that all nodes + are anchored on the time in the audio or video stream. 
At the moment it + keeps all nodes that are not explicitly anchored.""" + remove = set() + for node_id, node in self.nodes.items(): + if 'time-point' in node.anchors: + if not start <= node.anchors['time-point'] <= end: + remove.add(node_id) + if 'time-offsets' in node.anchors: + p1, p2 = node.anchors['time-offsets'] + if not (start <= p1 <= end and start <= p2 <= end): + remove.add(node_id) + new_nodes = [n for n in self.nodes.values() if not n.identifier in remove] + self.nodes = { node.identifier: node for node in new_nodes } + + def pp(self, fname=None,skip_timepoints=False): + fh = sys.stdout if fname is None else open(fname, 'w') + fh.write("%s\n" % self) + for view in self.mmif.views: + fh.write(" \n" % (view.id, str(view.metadata['app']))) + for node_id, node in self.nodes.items(): + if node.at_type.shortname == 'TimePoint': + continue + fh.write(" %-40s" % node) + targets = [str(t) for t in node.targets] + fh.write(' --> [%s]\n' % ' '.join(targets)) + + def pp_statistics(self): + stats = self.statistics() + for at_type in sorted(stats): + print(f'{at_type:20} {stats[at_type]:>5}') + + +class TokenIndex(object): + + """ + The tokens are indexed on the identifier on the TextDocument that they occur + in and for each text document we have a list of pairs + + {'v_4:td1': [ + ((0, 5), ), + ((5, 6), ), + ... + } + """ + + # TODO: + # - Benchmark get_tokens_for_node(). I may want to use something like this + # to determine enclosed nodes and enclosing nodes and that may blow up since + # that would be O(n^2). If it does matter, probably start using binary search + # or add an index from character offset to nodes. + # - It is also not sure whether we still need this since the new spaCy gives + # targets to tokens. 
+ + def __init__(self, tokens): + self.tokens = {} + self.token_count = len(tokens) + for t in tokens: + tup = ((t.properties['start'], t.properties['end']), t) + self.tokens.setdefault(t.document.identifier, []).append(tup) + # Make sure the tokens for each document are ordered. + for document, token_list in self.tokens.items(): + self.tokens[document] = sorted(token_list, key=itemgetter(0)) + # In some cases there are two tokens with identical offset (for example + # with tokenization from both Kaldi and spaCy, not sure what to do with + # these, but should probably be more careful on what views to access + + def __len__(self): + return self.token_count + + def __str__(self): + return f'' + + def get_tokens_for_node(self, node): + """Return all tokens included in the span of a node.""" + doc = node.document.identifier + try: + start = node.properties['start'] + end = node.properties['end'] + except KeyError: + start, end = node.anchors['text-offsets'] + tokens = [] + for (t_start, t_end), token in self.tokens.get(doc, []): + if t_start >= start and t_end <= end: + tokens.append(token) + return tokens + + def pp(self, fname=None): + fh = sys.stdout if fname is None else open(fname, 'w') + for document in self.tokens: + fh.write("\n[%s] -->\n" % document) + for t in self.tokens[document]: + fh.write(' %s %s\n' % (t[0], t[1])) + + +class Node(object): + + def __init__(self, graph, view, annotation): + self.graph = graph + self.view = view + self.view_id = None if self.view is None else self.view.id + self.annotation = annotation + # copy some information from the Annotation + self.at_type = annotation.at_type + self.identifier = annotation.id + self.properties = json.loads(str(annotation.properties)) + # get the document from the view or the properties + self.document = self._get_document() + # The targets property contains a list of annotations or documents that + # the node content points to. 
This includes the document the annotation + # points to as well as the alignment from a token or text document to a + # bounding box or time frame (which is added later). + # TODO: the above does not seem to be true since there is no evidence of + # data from alignments being added. + self.targets = [] if self.document is None else [self.document] + self.anchors = {} + self.add_local_anchors() + self.add_anchors_from_targets() + + def __str__(self): + anchor = '' + if self.at_type.shortname == config.TOKEN: + anchor = " %s:%s '%s'" % (self.properties['start'], + self.properties['end'], + self.properties.get('text','').replace('\n', '\\n')) + return "<%s %s%s>" % (self.at_type.shortname, self.identifier, anchor) + + def add_local_anchors(self): + """Get the anchors that you can get from the annotation itself, which + includes the start and end offsets, the coordinates, the timePoint of + a BoundingBox and any annotation with targets.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'start' in props and 'end' in props: + # TimeFrame is the only non-character based interval so this simple + # if-then-else should work + if attype == config.TIME_FRAME: + self.anchors['text-offsets'] = (props['start'], props['end']) + else: + self.anchors['time-offsets'] = (props['start'], props['end']) + if 'coordinates' in props: + self.anchors['coordinates'] = props['coordinates'] + if 'timePoint' in props: + self.anchors['time-point'] = props['timePoint'] + if 'targets' in props: + self.anchors['targets'] = props['targets'] + + def add_anchors_from_targets(self): + """Get start and end offsets or timePoints from the targets and add them to + the anchors, but only if there were no anchors on the node already. 
This has + two cases: one for TimeFrames and one for text intervals.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'targets' in props: + try: + t1 = self.graph.nodes[props['targets'][0]] + t2 = self.graph.nodes[props['targets'][-1]] + if attype == config.TIME_FRAME: + if not 'time-offsets' in props: + self.anchors['time-offsets'] = ( + t1.properties['timePoint'], t2.properties['timePoint']) + else: + if not 'text-offsets' in props: + self.anchors['text-offsets'] = ( + t1.properties['start'], t2.properties['end']) + except IndexError: + print(f'WARNING: Unexpected empty target list for {self.identifier}') + + def add_anchors_from_alignment(self, target: None, debug=False): + source_attype = self.at_type.shortname + target_attype = target.at_type.shortname + if debug: + print('\n@ DEBUG SOURCE->TARGET ', source_attype, target_attype) + print('@ DEBUG SOURCE.PROPS ', list(self.properties.keys())) + print('@ DEBUG TARGET.PROPS ', list(target.properties.keys())) + print('@ DEBUG TARGET.ANCHORS ', target.anchors) + # If a TextDocument is aligned to a BoundingBox then we grab the coordinates + # TODO: how are we getting the time point? + if source_attype == 'TextDocument' and target_attype == 'BoundingBox': + if 'coordinates' in target.properties: + self.anchors['coordinates'] = target.properties['coordinates'] + #print(source_attype, self.anchors) + elif source_attype == 'BoundingBox' and target_attype == 'TextDocument': + pass + # If a TextDocument is aligned to a TimeFrame then we copy time anchors + # but also targets and representatives, the latter because some alignments + # are not precise + elif source_attype == 'TextDocument' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + if 'time-offsets' in target.anchors: + # TODO: is this ever used? 
+ self.anchors['time-offsets'] = target.anchors['time-offsets'] + if 'targets' in target.properties: + self.anchors['targets'] = target.properties['targets'] + if 'representatives' in target.properties: + self.anchors['representatives'] = target.properties['representatives'] + #print('-', source_attype, self.anchors, self, target) + elif source_attype == 'TimeFrame' and target_attype == 'TextDocument': + pass + # Simply copy the time point + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + self.anchors['time-point'] = target.anchors['time-point'] + if debug: + print('+ ADDED SOURCE.ANCHORS ', self.anchors) + # For Token-TimeFrame alignments all we need are the start and end time points + elif source_attype == 'Token' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + #print(source_attype, self.anchors) + elif source_attype == 'TimeFrame' and target_attype == 'Token': + pass + # TODO: check whether some action is needed for the next options + elif source_attype == 'TextDocument' and target_attype == 'VideoDocument': + pass + elif source_attype == 'VideoDocument' and target_attype == 'TextDocument': + pass + elif source_attype == 'BoundingBox' and target_attype == 'TimePoint': + pass + elif source_attype =='TimePoint' and target_attype == 'BoundingBox': + pass + elif source_attype == 'BoundingBox' and target_attype in ('Token', 'Sentence', 'Paragraph'): + pass + elif source_attype in ('Token', 'Sentence', 'Paragraph') and target_attype == 'BoundingBox': + pass + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + pass + elif source_attype == 'TimePoint' and target_attype == 'TextDocument': + pass + else: + print('-', source_attype, target_attype) + #if debug: + # print('DEBUG', self.anchors) + + def _get_document(self): + """Return the document or annotation node that the 
annotation/document in + the node refers to via the document property. This could be a local property + or a metadata property if there is no such local property. Return None + if neither of those exist.""" + # try the local property + docid = self.properties.get('document') + if docid is not None: + # print('>>>', docid, self.graph.get_node(docid)) + return self.graph.get_node(docid) + # try the metadata property + if self.view is not None: + try: + metadata = self.view.metadata.contains[self.at_type] + docid = metadata['document'] + return self.graph.get_node(docid) + except KeyError: + return None + return None + + def XXX_get_document_plus_span(self): + self.pp() + props = self.properties + return "%s:%s:%s" % (self.document.identifier, + props['start'], props['end']) + + def XXXpaths_to_docs(self): + """Return all the paths from the node to documents.""" + paths = self._paths_to_docs() + return flatten_paths(paths) + + def XXX_paths_to_docs(self): + paths = [] + if not self.targets: + return [[self]] + for t in self.targets: + paths.append([self]) + for i, target in enumerate(self.targets): + paths[i].extend(target._paths_to_docs()) + return paths + + def summary(self): + """The default summary is just the identfier, this should typically be + overriden by sub classes.""" + return { 'id': self.identifier } + + def pp(self, close=True): + print('-' * 80) + print(self) + print(f' document = {self.document}') + for prop in self.properties: + print(f' {prop} = {self.properties[prop]}') + print(' targets = ') + for target in self.targets: + print(' ', target) + print(' anchors = ') + for anchor in self.anchors: + print(f' {anchor} -> {self.anchors[anchor]}') + if close: + print('-' * 80) + + +class TimeFrameNode(Node): + + def __str__(self): + frame_type = ' ' + self.frame_type() if self.has_label() else '' + return ('' + % (self.identifier, self.start(), self.end(), frame_type)) + + def start(self): + return self.properties.get('start', -1) + + def end(self): + 
return self.properties.get('end', -1) + + def frame_type(self): + # TODO: rename this, uses old property since replaced by "label"" + # NOTE: this is still aloowing for the old property though + return self.properties.get('label') or self.properties.get('frameType') + + def has_label(self): + return self.frame_type() is not None + + def representatives(self) -> list: + """Return a list of the representative TimePoints.""" + # TODO: why could I not get this from the anchors? + rep_ids = self.properties.get('representatives', []) + reps = [self.graph.get_node(rep_id) for rep_id in rep_ids] + return reps + + def summary(self): + """The summary of a time frame just contains the identifier, start, end + and frame type.""" + return { 'id': self.identifier, + 'start': self.properties['start'], + 'end': self.properties['end'], + 'frameType': self.properties.get('frameType') } + + +class EntityNode(Node): + + def __init__(self, graph, view, annotation): + super().__init__(graph, view, annotation) + self.tokens = [] + self._paths = None + self._anchor = None + + def __str__(self): + try: + start = self.properties['start'] + end = self.properties['end'] + except KeyError: + start, end = self.anchors['text-offsets'] + return ("" + % (self.identifier, start, end, self.properties['text'])) + + def start_in_video(self): + #print('+++', self.document.properties) + try: + return self.document.anchors['time-point'] + except KeyError: + return -1 + #return self.anchor()['video-start'] + + def end_in_video(self): + return self.anchor().get('video-end') + + def pp(self): + super().pp(close=False) + try: + for i, p in enumerate(self.paths_to_docs()): + print(' %s' % ' '.join([str(n) for n in p[1:]])) + except ValueError: + print(' WARNING: error in path_to_docs in NamedEntityNode.pp()') + print('-' * 80) + + def summary(self): + """The summary for entities needs to include where in the video or image + the entity occurs, it is not enough to just give the text document.""" + # TODO: in 
the old days this used an anchor() method which was fragile + # TODO: revamping it now + + #anchor = self.anchor() + #self.document.pp() +# print('...', self.document.anchors + return { + 'id': self.identifier, + 'group': self.properties['group'], + 'cat': self.properties['category'], + 'document': self.document.identifier, + # Entities in a TextDocument that is a full transcript without any + # alignments do not have a TimePoint + #'time-point': self.document.anchors.get('time-point'), + #'text-offsets': self.anchors.get('text-offsets'), + 'time-point': self.document.anchors.get('time-point', -1), + 'text-offsets': self.anchors.get('text-offsets', (-1 ,-1)), + #'document': self._get_document_plus_span(), + #'video-start': anchor.get('video-start'), + #'video-end': anchor.get('video-end'), + #'coordinates': self._coordinates_as_string(anchor) + } + + def anchor(self): + """The anchor is the position in the video that the entity is linked to. + This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: deal with the case where the primary document is not a video + self.paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in paths: + # print('... [') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + return {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + return {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + + def anchor2(self): + """The anchor is the position in the video that the entity is linked to. 
+ This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: with this version you get an error that the paths variable does + # not exist yet, must get a clearer picture on how to build a graph + # where nodes have paths to anchors + # TODO: deal with the case where the primary document is not a video + if self._anchor is None: + self._paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in self._paths: + # print('... [') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + self._anchor = {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + self._anchor = {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + return self._anchor + + def find_boundingbox_or_timeframe(self): + return self.paths[-1][-2] + + @staticmethod + def _coordinates_as_string(anchor): + if 'coordinates' not in anchor: + return None + return ','.join(["%s:%s" % (pair[0], pair[1]) + for pair in anchor['coordinates']]) + + +class Nodes(object): + + """Factory class for Node creation. 
Use Node for creation unless a special + class was registered for the kind of annotation we have.""" + + node_classes = { config.NAMED_ENTITY: EntityNode, + config.TIME_FRAME: TimeFrameNode } + + @classmethod + def new(cls, graph, view, annotation): + node_class = cls.node_classes.get(annotation.at_type.shortname, Node) + return node_class(graph, view, annotation) + + + +if __name__ == '__main__': + + graph = Graph(open(sys.argv[1]).read()) + print(graph) + #graph.pp() + #graph.nodes['v_7:st12'].pp() + #graph.nodes['v_2:s1'].pp() + #graph.nodes['v_4:tf1'].pp() + exit() + for node in graph.nodes.values(): + print(node.at_type.shortname, node.identifier, node.anchors) + + +''' + +Printing some graphs: + +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-1-full -p -a -v +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-2-no-view-links -p -a +uv run graph.py -i examples/input-v9.mmif -e dot -f png -o examples/dot-v9-3-no-anchor-to-doc -p + +''' diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py new file mode 100644 index 00000000..b340f35b --- /dev/null +++ b/mmif/utils/summarizer/summary.py @@ -0,0 +1,731 @@ +"""MMIF Summarizer + +MMIF consumer that creates a JSON summary from a MMIF file. + +Makes some simplifying assumptions, including: + +- There is one video in the MMIF documents list. All start and end properties + are pointing to that video. +- The time unit is assumed to be milliseconds. + +Other assumptions are listed with the options below. + + +USAGE: + + $ python summary.py [OPTIONS] + + Reads the MMIF file and creates a JSON summary file with the document list + and any requested extra information. + +Example: + + $ python summary -i input.mmif -o output.json --transcript + + Reads input.mmif and creates output.json with just transcript + information added to the documents list and the views. 
+ +In all cases, the summarizer will summarize what is there and use the information +that is there, if the output of CLAMS is bad, then the results of the summarizer +will be bad (although it may hide a lot of the badness). In some rare cases some +information is added. For example if the ASR tool does not group tokens then the +summarizer will do that, but then only by simply grouping in equal chunks and not +trying to infer sentence-like groupings. + +The summary always includes the MMIF version, the list of documents and a summary +of the metadata of all views (identifier, CLAMS app, timestamp, total number of +annotations and number of annotations per type, it does not show parameters and +application configuration). + + +OPTIONS: + +-i INFILE -o OUTFILE + +Run the summarizer over a single MMIF file and write the JSON summary to OUTFILE. + +-- timeframes + +Shows basic information of all timeframes. This groups the timeframes according to +the apps it was found in. + +--transcript + +Shows the text from the transcript in pseudo sentences. + +The transcript is taken from the last non-warning ASR view, so only the last added +transcript will be summarized. It is assumed that Tokens in the view are ordered on +text occurrence. + +--captions + +Shows captions from the Llava captioner app. + +--entities + +Include entities from spaCy or other NER. + +--full + +Include all the above. + +""" + +# TODO: +# - For the time unit we should really update get_start(), get_end() and other methods. 
+ + +import os, sys, io, json, argparse, pathlib +from collections import defaultdict + +from mmif.serialize import Mmif +from mmif.vocabulary import DocumentTypes + +from mmif.utils.summarizer.utils import CharacterList +from mmif.utils.summarizer.utils import get_aligned_tokens, timestamp +from mmif.utils.summarizer.utils import get_transcript_view, get_last_segmenter_view, get_captions_view +from mmif.utils.summarizer.graph import Graph +from mmif.utils.summarizer import config + + +VERSION = '0.2.0' + + +DEBUG = False + +def debug(*texts): + if DEBUG: + for text in texts: + sys.stderr.write(f'{text}\n') + + +class SummaryException(Exception): + pass + + +class Summary(object): + + """Implements the summary of a MMIF file. + + fname - name of the input mmif file + mmif - instance of mmif.serialize.Mmif + graph - instance of graph.Graph + documents - instance of Documents + views - instance of Views + transcript - instance of Transcript + timeframes - instance of TimeFrames + entities - instance of Entities + captions - instance of get_captions_view + + """ + + def __init__(self, mmif_file): + self.fname = mmif_file + #self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) + self.mmif = Mmif(pathlib.Path(mmif_file).read_text()) + self.warnings = [] + self.graph = Graph(self.mmif) + self.mmif_version = self.mmif.metadata['mmif'] + self.documents = Documents(self) + self.annotations = Annotations(self) + self.document = Document(self) + self.views = Views(self) + self.timeframes = TimeFrames(self) + self.timeframe_stats = TimeFrameStats(self) + self.transcript = Transcript(self) + self.captions = Captions(self) + self.entities = Entities(self) + self.validate() + self.print_warnings() + + def add_warning(self, warning: str): + self.warnings.append(warning) + + def validate(self): + """Minimal validation of the input. 
Mostly a place holder because all it + does now is to check how many video documents there are.""" + if len(self.video_documents()) > 1: + raise SummaryException("More than one video document in MMIF file") + + def video_documents(self): + return self.mmif.get_documents_by_type(DocumentTypes.VideoDocument) + + def report(self, outfile=None, html=None, full=False, timeframes=False, + transcript=False, captions=False, entities=False): + json_obj = { + 'mmif_version': self.mmif.metadata.mmif, + 'document': self.document.data, + 'documents': self.documents.data, + 'annotations': self.annotations.data, + 'views': self.views.data} + if transcript or full: + json_obj['transcript'] = self.transcript.data + if captions or full: + json_obj['captions'] = self.captions.as_json() + if timeframes or full: + json_obj['timeframes'] = self.timeframes.as_json() + json_obj['timeframe_stats'] = self.timeframe_stats.data + if entities or full: + json_obj['entities'] = self.entities.as_json() + report = json.dumps(json_obj, indent=2) + if outfile is None: + return report + else: + with open(outfile, 'w') as fh: + fh.write(report) + + def print_warnings(self): + for warning in self.warnings: + print(f'WARNING: {warning}') + + def pp(self): + self.documents.pp() + self.views.pp() + self.transcript.pp() + self.timeframes.pp() + self.entities.pp() + print() + + +class Documents(object): + + """Contains a list of document summaries, which are dictionaries with just + the id, type and location properties.""" + + def __init__(self, summary: Summary): + self.data = [self.summary(doc) for doc in summary.graph.documents] + + def __len__(self): + return len(self.data) + + @staticmethod + def summary(doc): + return { 'id': doc.id, + 'type': doc.at_type.shortname, + 'location': doc.location } + + def pp(self): + print('\nDocuments -> ') + for d in self.data: + print(' %s %s' % (d['type'], d['location'])) + + +class Annotations(object): + + """Contains a dictionary of Annotation object summaries, 
indexed on view + identifiers.""" + + def __init__(self, summary): + self.data = defaultdict(list) + # summary.graph.get_nodes(config.ANNOTATION, view_id=view.id) + for anno in summary.graph.get_nodes(config.ANNOTATION): + self.data[anno.view.id].append(anno.properties) + + def get(self, item): + return self.data.get(item, []) + + def get_all_annotations(self): + annotations = [] + for annos in self.data.values(): + annotations.extend(annos) + return annotations + + +class Document(object): + + """Collects some document-level information, including MMIF version, size of + the MMIF file and some information from the SWT document annotation.""" + + def __init__(self, summary): + self.data = { + 'mmif_version': summary.mmif_version, + 'size': os.path.getsize(summary.fname) } + annotations = summary.annotations.get_all_annotations() + if annotations: + # TODO: this if fragile because it assumes that the annotation we want + # (which is the one from SWT) is always the first + doc_level_annotation = annotations[0] + if 'fps' in doc_level_annotation: + self.data['fps'] = doc_level_annotation['fps'] + if 'frameCount' in doc_level_annotation: + self.data['frames'] = doc_level_annotation['frameCount'] + if 'duration' in doc_level_annotation: + duration = doc_level_annotation['duration'] + # both in milliseconds and as a timestamp + self.data['duration_ms'] = duration + self.data['duration_ts'] = timestamp(duration) + + +class Views(object): + + """Contains a list of view summaries, which are dictionaries with just + the id, app and timestamp properties.""" + + def __init__(self, summary): + self.summary = summary + self.data = [self.get_view_summary(view) for view in summary.mmif.views] + + def __getitem__(self, i): + return self.data[i] + + def __len__(self): + return len(self.data) + + #@staticmethod + def get_view_summary(self, view): + annotation_types = defaultdict(int) + for annotation in view.annotations: + annotation_types[annotation.at_type.shortname] += 1 + 
basic_info = { + 'id': view.id, + 'app': view.metadata.app, + 'timestamp': view.metadata.timestamp, + 'contains': [str(k) for k in view.metadata.contains.keys()], + 'annotation_count': len(view.annotations), + 'annotation_types': dict(annotation_types), + 'parameters': view.metadata.parameters, + 'appConfiguration': view.metadata.appConfiguration } + if view.metadata.warnings: + basic_info['warnings'] = view.metadata.warnings + if view.metadata.error: + basic_info['error'] = view.metadata.error + return basic_info + + def pp(self): + print('\nViews -> ') + for v in self.data: + print(' %s' % v['app']) + + +class Transcript(object): + + """The transcript contains the string value from the first text document in the + last ASR view. It issues a warning if there is more than one text document in + the view.""" + + def __init__(self, summary): + self.summary = summary + self.data = [] + view = get_transcript_view(summary.mmif.views) + if view is not None: + documents = view.get_documents() + if len(documents) > 1: + summary.add_warning(f'More than one TextDocument in ASR view {view.id}') + t_nodes = summary.graph.get_nodes(config.TOKEN, view_id=view.id) + s_nodes = summary.graph.get_nodes(config.SENTENCE, view_id=view.id) + if not t_nodes: + return + if s_nodes: + # Whisper has Sentence nodes + sentences = self.collect_targets(s_nodes) + sentence_ids = [n.identifier for n in s_nodes] + else: + # But Kaldi does not + sentences = self.create_sentences(t_nodes) + sentence_ids = [None] * len(sentences) + # initialize the transcripts with all blanks, most blanks will be + # overwrite with characters from the tokens + transcript = CharacterList(self.transcript_size(sentences)) + for s_id, s in zip(sentence_ids, sentences): + transcript_element = TranscriptElement(s_id, s, transcript) + self.data.append(transcript_element.as_json()) + + def __str__(self): + return str(self.data) + + @staticmethod + def transcript_size(sentences): + try: + return 
sentences[-1][-1].properties['end'] + except IndexError: + return 0 + + def collect_targets(self, s_nodes): + """For each node (in this context a sentence node), collect all target nodes + (which are tokens) and return them as a list of lists, with one list for each + node.""" + targets = [] + for node in s_nodes: + node_target_ids = node.properties['targets'] + node_targets = [self.summary.graph.get_node(stid) for stid in node_target_ids] + targets.append(node_targets) + return targets + + def create_sentences(self, t_nodes, sentence_size=12): + """If there is no sentence structure then we create it just by chopping th + input into slices of some pre-determined length.""" + # TODO: perhaps the size paramater should be set in the config file or via a + # command line option. + return [t_nodes[i:i + sentence_size] + for i in range(0, len(t_nodes), sentence_size)] + + +class TranscriptElement: + + """Utility class to handle data associated with an element from a transcript, + which is created from a sentence which is a list of Token Nodes. Initialization + has the side effect of populating the full transcript which is an instance of + CharacterList and which is also accessed here.""" + + def __init__(self, identifier: str, sentence: list, transcript: CharacterList): + for t in sentence: + # this adds the current token to the transcript + start = t.properties['start'] + end = t.properties['end'] + word = t.properties['word'] + transcript.set_chars(word, start, end) + self.id = identifier + self.start = sentence[0].anchors['time-offsets'][0] + self.end = sentence[-1].anchors['time-offsets'][1] + self.start_offset = sentence[0].properties['start'] + self.end_offset = sentence[-1].properties['end'] + self.text = transcript.getvalue(self.start_offset, self.end_offset) + + def __str__(self): + text = self.text if len(self.text) <= 50 else self.text[:50] + '...' 
+ return f'' + + def as_json(self): + json_obj = { + "start-time": self.start, + "end-time": self.end, + "text": self.text } + if self.id is not None: + json_obj["id"] = self.id + return json_obj + + +class Nodes(object): + + """Abstract class to store instances of subclasses of graph.Node. The + initialization methods of subclasses of Nodes can guard what nodes will + be allowed in, for example, as of July 2022 the TimeFrames class only + allowed time frames that had a frame type (thereby blocking the many + timeframes from Kaldi). + + Instance variables: + + summary - an instance of Summary + graph - an instance of graph.Graph, taken from the summary + nodes - list of instances of subclasses of graph.Node + + """ + + def __init__(self, summary): + self.summary = summary + self.graph = summary.graph + self.nodes = [] + + def __getitem__(self, i): + return self.nodes[i] + + def __len__(self): + return len(self.nodes) + + def add(self, node): + self.nodes.append(node) + + def get_nodes(self, **props): + """Return all the nodes that match the given properties.""" + def prop_check(p, v, props_given): + return v == props_given.get(p) if p in props_given else False + return [n for n in self + if all([prop_check(p, v, n.annotation.properties) + for p, v in props.items()])] + + +class TimeFrames(Nodes): + + """For now, we take only the TimeFrames that have a frame type, which rules out + all the frames we got from Kaldi.""" + + def __init__(self, summary): + super().__init__(summary) + # a dictionary mapping app names to lists of timeframe summaries + self.data = defaultdict(list) + for tf_node in self.graph.get_nodes(config.TIME_FRAME): + if tf_node.has_label(): + self.add(tf_node) + self._collect_timeframe_summaries() + self._sort_timeframe_summaries() + + def _collect_timeframe_summaries(self): + for tf in self.nodes: + label = tf.frame_type() + try: + start, end = tf.anchors['time-offsets'] + except KeyError: + # TODO: + # - this defies the notion of using the anchors 
for this, but + # maybe in this case we should go straight to the start/end + # - this code below also raises an error if there are no start + # and end properties + start = tf.properties['start'] + end = tf.properties['end'] + representatives = tf.representatives() + rep_tps = [rep.properties['timePoint'] for rep in representatives] + score = tf.properties.get('classification', {}).get(label) + app = tf.view.metadata.app + self.data[app].append( + { 'identifier': tf.identifier, 'label': label, 'score': score, + 'start-time': start, 'end-time': end, 'representatives': rep_tps }) + + def _sort_timeframe_summaries(self): + """Sort the data on their start time, do this for all apps.""" + for app in self.data: + sort_function = lambda x: x['start-time'] + self.data[app] = list(sorted(self.data[app], key=sort_function)) + + def as_json(self): + return self.data + + def pp(self): + print('\nTimeframes -> ') + for tf in self.nodes: + summary = tf.summary() + print(' %s:%s %s' % (summary['start'], summary['end'], + summary['frameType'])) + + +class TimeFrameStats(object): + + def __init__(self, summary): + # a dictionary mapping app names to frameType->duration dictionaries, + # where the duration is cumulative over all instances + self.timeframes = summary.timeframes + self.data = {} + self._collect_durations() + self._collect_other_morsels() + + def _collect_durations(self): + timeframes = self.timeframes.data + for app in timeframes: + self.data[app] = {} + for tf in timeframes[app]: + label = tf.get('label') + if label not in self.data[app]: + self.data[app][label] = {'count': 0, 'duration': 0} + self.data[app][label]['count'] += 1 + duration = tf['end-time'] - tf['start-time'] + if label is not None: + # TODO: these gave weird values for duration + #print('---',app, label, duration) + self.data[app][label]['duration'] += duration + duration = self.data[app][label]['duration'] + count = self.data[app][label]['count'] + self.data[app][label]['average'] = duration // 
count + + def _collect_other_morsels(self): + # First we want everything grouped by app and label + timeframes = self.timeframes.data + grouped_timeframes = defaultdict(lambda: defaultdict(list)) + for app in timeframes: + for tf in timeframes[app]: + label = tf.get('label') + grouped_timeframes[app][label].append(tf) + # The we pick the morsels for each label + for app in grouped_timeframes: + for label in grouped_timeframes[app]: + tfs = grouped_timeframes[app][label] + sort_on_start = lambda tf: tf['start-time'] + sort_on_length = lambda tf: tf['end-time'] - tf['start-time'] + first_tf = list(sorted(tfs, key=sort_on_start))[0] + longest_tf = list(sorted(tfs, key=sort_on_length, reverse=True))[0] + self.data[app][label]['first'] = first_tf['start-time'] + self.data[app][label]['longest'] = longest_tf['start-time'] + + +class Entities(Nodes): + + """Collecting instances of graph.EntityNode. + + nodes_idx - lists of instances of graph.EntityNode, indexed on entity text + { entity-string ==> list of graph.EntityNode } + bins - an instance of Bins + + """ + + def __init__(self, summary): + super().__init__(summary) + self.nodes_idx = {} + self.bins = None + for ent in self.graph.get_nodes(config.NAMED_ENTITY): + self.add(ent) + self._create_node_index() + self._group() + + def __str__(self): + return f'' + + def _create_node_index(self): + """Put all the entities from self.nodes in self.node_idx. 
This first puts + the nodes into the dictionary indexed on text string and then sorts the + list of nodes for each string on video position.""" + for ent in self: + self.nodes_idx.setdefault(ent.properties['text'], []).append(ent) + for text, entities in self.nodes_idx.items(): + self.nodes_idx[text] = sorted(entities, + key=(lambda e: e.start_in_video())) + + def _group(self): + """Groups all the nodes on the text and sorts them on position in the video, + for the latter it will also create bins of entities that occur close to each + other in the text.""" + # create the bins, governed by the summary's granularity + self.bins = Bins(self.summary) + for text, entities in self.nodes_idx.items(): + self.bins.current_bin = None + for entity in entities: + self.bins.add_entity(text, entity) + self.bins.mark_entities() + + def _add_tags(self, tags): + for tag in tags: + tag_doc = tag.properties['document'] + tag_p1 = tag.properties['start'] + tag_p2 = tag.properties['end'] + entities = self.nodes_idx.get(tag.properties['text'], []) + for entity in entities: + props = entity.properties + doc = props['document'] + p1 = props['start'] + p2 = props['end'] + if tag_doc == doc and tag_p1 == p1 and tag_p2 == p2: + entity.properties['tag'] = tag.properties['tagName'] + + def as_json(self): + json_obj = [] + for text in self.nodes_idx: + entity = {"text": text, "instances": []} + json_obj.append(entity) + for e in self.nodes_idx[text]: + entity["instances"].append(e.summary()) # e.summary(), E_PROPS) + return json_obj + + def pp(self): + print('\nEntities -> ') + for e in self.nodes_idx: + print(' %s' % e) + for d in self.nodes_idx[e]: + props = ["%s=%s" % (p, v) for p, v in d.summary().items()] + print(' %s' % ' '.join(props)) + + def print_groups(self): + for key in sorted(self.nodes_idx): + print(key) + for e in self.nodes_idx[key]: + print(' ', e, e.start_in_video()) + + +class Captions(Nodes): + + def __init__(self, summary): + super().__init__(summary) + self.captions = [] 
+ view = get_captions_view(summary.mmif.views) + if view is not None: + for doc in self.graph.get_nodes(config.TEXT_DOCUMENT, view_id=view.id): + text = doc.properties['text']['@value'].split('[/INST]')[-1] + debug( + f'>>> DOC {doc}', + f'>>> PROPS {list(doc.properties.keys())}', + f'>>> TEXT ' + text.replace("\n", "")[:100], + f'>>> ANCHORS {doc.anchors}') + if 'time-offsets' in doc.anchors: + # For older LLava-style captions + # http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97 + p1, p2 = doc.anchors['time-offsets'] + if 'representatives' in doc.anchors: + tp_id = doc.anchors["representatives"][0] + tp = summary.graph.get_node(tp_id) + self.captions.append( + { 'identifier': doc.identifier, + 'time-point': tp.properties['timePoint'], + 'text': text }) + if 'time-point' in doc.anchors: + # For newer SmolVLM-style captions + # http://apps.clams.ai/smolvlm2-captioner + self.captions.append( + { 'identifier': doc.identifier, + 'time-point': doc.anchors['time-point'], + 'text': text }) + + def as_json(self): + return self.captions + #return [(ident, p1, p2, text) for ident, p1, p2, text in self.captions] + + +class Bins(object): + + def __init__(self, summary): + self.summary = summary + self.bins = {} + self.current_bin = None + self.current_text = None + + def __str__(self): + return f'' + + def __len__(self): + return len(self.bins) + + def add_entity(self, text, entity): + """Add an entity instance to the appropriate bin.""" + if self.current_bin is None: + # Add the first instance of a new entity (as defined by the text), + # since it is the first a new bin will be created. + self.current_text = text + self.current_bin = Bin(entity) + self.bins[text] = [self.current_bin] + else: + # For following entities with the same text, a new bin may be + # created depending on the positions and the granularity. 
+ p1 = self.current_bin[-1].start_in_video() + p2 = entity.start_in_video() + # p3 = entity.end_in_video() + if p2 - p1 < config.GRANULARITY: + # TODO: should add p3 here + self.current_bin.add(entity) + else: + self.current_bin = Bin(entity) + self.bins[self.current_text].append(self.current_bin) + + def mark_entities(self): + """Marks all entities with the bin that they occur in. This is done to export + the grouping done with the bins to the entities and this way the bins never need + to be touched again.""" + # TODO: maybe use the bins when we create the output + for entity_bins in self.bins.values(): + for i, e_bin in enumerate(entity_bins): + for entity in e_bin: + entity.properties['group'] = i + + def print_bins(self): + for text in self.bins: + print(text) + text_bins = self.bins[text] + for i, text_bin in enumerate(text_bins): + text_bin.print_nodes(i) + print() + + +class Bin(object): + + def __init__(self, node): + # TODO: we are not using these yet, but a bin should have a begin and + # end in the video which should be derived from the start and end of + # entities in the video. The way we put things in bins now is a bit + # fragile since it depends on the start or end of the last element. 
+ self.start = 0 + self.end = 0 + self.nodes = [node] + + def __getitem__(self, i): + return self.nodes[i] + + def add(self, node): + self.nodes.append(node) + + def print_nodes(self, i): + for node in self.nodes: + print(' ', i, node) diff --git a/mmif/utils/summarizer/utils.py b/mmif/utils/summarizer/utils.py new file mode 100644 index 00000000..5920f8ce --- /dev/null +++ b/mmif/utils/summarizer/utils.py @@ -0,0 +1,301 @@ +"""Utility methods + +""" + +import io +from pathlib import Path +from xml.sax.saxutils import quoteattr, escape +from collections import UserList + +from mmif.utils.summarizer.config import KALDI, WHISPER, CAPTIONER, SEGMENTER +from mmif.utils.summarizer.config import TOKEN, ALIGNMENT, TIME_FRAME + + +def compose_id(view_id, anno_id): + """Composes the view identifier with the annotation identifier.""" + return anno_id if ':' in anno_id else view_id + ':' + anno_id + + +def type_name(annotation): + """Return the short name of the type.""" + return annotation.at_type.split('/')[-1] + + +def get_transcript_view(views): + """Return the last Whisper or Kaldi view that is not a warnings view.""" + # TODO: this now has a simplified idea of how to find a view, should at least + # move towards doing some regular expression matching on the WHISPER config + # setting. The same holds for other functions to get views. 
+ for view in reversed(views): + if view.metadata.app in KALDI + WHISPER: + if view.metadata.warnings: + continue + return view + return None + + +def get_captions_view(views): + """Return the last view created by the captioner.""" + for view in reversed(views): + if view.metadata.app in CAPTIONER: + if view.metadata.warnings: + continue + return view + return None + + +def get_last_segmenter_view(views): + for view in reversed(views): + # print(f'>>> {view.metadata.app}') + if view.metadata.app.startswith(SEGMENTER): + return view + return None + + +def get_aligned_tokens(view): + """Get a list of tokens from an ASR view where for each token we add a timeframe + properties which has the start and end points of the aligned timeframe.""" + idx = AnnotationsIndex(view) + for alignment in idx.get_annotations(ALIGNMENT).values(): + token = idx[TOKEN].get(alignment.properties['target']) + frame = idx[TIME_FRAME].get(alignment.properties['source']) + if token and frame: + # add a timeframe to the token, we can do this now that we do not + # freeze MMIF annotations anymore + token.properties['timeframe'] = (frame.properties['start'], + frame.properties['end']) + return idx.tokens + + +def timestamp(milliseconds: int, format='hh:mm:ss'): + # sometimes the milliseconds are not a usable float + if milliseconds in (None, -1): + return 'nil' + milliseconds = int(milliseconds) + seconds = milliseconds // 1000 + minutes = seconds // 60 + hours = minutes // 60 + ms = milliseconds % 1000 + s = seconds % 60 + m = minutes % 60 + if format == 'hh:mm:ss:mmm': + return f'{hours}:{m:02d}:{s:02d}.{ms:03d}' + elif format == 'hh:mm:ss': + return f'{hours}:{m:02d}:{s:02d}' + elif format == 'mm:ss': + return f'{m:02d}:{s:02d}' + elif format == 'mm:ss:mmm': + return f'{m:02d}:{s:02d}.{ms:03d}' + else: + return f'{hours}:{m:02d}:{s:02d}.{ms:03d}' + + + +class AnnotationsIndex: + + """Creates an index on the annotations list for a view, where each annotation type + is indexed on its identifier. 
Tokens are special and get their own list.""" + + def __init__(self, view): + self.view = view + self.idx = {} + self.tokens = [] + for annotation in view.annotations: + shortname = annotation.at_type.shortname + if shortname == TOKEN: + self.tokens.append(annotation) + self.idx.setdefault(annotation.at_type.shortname, {}) + self.idx[shortname][annotation.properties.id] = annotation + + def __str__(self): + return f'' + + def __getitem__(self, item): + return self.idx[item] + + def get_annotations(self, at_type): + return self.idx.get(at_type, {}) + + +class CharacterList(UserList): + + """Auxiliary datastructure to help print a list of tokens. It allows you to + back-engineer a sentence from the text and character offsets of the tokens.""" + + def __init__(self, n: int, char=' '): + self.size = n + self.char = char + self.data = n * [char] + + def __str__(self): + return f'' + + def __len__(self): + return self.size + + def __setitem__(self, key, value): + try: + self.data[key] = value + except IndexError: + for i in range(len(self), key + 1): + self.data.append(self.char) + self.data[key] = value + + def set_chars(self, text: str, start: int, end: int): + self.data[start:end] = text + + def getvalue(self, start: int, end: int): + return ''.join(self.data[start:end]) + + +def xml_tag(tag, subtag, objs, props, indent=' ') -> str: + """Return an XML string for a list of instances of subtag, grouped under tag.""" + s = io.StringIO() + s.write(f'{indent}<{tag}>\n') + for obj in objs: + s.write(xml_empty_tag(subtag, indent + ' ', obj, props)) + s.write(f'{indent}\n') + return s.getvalue() + + +def xml_empty_tag(tag_name: str, indent: str, obj: dict, props: tuple) -> str: + """Return an XML tag to an instance of io.StringIO(). 
Only properties from obj + that are in the props tuple are printed.""" + pairs = [] + for prop in props: + if prop in obj: + if obj[prop] is not None: + #pairs.append("%s=%s" % (prop, xml_attribute(obj[prop]))) + pairs.append(f'{prop}={xml_attribute(obj[prop])}') + attrs = ' '.join(pairs) + return f'{indent}<{tag_name} {attrs}/>\n' + + +def write_tag(s, tagname: str, indent: str, obj: dict, props: tuple): + """Write an XML tag to an instance of io.StringIO(). Only properties from obj + that are in the props tuple are printed.""" + pairs = [] + for prop in props: + if prop in obj: + if obj[prop] is not None: + pairs.append("%s=%s" % (prop, xml_attribute(obj[prop]))) + s.write('%s<%s %s/>\n' + % (indent, tagname, ' '.join(pairs))) + + +def xml_attribute(attr): + """Return attr as an XML attribute.""" + return quoteattr(str(attr)) + + +def xml_data(text): + """Return text as XML data.""" + return escape(str(text)) + + +def XXXflatten_paths(paths): + """Take paths implemented as singly linked lists and return regular lists.""" + return [flatten_path(path) for path in paths] + + +def XXXflatten_path(path): + """Take a path implemented as singly linked lists and return a regular list.""" + while path: + if len(path) == 1: + return path + else: + #print('>>>', len(path)) + #for x in path: + # print(' ', x) + first, rest = path + return [first] + flatten_path(rest) + + +def XXXprint_paths(paths, indent=''): + """Print paths, which may be flattened.""" + for path in paths: + print(indent, end='') + print_path(path) + print() + + +def XXXprint_path(p): + if isinstance(p, list): + print('[', end=' ') + for e in p: + print_path(e) + print(']', end=' ') + else: + print(p, end=' ') + + +def normalize_id(doc_ids: list, view: 'View', annotation: 'Annotation'): + """Change identifiers to include the view identifier if it wasn't included, + do nothing otherwise. This applies to the Annotation id, target, source, + document, targets and representatives properties. 
Note that timePoint is + not included because the value is an integer and not an identifier.""" + # TODO: this seems somewhat fragile + # TODO: spell out what doc_ids is for (to exclude source documents I think) + debug = False + attype = annotation.at_type.shortname + props = annotation.properties + if ':' not in annotation.id and view is not None: + if annotation.id not in doc_ids: + newid = f'{view.id}:{annotation.id}' + annotation.properties['id'] = newid + if 'document' in props: + doc_id = props['document'] + if ':' not in doc_id and view is not None: + if doc_id not in doc_ids: + props['document'] = f'{view.id}:{doc_id}' + if 'targets' in props: + new_targets = [] + for target in props['targets']: + if ':' not in target and view is not None: + if target not in doc_ids: + new_targets.append(f'{view.id}:{target}') + else: + new_targets.append(target) + props['targets'] = new_targets + if 'representatives' in props: + new_representatives = [] + for rep in props['representatives']: + if ':' not in rep and view is not None: + new_representatives.append(f'{view.id}:{rep}') + else: + new_representatives.append(rep) + props['representatives'] = new_representatives + if attype == 'Alignment': + if ':' not in props['source'] and view is not None: + if props['source'] not in doc_ids: + props['source'] = f'{view.id}:{props["source"]}' + if ':' not in props['target'] and view is not None: + if props['target'] not in doc_ids: + props['target'] = f'{view.id}:{props["target"]}' + if debug: + print('===', annotation) + + +def get_annotations_from_view(view, annotation_type): + """Return all annotations from a view that match the short name of the + annotation type.""" + # Note: there is method mmif.View.get_annotations() where you can give + # at_type as a parameter, but it requires a full match. 
+ return [a for a in view.annotations + if a.at_type.shortname == annotation_type] + + +def find_matching_tokens(tokens, ne): + matching_tokens = [] + ne_start = ne.properties["start"] + ne_end = ne.properties["end"] + start_token = None + end_token = None + for token in tokens: + if token.properties['start'] == ne_start: + start_token = token + if token.properties['end'] == ne_end: + end_token = token + return start_token, end_token + + From 4c7c6a6ecca02a501ffcafe846ea8f55f28e5ecf Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Thu, 8 Jan 2026 12:09:03 -0500 Subject: [PATCH 04/48] Added notes on how to add a CLI script and added the summarizer to the doc modules --- documentation/modules.rst | 1 + mmif/utils/cli/README.md | 71 +++++++++++++++++++++++++++++++++++++ mmif/utils/cli/summarize.py | 2 +- 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 mmif/utils/cli/README.md diff --git a/documentation/modules.rst b/documentation/modules.rst index 4bb9307d..e32ade5d 100644 --- a/documentation/modules.rst +++ b/documentation/modules.rst @@ -9,6 +9,7 @@ mmif package autodoc/mmif.serialize autodoc/mmif.vocabulary autodoc/mmif.utils + autodoc/mmif.utils.summarizer mmif_docloc_http package ======================== diff --git a/mmif/utils/cli/README.md b/mmif/utils/cli/README.md new file mode 100644 index 00000000..6d04438d --- /dev/null +++ b/mmif/utils/cli/README.md @@ -0,0 +1,71 @@ +# MMIF CLI Scripts + +This directory contains CLI scripts like `source` and `rewind` that can be called from the command line. These scripts are called as subcommands of the `mmif` CLI script, for example `mmif source --help`. + + +## Adding another CLI script + +To add a CLI script all you need to do is add a python module to `mmif/utils/cli` and make sure it has the following three methods: + +1. `prep_argparser(**kwargs)` to define and return an instance of `argparse.ArgumentParser`. + +2. `describe_argparser()` to return a pair of strings that describe the script. 
The first string is a one-line description of the argument parser and the second a more verbose description. These will be shown for `mmif --help` and `mmif subcommand --help` respectively. + +3. `main(args)` to do the actual work of running the code + +See the current CLI scripts for examples. + + +## Some background + +The mmif-python package has a particular way to deal with CLI utility scripts. All scripts live in the mmif.utils.cli package. The `mmif/__init__.py` module has the `cli()` function which illustrates the requirements on utility scripts: + +```python +def cli(): + parser, subparsers = prep_argparser_and_subcmds() + cli_modules = {} + for cli_module in find_all_modules('mmif.utils.cli'): + cli_module_name = cli_module.__name__.rsplit('.')[-1] + cli_modules[cli_module_name] = cli_module + subcmd_parser = cli_module.prep_argparser(add_help=False) + subparsers.add_parser(cli_module_name, parents=[subcmd_parser], + help=cli_module.describe_argparser()[0], + description=cli_module.describe_argparser()[1], + formatter_class=argparse.RawDescriptionHelpFormatter) + if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + args = parser.parse_args() + if args.subcmd not in cli_modules: + parser.print_help(sys.stderr) + else: + cli_modules[args.subcmd].main(args) +``` + + + +You can see the invocations of the three functions mentioned above. + +The `prep_argparser()` function uses `find_all_modules()`, which finds modules in the top-level of the cli package. That module could have all the code needed for the CLI to work, but it could refer to other modules as well. For example, the `summary.py` script is in `cli`, but it imports the summary utility from `mmif.utls`. 
+ +In the setup.py script there is this passage towards the end of the file: + +```python + entry_points={ + 'console_scripts': [ + 'mmif = mmif.__init__:cli', + ], + }, +``` + +This leaves it up to the `cli()` method to find the scripts and this is why just adding a submodule as mentioned above works. Note that the initialization file of the cli package imports two of the commandline related scripts: + +```python +from mmif.utils.cli import rewind +from mmif.utils.cli import source +``` + +These may be used somewhere, but they are not necessary to run MMIF CLI scripts. + diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index 8b88c53d..c8e384c0 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -12,7 +12,7 @@ def describe_argparser() -> tuple: respectively. For now they return the same string. The retun value should still be a tuple because mmif.cli() depends on it. """ - oneliner = 'provides a CLI to create a JSON Summary for a MMIF file' + oneliner = 'Create a JSON Summary for a MMIF file' return oneliner, oneliner From 7cc973d2576637c1e11292289dd4de9519f1fa8c Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Thu, 8 Jan 2026 13:25:48 -0500 Subject: [PATCH 05/48] Removed some deprecated methods because they broke the coverage tests of the pull request --- mmif/utils/summarizer/utils.py | 36 ---------------------------------- 1 file changed, 36 deletions(-) diff --git a/mmif/utils/summarizer/utils.py b/mmif/utils/summarizer/utils.py index 5920f8ce..95b15b86 100644 --- a/mmif/utils/summarizer/utils.py +++ b/mmif/utils/summarizer/utils.py @@ -193,42 +193,6 @@ def xml_data(text): return escape(str(text)) -def XXXflatten_paths(paths): - """Take paths implemented as singly linked lists and return regular lists.""" - return [flatten_path(path) for path in paths] - - -def XXXflatten_path(path): - """Take a path implemented as singly linked lists and return a regular list.""" - while path: - if len(path) == 1: - return path 
- else: - #print('>>>', len(path)) - #for x in path: - # print(' ', x) - first, rest = path - return [first] + flatten_path(rest) - - -def XXXprint_paths(paths, indent=''): - """Print paths, which may be flattened.""" - for path in paths: - print(indent, end='') - print_path(path) - print() - - -def XXXprint_path(p): - if isinstance(p, list): - print('[', end=' ') - for e in p: - print_path(e) - print(']', end=' ') - else: - print(p, end=' ') - - def normalize_id(doc_ids: list, view: 'View', annotation: 'Annotation'): """Change identifiers to include the view identifier if it wasn't included, do nothing otherwise. This applies to the Annotation id, target, source, From 8b310ebeb78ecf51122880978c1d521296c25f82 Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Thu, 8 Jan 2026 13:39:48 -0500 Subject: [PATCH 06/48] Type checker from the coverage test does not like string-valued type hints --- mmif/utils/summarizer/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmif/utils/summarizer/utils.py b/mmif/utils/summarizer/utils.py index 95b15b86..61c3bc8b 100644 --- a/mmif/utils/summarizer/utils.py +++ b/mmif/utils/summarizer/utils.py @@ -7,6 +7,7 @@ from xml.sax.saxutils import quoteattr, escape from collections import UserList +from mmif import View, Annotation from mmif.utils.summarizer.config import KALDI, WHISPER, CAPTIONER, SEGMENTER from mmif.utils.summarizer.config import TOKEN, ALIGNMENT, TIME_FRAME @@ -193,7 +194,7 @@ def xml_data(text): return escape(str(text)) -def normalize_id(doc_ids: list, view: 'View', annotation: 'Annotation'): +def normalize_id(doc_ids: list, view: View, annotation: Annotation): """Change identifiers to include the view identifier if it wasn't included, do nothing otherwise. This applies to the Annotation id, target, source, document, targets and representatives properties. 
Note that timePoint is From cca671496eb0e45db93d5e4281808bdcc662ec27 Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Thu, 8 Jan 2026 14:55:30 -0500 Subject: [PATCH 07/48] More cleanup and fixes for code coverage tests --- mmif/utils/summarizer/graph.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py index ae11b9be..642db16a 100644 --- a/mmif/utils/summarizer/graph.py +++ b/mmif/utils/summarizer/graph.py @@ -357,27 +357,6 @@ def _get_document(self): return None return None - def XXX_get_document_plus_span(self): - self.pp() - props = self.properties - return "%s:%s:%s" % (self.document.identifier, - props['start'], props['end']) - - def XXXpaths_to_docs(self): - """Return all the paths from the node to documents.""" - paths = self._paths_to_docs() - return flatten_paths(paths) - - def XXX_paths_to_docs(self): - paths = [] - if not self.targets: - return [[self]] - for t in self.targets: - paths.append([self]) - for i, target in enumerate(self.targets): - paths[i].extend(target._paths_to_docs()) - return paths - def summary(self): """The default summary is just the identfier, this should typically be overriden by sub classes.""" @@ -464,8 +443,8 @@ def start_in_video(self): def end_in_video(self): return self.anchor().get('video-end') - def pp(self): - super().pp(close=False) + def pp(self, close=False): + super().pp(close=close) try: for i, p in enumerate(self.paths_to_docs()): print(' %s' % ' '.join([str(n) for n in p[1:]])) @@ -478,10 +457,6 @@ def summary(self): the entity occurs, it is not enough to just give the text document.""" # TODO: in the old days this used an anchor() method which was fragile # TODO: revamping it now - - #anchor = self.anchor() - #self.document.pp() -# print('...', self.document.anchors return { 'id': self.identifier, 'group': self.properties['group'], From d2498f8a65975b64de7fdd2619a146e6110f96f2 Mon Sep 17 00:00:00 2001 
From: Marc Verhagen Date: Thu, 8 Jan 2026 15:08:02 -0500 Subject: [PATCH 08/48] More changes to satisfy typing requirements from the code coverage tests --- mmif/utils/summarizer/graph.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py index 642db16a..3f0b18c9 100644 --- a/mmif/utils/summarizer/graph.py +++ b/mmif/utils/summarizer/graph.py @@ -8,8 +8,6 @@ from mmif.utils.summarizer import config from mmif.utils.summarizer.utils import compose_id, normalize_id -#from summarizer.utils import compose_id, flatten_paths, normalize_id - class Graph(object): @@ -86,7 +84,9 @@ def add_edge(self, view, alignment): def get_node(self, node_id): return self.nodes.get(node_id) - def get_nodes(self, short_at_type: str, view_id : str = None): + # def get_nodes(self, short_at_type: str, view_id : str = None): + # replaced the above because the code coverage is picky on type hints + def get_nodes(self, short_at_type: str, view_id=None): """Get all nodes for an annotation type, using the short form. If a view identifier is provided then only include nodes from that view.""" return [node for node in self.nodes.values() @@ -443,6 +443,10 @@ def start_in_video(self): def end_in_video(self): return self.anchor().get('video-end') + ''' + Commented this out because the type checking in the code coverage tests requires + the default vaue for the close parameter to be the same as on Node.pp(). 
+ def pp(self, close=False): super().pp(close=close) try: @@ -451,6 +455,7 @@ def pp(self, close=False): except ValueError: print(' WARNING: error in path_to_docs in NamedEntityNode.pp()') print('-' * 80) + ''' def summary(self): """The summary for entities needs to include where in the video or image From 51983c5f3d443d15749dd7986bae162429693c8a Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Mon, 12 Jan 2026 14:10:14 -0500 Subject: [PATCH 09/48] Making code pass pytype in Python 3.10, 3.11 and 3.12 ; some cleanup. --- mmif/serialize/mmif.py | 6 +- mmif/utils/cli/summarize.py | 16 +- mmif/utils/summarizer/__init__.py | 16 +- mmif/utils/summarizer/graph.py | 385 ++---------------------------- mmif/utils/summarizer/nodes.py | 370 ++++++++++++++++++++++++++++ mmif/utils/summarizer/summary.py | 40 ++-- 6 files changed, 414 insertions(+), 419 deletions(-) create mode 100644 mmif/utils/summarizer/nodes.py diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 9e94496d..c6fa4c62 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -15,7 +15,7 @@ import warnings from collections import defaultdict from datetime import datetime -from typing import List, Union, Optional, Dict, cast, Iterator +from typing import Any, List, Union, Optional, Dict, cast, Iterator import jsonschema.validators @@ -487,11 +487,11 @@ def get_documents_in_view(self, vid: Optional[str] = None) -> List[Document]: else: return [] - def get_documents_by_type(self, doc_type: Union[str, DocumentTypes]) -> List[Document]: + def get_documents_by_type(self, doc_type: Any) -> List[Document]: """ Method to get all documents where the type matches a particular document type, which should be one of the CLAMS document types. - :param doc_type: the type of documents to search for, must be one of ``Document`` type defined in the CLAMS vocabulary. + :param doc_type: the type of documents to search for, must be one of ``Document`` types defined in the CLAMS vocabulary. 
:return: a list of documents matching the requested type, or an empty list if none found. """ docs = [] diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index c8e384c0..06c1afae 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -22,22 +22,10 @@ def prep_argparser(**kwargs): formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) parser.add_argument("-i", metavar='MMIF_FILE', help='input MMIF file', required=True) - parser.add_argument("-o", metavar='JSON_FILE', help='output JSON summary file', required=True) - parser.add_argument("--full", action="store_true", help="print full report") - parser.add_argument('--transcript', action='store_true', help='include transcript') - parser.add_argument('--captions', action='store_true', help='include Llava captions') - parser.add_argument('--timeframes', action='store_true', help='include all time frames') - parser.add_argument('--entities', action='store_true', help='include entities from transcript') + parser.add_argument("-o", metavar='OUTPUT_FILE', help='output JSON summary file', required=True) return parser def main(args): - #print('>>>', args) mmif_summary = Summary(args.i) - #print('>>>', mmif_summary) - mmif_summary.report( - outfile=args.o, full=args.full, - #timeframes=args.timeframes, transcript=args.transcript, - #captions=args.captions, entities=args.entities - ) - + mmif_summary.report(outfile=args.o) diff --git a/mmif/utils/summarizer/__init__.py b/mmif/utils/summarizer/__init__.py index 59a980fe..1122d449 100644 --- a/mmif/utils/summarizer/__init__.py +++ b/mmif/utils/summarizer/__init__.py @@ -8,11 +8,6 @@ def argparser(): parser = argparse.ArgumentParser(description='Create a JSON Summary for a MMIF file') parser.add_argument('-i', metavar='MMIF_FILE', help='input MMIF file', required=True) parser.add_argument('-o', metavar='JSON_FILE', help='output JSON summary file', required=True) - parser.add_argument('--full', action='store_true', help='create 
full report') - parser.add_argument('--transcript', action='store_true', help='include transcript') - parser.add_argument('--captions', action='store_true', help='include Llava captions') - parser.add_argument('--timeframes', action='store_true', help='include all time frames') - parser.add_argument('--entities', action='store_true', help='include entities from transcript') return parser @@ -26,14 +21,10 @@ def main(): args = parser.parse_args() #pp_args(args) mmif_summary = Summary(args.i) - mmif_summary.report( - outfile=args.o, full=args.full, - timeframes=args.timeframes, transcript=args.transcript, - captions=args.captions, entities=args.entities) + mmif_summary.report(outfile=args.o) """ - There used to be an option to process a whole directory, but I never used it and decided that if needed it would better be done by an extra script or a separate function. @@ -45,8 +36,5 @@ def main(): print(mmif_file) json_file = str(mmif_file)[:-4] + 'json' mmif_summary = Summary(mmif_file.read_text()) - mmif_summary.report( - outfile=json_file, full=args.full, - timeframes=args.timeframes, transcript=args.transcript, - captions=args.captions, entities=args.entities) + mmif_summary.report(outfile=json_file) """ \ No newline at end of file diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py index 3f0b18c9..55c38ffd 100644 --- a/mmif/utils/summarizer/graph.py +++ b/mmif/utils/summarizer/graph.py @@ -4,10 +4,12 @@ from pathlib import Path import argparse +from typing import Any from mmif import Mmif from mmif.utils.summarizer import config from mmif.utils.summarizer.utils import compose_id, normalize_id +from mmif.utils.summarizer.nodes import Node, Nodes, EntityNode, TimeFrameNode class Graph(object): @@ -20,7 +22,9 @@ class Graph(object): The goal for the graph is to store all useful annotation and to have simple ways to trace nodes all the way up to the primary data.""" - def __init__(self, mmif): + def __init__(self, mmif: Any): + # TODO: the type 
hint should really be "MMif | str", but pytype did not + # like that. self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) self.documents = [] self.nodes = {} @@ -74,14 +78,18 @@ def add_edge(self, view, alignment): #print(alignment.id, source_id, target_id) source = self.get_node(source_id) target = self.get_node(target_id) - # make sure the direction goes from token or textdoc to annotation - if target.annotation.at_type.shortname in (config.TOKEN, config.TEXT_DOCUMENT): - source, target = target, source - source.targets.append(target) - source.add_anchors_from_alignment(target) - target.add_anchors_from_alignment(source) - - def get_node(self, node_id): + if source is None or target is None: + print('WARNING: could not add edge ', + 'because the source and/or target does not extst') + else: + # make sure the direction goes from token or textdoc to annotation + if target.annotation.at_type.shortname in (config.TOKEN, config.TEXT_DOCUMENT): + source, target = target, source + source.targets.append(target) + source.add_anchors_from_alignment(target) + target.add_anchors_from_alignment(source) + + def get_node(self, node_id) -> Node | None: return self.nodes.get(node_id) # def get_nodes(self, short_at_type: str, view_id : str = None): @@ -116,7 +124,7 @@ def trim(self, start: int, end: int): new_nodes = [n for n in self.nodes.values() if not n.identifier in remove] self.nodes = { node.identifier: node for node in new_nodes } - def pp(self, fname=None,skip_timepoints=False): + def pp(self, fname=None, skip_timepoints=False): fh = sys.stdout if fname is None else open(fname, 'w') fh.write("%s\n" % self) for view in self.mmif.views: @@ -174,7 +182,7 @@ def __len__(self): def __str__(self): return f'' - def get_tokens_for_node(self, node): + def get_tokens_for_node(self, node: Node): """Return all tokens included in the span of a node.""" doc = node.document.identifier try: @@ -196,361 +204,6 @@ def pp(self, fname=None): fh.write(' %s %s\n' % (t[0], t[1])) -class 
Node(object): - - def __init__(self, graph, view, annotation): - self.graph = graph - self.view = view - self.view_id = None if self.view is None else self.view.id - self.annotation = annotation - # copy some information from the Annotation - self.at_type = annotation.at_type - self.identifier = annotation.id - self.properties = json.loads(str(annotation.properties)) - # get the document from the view or the properties - self.document = self._get_document() - # The targets property contains a list of annotations or documents that - # the node content points to. This includes the document the annotation - # points to as well as the alignment from a token or text document to a - # bounding box or time frame (which is added later). - # TODO: the above does not seem to be true since there is no evidence of - # data from alignments being added. - self.targets = [] if self.document is None else [self.document] - self.anchors = {} - self.add_local_anchors() - self.add_anchors_from_targets() - - def __str__(self): - anchor = '' - if self.at_type.shortname == config.TOKEN: - anchor = " %s:%s '%s'" % (self.properties['start'], - self.properties['end'], - self.properties.get('text','').replace('\n', '\\n')) - return "<%s %s%s>" % (self.at_type.shortname, self.identifier, anchor) - - def add_local_anchors(self): - """Get the anchors that you can get from the annotation itself, which - includes the start and end offsets, the coordinates, the timePoint of - a BoundingBox and any annotation with targets.""" - props = self.properties - attype = self.annotation.at_type.shortname - if 'start' in props and 'end' in props: - # TimeFrame is the only non-character based interval so this simple - # if-then-else should work - if attype == config.TIME_FRAME: - self.anchors['text-offsets'] = (props['start'], props['end']) - else: - self.anchors['time-offsets'] = (props['start'], props['end']) - if 'coordinates' in props: - self.anchors['coordinates'] = props['coordinates'] - if 'timePoint' 
in props: - self.anchors['time-point'] = props['timePoint'] - if 'targets' in props: - self.anchors['targets'] = props['targets'] - - def add_anchors_from_targets(self): - """Get start and end offsets or timePoints from the targets and add them to - the anchors, but only if there were no anchors on the node already. This has - two cases: one for TimeFrames and one for text intervals.""" - props = self.properties - attype = self.annotation.at_type.shortname - if 'targets' in props: - try: - t1 = self.graph.nodes[props['targets'][0]] - t2 = self.graph.nodes[props['targets'][-1]] - if attype == config.TIME_FRAME: - if not 'time-offsets' in props: - self.anchors['time-offsets'] = ( - t1.properties['timePoint'], t2.properties['timePoint']) - else: - if not 'text-offsets' in props: - self.anchors['text-offsets'] = ( - t1.properties['start'], t2.properties['end']) - except IndexError: - print(f'WARNING: Unexpected empty target list for {self.identifier}') - - def add_anchors_from_alignment(self, target: None, debug=False): - source_attype = self.at_type.shortname - target_attype = target.at_type.shortname - if debug: - print('\n@ DEBUG SOURCE->TARGET ', source_attype, target_attype) - print('@ DEBUG SOURCE.PROPS ', list(self.properties.keys())) - print('@ DEBUG TARGET.PROPS ', list(target.properties.keys())) - print('@ DEBUG TARGET.ANCHORS ', target.anchors) - # If a TextDocument is aligned to a BoundingBox then we grab the coordinates - # TODO: how are we getting the time point? 
- if source_attype == 'TextDocument' and target_attype == 'BoundingBox': - if 'coordinates' in target.properties: - self.anchors['coordinates'] = target.properties['coordinates'] - #print(source_attype, self.anchors) - elif source_attype == 'BoundingBox' and target_attype == 'TextDocument': - pass - # If a TextDocument is aligned to a TimeFrame then we copy time anchors - # but also targets and representatives, the latter because some alignments - # are not precise - elif source_attype == 'TextDocument' and target_attype == 'TimeFrame': - if 'start' in target.properties and 'end' in target.properties: - self.anchors['time-offsets'] = (target.properties['start'], - target.properties['end']) - if 'time-offsets' in target.anchors: - # TODO: is this ever used? - self.anchors['time-offsets'] = target.anchors['time-offsets'] - if 'targets' in target.properties: - self.anchors['targets'] = target.properties['targets'] - if 'representatives' in target.properties: - self.anchors['representatives'] = target.properties['representatives'] - #print('-', source_attype, self.anchors, self, target) - elif source_attype == 'TimeFrame' and target_attype == 'TextDocument': - pass - # Simply copy the time point - elif source_attype == 'TextDocument' and target_attype == 'TimePoint': - self.anchors['time-point'] = target.anchors['time-point'] - if debug: - print('+ ADDED SOURCE.ANCHORS ', self.anchors) - # For Token-TimeFrame alignments all we need are the start and end time points - elif source_attype == 'Token' and target_attype == 'TimeFrame': - if 'start' in target.properties and 'end' in target.properties: - self.anchors['time-offsets'] = (target.properties['start'], - target.properties['end']) - #print(source_attype, self.anchors) - elif source_attype == 'TimeFrame' and target_attype == 'Token': - pass - # TODO: check whether some action is needed for the next options - elif source_attype == 'TextDocument' and target_attype == 'VideoDocument': - pass - elif source_attype == 
'VideoDocument' and target_attype == 'TextDocument': - pass - elif source_attype == 'BoundingBox' and target_attype == 'TimePoint': - pass - elif source_attype =='TimePoint' and target_attype == 'BoundingBox': - pass - elif source_attype == 'BoundingBox' and target_attype in ('Token', 'Sentence', 'Paragraph'): - pass - elif source_attype in ('Token', 'Sentence', 'Paragraph') and target_attype == 'BoundingBox': - pass - elif source_attype == 'TextDocument' and target_attype == 'TimePoint': - pass - elif source_attype == 'TimePoint' and target_attype == 'TextDocument': - pass - else: - print('-', source_attype, target_attype) - #if debug: - # print('DEBUG', self.anchors) - - def _get_document(self): - """Return the document or annotation node that the annotation/document in - the node refers to via the document property. This could be a local property - or a metadata property if there is no such local property. Return None - if neither of those exist.""" - # try the local property - docid = self.properties.get('document') - if docid is not None: - # print('>>>', docid, self.graph.get_node(docid)) - return self.graph.get_node(docid) - # try the metadata property - if self.view is not None: - try: - metadata = self.view.metadata.contains[self.at_type] - docid = metadata['document'] - return self.graph.get_node(docid) - except KeyError: - return None - return None - - def summary(self): - """The default summary is just the identfier, this should typically be - overriden by sub classes.""" - return { 'id': self.identifier } - - def pp(self, close=True): - print('-' * 80) - print(self) - print(f' document = {self.document}') - for prop in self.properties: - print(f' {prop} = {self.properties[prop]}') - print(' targets = ') - for target in self.targets: - print(' ', target) - print(' anchors = ') - for anchor in self.anchors: - print(f' {anchor} -> {self.anchors[anchor]}') - if close: - print('-' * 80) - - -class TimeFrameNode(Node): - - def __str__(self): - frame_type = ' 
' + self.frame_type() if self.has_label() else '' - return ('' - % (self.identifier, self.start(), self.end(), frame_type)) - - def start(self): - return self.properties.get('start', -1) - - def end(self): - return self.properties.get('end', -1) - - def frame_type(self): - # TODO: rename this, uses old property since replaced by "label"" - # NOTE: this is still aloowing for the old property though - return self.properties.get('label') or self.properties.get('frameType') - - def has_label(self): - return self.frame_type() is not None - - def representatives(self) -> list: - """Return a list of the representative TimePoints.""" - # TODO: why could I not get this from the anchors? - rep_ids = self.properties.get('representatives', []) - reps = [self.graph.get_node(rep_id) for rep_id in rep_ids] - return reps - - def summary(self): - """The summary of a time frame just contains the identifier, start, end - and frame type.""" - return { 'id': self.identifier, - 'start': self.properties['start'], - 'end': self.properties['end'], - 'frameType': self.properties.get('frameType') } - - -class EntityNode(Node): - - def __init__(self, graph, view, annotation): - super().__init__(graph, view, annotation) - self.tokens = [] - self._paths = None - self._anchor = None - - def __str__(self): - try: - start = self.properties['start'] - end = self.properties['end'] - except KeyError: - start, end = self.anchors['text-offsets'] - return ("" - % (self.identifier, start, end, self.properties['text'])) - - def start_in_video(self): - #print('+++', self.document.properties) - try: - return self.document.anchors['time-point'] - except KeyError: - return -1 - #return self.anchor()['video-start'] - - def end_in_video(self): - return self.anchor().get('video-end') - - ''' - Commented this out because the type checking in the code coverage tests requires - the default vaue for the close parameter to be the same as on Node.pp(). 
- - def pp(self, close=False): - super().pp(close=close) - try: - for i, p in enumerate(self.paths_to_docs()): - print(' %s' % ' '.join([str(n) for n in p[1:]])) - except ValueError: - print(' WARNING: error in path_to_docs in NamedEntityNode.pp()') - print('-' * 80) - ''' - - def summary(self): - """The summary for entities needs to include where in the video or image - the entity occurs, it is not enough to just give the text document.""" - # TODO: in the old days this used an anchor() method which was fragile - # TODO: revamping it now - return { - 'id': self.identifier, - 'group': self.properties['group'], - 'cat': self.properties['category'], - 'document': self.document.identifier, - # Entities in a TextDocument that is a full transcript without any - # alignments do not have a TimePoint - #'time-point': self.document.anchors.get('time-point'), - #'text-offsets': self.anchors.get('text-offsets'), - 'time-point': self.document.anchors.get('time-point', -1), - 'text-offsets': self.anchors.get('text-offsets', (-1 ,-1)), - #'document': self._get_document_plus_span(), - #'video-start': anchor.get('video-start'), - #'video-end': anchor.get('video-end'), - #'coordinates': self._coordinates_as_string(anchor) - } - - def anchor(self): - """The anchor is the position in the video that the entity is linked to. - This anchor cannot be found in the document property because that points - to a text document that was somehow derived from the video document. Some - graph traversal is needed to get the anchor, but we know that the anchor - is always a time frame or a bounding box. - """ - # TODO: deal with the case where the primary document is not a video - self.paths = self.paths_to_docs() - bbtf = self.find_boundingbox_or_timeframe() - # for path in paths: - # print('... 
[') - # for n in path: print(' ', n) - # print('===', bbtf) - if bbtf.at_type.shortname == config.BOUNDING_BOX: - return {'video-start': bbtf.properties['timePoint'], - 'coordinates': bbtf.properties['coordinates']} - elif bbtf.at_type.shortname == config.TIME_FRAME: - return {'video-start': bbtf.properties['start'], - 'video-end': bbtf.properties['end']} - - def anchor2(self): - """The anchor is the position in the video that the entity is linked to. - This anchor cannot be found in the document property because that points - to a text document that was somehow derived from the video document. Some - graph traversal is needed to get the anchor, but we know that the anchor - is always a time frame or a bounding box. - """ - # TODO: with this version you get an error that the paths variable does - # not exist yet, must get a clearer picture on how to build a graph - # where nodes have paths to anchors - # TODO: deal with the case where the primary document is not a video - if self._anchor is None: - self._paths = self.paths_to_docs() - bbtf = self.find_boundingbox_or_timeframe() - # for path in self._paths: - # print('... [') - # for n in path: print(' ', n) - # print('===', bbtf) - if bbtf.at_type.shortname == config.BOUNDING_BOX: - self._anchor = {'video-start': bbtf.properties['timePoint'], - 'coordinates': bbtf.properties['coordinates']} - elif bbtf.at_type.shortname == config.TIME_FRAME: - self._anchor = {'video-start': bbtf.properties['start'], - 'video-end': bbtf.properties['end']} - return self._anchor - - def find_boundingbox_or_timeframe(self): - return self.paths[-1][-2] - - @staticmethod - def _coordinates_as_string(anchor): - if 'coordinates' not in anchor: - return None - return ','.join(["%s:%s" % (pair[0], pair[1]) - for pair in anchor['coordinates']]) - - -class Nodes(object): - - """Factory class for Node creation. 
Use Node for creation unless a special - class was registered for the kind of annotation we have.""" - - node_classes = { config.NAMED_ENTITY: EntityNode, - config.TIME_FRAME: TimeFrameNode } - - @classmethod - def new(cls, graph, view, annotation): - node_class = cls.node_classes.get(annotation.at_type.shortname, Node) - return node_class(graph, view, annotation) - - if __name__ == '__main__': diff --git a/mmif/utils/summarizer/nodes.py b/mmif/utils/summarizer/nodes.py new file mode 100644 index 00000000..53201022 --- /dev/null +++ b/mmif/utils/summarizer/nodes.py @@ -0,0 +1,370 @@ +import json + +from typing import Any + +from mmif.utils.summarizer import config + + + +class Node(object): + + def __init__(self, graph, view, annotation): + self.graph = graph + self.view = view + self.view_id = None if self.view is None else self.view.id + self.annotation = annotation + # copy some information from the Annotation + self.at_type = annotation.at_type + self.identifier = annotation.id + self.properties = json.loads(str(annotation.properties)) + # get the document from the view or the properties + self.document = self._get_document() + # The targets property contains a list of annotations or documents that + # the node content points to. This includes the document the annotation + # points to as well as the alignment from a token or text document to a + # bounding box or time frame (which is added later). + # TODO: the above does not seem to be true since there is no evidence of + # data from alignments being added. 
+ self.targets = [] if self.document is None else [self.document] + self.anchors = {} + self.add_local_anchors() + self.add_anchors_from_targets() + + def __str__(self): + anchor = '' + if self.at_type.shortname == config.TOKEN: + anchor = " %s:%s '%s'" % (self.properties['start'], + self.properties['end'], + self.properties.get('text','').replace('\n', '\\n')) + return "<%s %s%s>" % (self.at_type.shortname, self.identifier, anchor) + + def add_local_anchors(self): + """Get the anchors that you can get from the annotation itself, which + includes the start and end offsets, the coordinates, the timePoint of + a BoundingBox and any annotation with targets.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'start' in props and 'end' in props: + # TimeFrame is the only non-character based interval so this simple + # if-then-else should work + if attype == config.TIME_FRAME: + self.anchors['text-offsets'] = (props['start'], props['end']) + else: + self.anchors['time-offsets'] = (props['start'], props['end']) + if 'coordinates' in props: + self.anchors['coordinates'] = props['coordinates'] + if 'timePoint' in props: + self.anchors['time-point'] = props['timePoint'] + if 'targets' in props: + self.anchors['targets'] = props['targets'] + + def add_anchors_from_targets(self): + """Get start and end offsets or timePoints from the targets and add them to + the anchors, but only if there were no anchors on the node already. 
This has + two cases: one for TimeFrames and one for text intervals.""" + props = self.properties + attype = self.annotation.at_type.shortname + if 'targets' in props: + try: + t1 = self.graph.nodes[props['targets'][0]] + t2 = self.graph.nodes[props['targets'][-1]] + if attype == config.TIME_FRAME: + if not 'time-offsets' in props: + self.anchors['time-offsets'] = ( + t1.properties['timePoint'], t2.properties['timePoint']) + else: + if not 'text-offsets' in props: + self.anchors['text-offsets'] = ( + t1.properties['start'], t2.properties['end']) + except IndexError: + print(f'WARNING: Unexpected empty target list for {self.identifier}') + + def add_anchors_from_alignment(self, target: Any, debug=False): + if target is None: + return + source_attype = self.at_type.shortname + target_attype = target.at_type.shortname + if debug: + print('\n@ DEBUG SOURCE->TARGET ', source_attype, target_attype) + print('@ DEBUG SOURCE.PROPS ', list(self.properties.keys())) + print('@ DEBUG TARGET.PROPS ', list(target.properties.keys())) + print('@ DEBUG TARGET.ANCHORS ', target.anchors) + # If a TextDocument is aligned to a BoundingBox then we grab the coordinates + # TODO: how are we getting the time point? + if source_attype == 'TextDocument' and target_attype == 'BoundingBox': + if 'coordinates' in target.properties: + self.anchors['coordinates'] = target.properties['coordinates'] + #print(source_attype, self.anchors) + elif source_attype == 'BoundingBox' and target_attype == 'TextDocument': + pass + # If a TextDocument is aligned to a TimeFrame then we copy time anchors + # but also targets and representatives, the latter because some alignments + # are not precise + elif source_attype == 'TextDocument' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + if 'time-offsets' in target.anchors: + # TODO: is this ever used? 
+ self.anchors['time-offsets'] = target.anchors['time-offsets'] + if 'targets' in target.properties: + self.anchors['targets'] = target.properties['targets'] + if 'representatives' in target.properties: + self.anchors['representatives'] = target.properties['representatives'] + #print('-', source_attype, self.anchors, self, target) + elif source_attype == 'TimeFrame' and target_attype == 'TextDocument': + pass + # Simply copy the time point + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + self.anchors['time-point'] = target.anchors['time-point'] + if debug: + print('+ ADDED SOURCE.ANCHORS ', self.anchors) + # For Token-TimeFrame alignments all we need are the start and end time points + elif source_attype == 'Token' and target_attype == 'TimeFrame': + if 'start' in target.properties and 'end' in target.properties: + self.anchors['time-offsets'] = (target.properties['start'], + target.properties['end']) + #print(source_attype, self.anchors) + elif source_attype == 'TimeFrame' and target_attype == 'Token': + pass + # TODO: check whether some action is needed for the next options + elif source_attype == 'TextDocument' and target_attype == 'VideoDocument': + pass + elif source_attype == 'VideoDocument' and target_attype == 'TextDocument': + pass + elif source_attype == 'BoundingBox' and target_attype == 'TimePoint': + pass + elif source_attype =='TimePoint' and target_attype == 'BoundingBox': + pass + elif source_attype == 'BoundingBox' and target_attype in ('Token', 'Sentence', 'Paragraph'): + pass + elif source_attype in ('Token', 'Sentence', 'Paragraph') and target_attype == 'BoundingBox': + pass + elif source_attype == 'TextDocument' and target_attype == 'TimePoint': + pass + elif source_attype == 'TimePoint' and target_attype == 'TextDocument': + pass + else: + print('-', source_attype, target_attype) + #if debug: + # print('DEBUG', self.anchors) + + def _get_document(self): + """Return the document or annotation node that the 
annotation/document in
+        the node refers to via the document property. This could be a local property
+        or a metadata property if there is no such local property. Return None
+        if neither of those exist."""
+        # try the local property
+        docid = self.properties.get('document')
+        if docid is not None:
+            # print('>>>', docid, self.graph.get_node(docid))
+            return self.graph.get_node(docid)
+        # try the metadata property
+        if self.view is not None:
+            try:
+                metadata = self.view.metadata.contains[self.at_type]
+                docid = metadata['document']
+                return self.graph.get_node(docid)
+            except KeyError:
+                return None
+        return None
+
+    def summary(self):
+        """The default summary is just the identifier, this should typically be
+        overridden by sub classes."""
+        return { 'id': self.identifier }
+
+    def has_label(self):
+        """Only TimeFrameNodes can have labels so this returns False."""
+        return False
+
+    def pp(self, close=True):
+        print('-' * 80)
+        print(self)
+        print(f'    document = {self.document}')
+        for prop in self.properties:
+            print(f'    {prop} = {self.properties[prop]}')
+        print('    targets = ')
+        for target in self.targets:
+            print('       ', target)
+        print('    anchors = ')
+        for anchor in self.anchors:
+            print(f'    {anchor} -> {self.anchors[anchor]}')
+        if close:
+            print('-' * 80)
+
+
+class TimeFrameNode(Node):
+
+    def __str__(self):
+        frame_type = ' ' + self.frame_type() if self.has_label() else ''
+        return (''
+                % (self.identifier, self.start(), self.end(), frame_type))
+
+    def start(self):
+        return self.properties.get('start', -1)
+
+    def end(self):
+        return self.properties.get('end', -1)
+
+    def frame_type(self):
+        # TODO: rename this, uses old property since replaced by "label""
+        # NOTE: this is still allowing for the old property though
+        return self.properties.get('label') or self.properties.get('frameType')
+
+    def has_label(self):
+        return self.frame_type() is not None
+
+    def representatives(self) -> list:
+        """Return a list of the representative TimePoints."""
+        # TODO: why 
could I not get this from the anchors? + rep_ids = self.properties.get('representatives', []) + reps = [self.graph.get_node(rep_id) for rep_id in rep_ids] + return reps + + def summary(self): + """The summary of a time frame just contains the identifier, start, end + and frame type.""" + return { 'id': self.identifier, + 'start': self.properties['start'], + 'end': self.properties['end'], + 'frameType': self.properties.get('frameType') } + + +class EntityNode(Node): + + def __init__(self, graph, view, annotation): + super().__init__(graph, view, annotation) + self.tokens = [] + self._paths = None + self._anchor = None + + def __str__(self): + try: + start = self.properties['start'] + end = self.properties['end'] + except KeyError: + start, end = self.anchors['text-offsets'] + return ("" + % (self.identifier, start, end, self.properties['text'])) + + def start_in_video(self): + #print('+++', self.document.properties) + try: + return self.document.anchors['time-point'] + except KeyError: + return -1 + #return self.anchor()['video-start'] + + def end_in_video(self): + return self.anchor().get('video-end') + + ''' + Commented this out because the type checking in the code coverage tests requires + the default vaue for the close parameter to be the same as on Node.pp(). 
+ + def pp(self, close=False): + super().pp(close=close) + try: + for i, p in enumerate(self.paths_to_docs()): + print(' %s' % ' '.join([str(n) for n in p[1:]])) + except ValueError: + print(' WARNING: error in path_to_docs in NamedEntityNode.pp()') + print('-' * 80) + ''' + + def summary(self): + """The summary for entities needs to include where in the video or image + the entity occurs, it is not enough to just give the text document.""" + # TODO: in the old days this used an anchor() method which was fragile + # TODO: revamping it now + return { + 'id': self.identifier, + 'group': self.properties['group'], + 'cat': self.properties['category'], + 'document': self.document.identifier, + # Entities in a TextDocument that is a full transcript without any + # alignments do not have a TimePoint + #'time-point': self.document.anchors.get('time-point'), + #'text-offsets': self.anchors.get('text-offsets'), + 'time-point': self.document.anchors.get('time-point', -1), + 'text-offsets': self.anchors.get('text-offsets', (-1 ,-1)), + #'document': self._get_document_plus_span(), + #'video-start': anchor.get('video-start'), + #'video-end': anchor.get('video-end'), + #'coordinates': self._coordinates_as_string(anchor) + } + + def anchor(self) -> dict: + """The anchor is the position in the video that the entity is linked to. + This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: deal with the case where the primary document is not a video + self.paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in paths: + # print('... 
[') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + return {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + return {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + else: + return {} + + def anchor2(self): + """The anchor is the position in the video that the entity is linked to. + This anchor cannot be found in the document property because that points + to a text document that was somehow derived from the video document. Some + graph traversal is needed to get the anchor, but we know that the anchor + is always a time frame or a bounding box. + """ + # TODO: with this version you get an error that the paths variable does + # not exist yet, must get a clearer picture on how to build a graph + # where nodes have paths to anchors + # TODO: deal with the case where the primary document is not a video + if self._anchor is None: + self._paths = self.paths_to_docs() + bbtf = self.find_boundingbox_or_timeframe() + # for path in self._paths: + # print('... [') + # for n in path: print(' ', n) + # print('===', bbtf) + if bbtf.at_type.shortname == config.BOUNDING_BOX: + self._anchor = {'video-start': bbtf.properties['timePoint'], + 'coordinates': bbtf.properties['coordinates']} + elif bbtf.at_type.shortname == config.TIME_FRAME: + self._anchor = {'video-start': bbtf.properties['start'], + 'video-end': bbtf.properties['end']} + return self._anchor + + def find_boundingbox_or_timeframe(self): + return self.paths[-1][-2] + + @staticmethod + def _coordinates_as_string(anchor): + if 'coordinates' not in anchor: + return None + return ','.join(["%s:%s" % (pair[0], pair[1]) + for pair in anchor['coordinates']]) + + +class Nodes(object): + + """Factory class for Node creation. 
Use Node for creation unless a special + class was registered for the kind of annotation we have.""" + + node_classes = { config.NAMED_ENTITY: EntityNode, + config.TIME_FRAME: TimeFrameNode } + + @classmethod + def new(cls, graph, view, annotation): + node_class = cls.node_classes.get(annotation.at_type.shortname, Node) + return node_class(graph, view, annotation) + diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py index b340f35b..a5c9bb07 100644 --- a/mmif/utils/summarizer/summary.py +++ b/mmif/utils/summarizer/summary.py @@ -81,11 +81,11 @@ from mmif.serialize import Mmif from mmif.vocabulary import DocumentTypes +from mmif.utils.summarizer import config from mmif.utils.summarizer.utils import CharacterList from mmif.utils.summarizer.utils import get_aligned_tokens, timestamp from mmif.utils.summarizer.utils import get_transcript_view, get_last_segmenter_view, get_captions_view from mmif.utils.summarizer.graph import Graph -from mmif.utils.summarizer import config VERSION = '0.2.0' @@ -150,23 +150,19 @@ def validate(self): def video_documents(self): return self.mmif.get_documents_by_type(DocumentTypes.VideoDocument) - def report(self, outfile=None, html=None, full=False, timeframes=False, - transcript=False, captions=False, entities=False): + def report(self, outfile=None): json_obj = { 'mmif_version': self.mmif.metadata.mmif, 'document': self.document.data, 'documents': self.documents.data, 'annotations': self.annotations.data, - 'views': self.views.data} - if transcript or full: - json_obj['transcript'] = self.transcript.data - if captions or full: - json_obj['captions'] = self.captions.as_json() - if timeframes or full: - json_obj['timeframes'] = self.timeframes.as_json() - json_obj['timeframe_stats'] = self.timeframe_stats.data - if entities or full: - json_obj['entities'] = self.entities.as_json() + 'views': self.views.data, + 'transcript': self.transcript.data, + 'captions': self.captions.as_json(), + 'timeframes': 
self.timeframes.as_json(), + 'timeframe_stats': self.timeframe_stats.data, + 'entities': self.entities.as_json() + } report = json.dumps(json_obj, indent=2) if outfile is None: return report @@ -631,17 +627,17 @@ def __init__(self, summary): f'>>> PROPS {list(doc.properties.keys())}', f'>>> TEXT ' + text.replace("\n", "")[:100], f'>>> ANCHORS {doc.anchors}') - if 'time-offsets' in doc.anchors: + if 'time-offsets' in doc.anchors and 'representatives' in doc.anchors: # For older LLava-style captions # http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97 - p1, p2 = doc.anchors['time-offsets'] - if 'representatives' in doc.anchors: - tp_id = doc.anchors["representatives"][0] - tp = summary.graph.get_node(tp_id) - self.captions.append( - { 'identifier': doc.identifier, - 'time-point': tp.properties['timePoint'], - 'text': text }) + # NOTE: probably obsolete, at least the link above is dead + tp_id = doc.anchors["representatives"][0] + tp = summary.graph.get_node(tp_id) + if tp is not None: + self.captions.append( + { 'identifier': doc.identifier, + 'time-point': tp.properties['timePoint'], + 'text': text }) if 'time-point' in doc.anchors: # For newer SmolVLM-style captions # http://apps.clams.ai/smolvlm2-captioner From b3ad9730df25be3140a18666be29690074cb29ff Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Wed, 14 Jan 2026 08:33:32 -0500 Subject: [PATCH 10/48] Replaced DocumentTypes type hint with DocumentTypesBase --- mmif/serialize/mmif.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index c6fa4c62..e8d2b7b5 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -24,7 +24,7 @@ from mmif.serialize.annotation import Annotation, Document from mmif.serialize.model import MmifObject, DataList from mmif.serialize.view import View -from mmif.vocabulary import AnnotationTypes, DocumentTypes +from mmif.vocabulary import AnnotationTypes, DocumentTypes, DocumentTypesBase __all__ = 
['Mmif'] @@ -487,7 +487,7 @@ def get_documents_in_view(self, vid: Optional[str] = None) -> List[Document]: else: return [] - def get_documents_by_type(self, doc_type: Any) -> List[Document]: + def get_documents_by_type(self, doc_type: DocumentTypesBase) -> List[Document]: """ Method to get all documents where the type matches a particular document type, which should be one of the CLAMS document types. @@ -530,7 +530,7 @@ def get_documents_by_property(self, prop_key: str, prop_value: str) -> List[Docu docs.extend([document for document in self.documents if document[prop_key] == prop_value]) return docs - def get_documents_locations(self, m_type: Union[DocumentTypes, str], path_only=False) -> List[Union[str, None]]: + def get_documents_locations(self, m_type: Union[DocumentTypesBase, str], path_only=False) -> List[Union[str, None]]: """ This method returns the file paths of documents of given type. Only top-level documents have locations, so we only check them. @@ -545,7 +545,7 @@ def get_documents_locations(self, m_type: Union[DocumentTypes, str], path_only=F else: return [doc.location for doc in docs] - def get_document_location(self, m_type: Union[DocumentTypes, str], path_only=False) -> Optional[str]: + def get_document_location(self, m_type: Union[DocumentTypesBase, str], path_only=False) -> Optional[str]: """ Method to get the location of *first* document of given type. 
From 73da13e0883aa40e108cc82ac7dbd15c6d90adc1 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 00:08:13 +0100 Subject: [PATCH 11/48] cleanup imports Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mmif/serialize/mmif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index e8d2b7b5..68478c7f 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -24,7 +24,7 @@ from mmif.serialize.annotation import Annotation, Document from mmif.serialize.model import MmifObject, DataList from mmif.serialize.view import View -from mmif.vocabulary import AnnotationTypes, DocumentTypes, DocumentTypesBase +from mmif.vocabulary import AnnotationTypes, DocumentTypesBase __all__ = ['Mmif'] From 9bdbbab253e774f45227a39c894a884cd0eda4e9 Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Fri, 9 Jan 2026 13:51:53 -0500 Subject: [PATCH 12/48] Some documentation thoughts and updates. --- README.md | 20 ++++++---- documentation-notes.md | 61 ++++++++++++++++++++++++++++++ documentation/cli.rst | 8 ++-- documentation/conf.py | 62 ++++++++++++++++--------------- documentation/index.rst | 19 +++++----- documentation/introduction.rst | 22 ++++++----- documentation/plugins.rst | 10 ++--- documentation/target-versions.rst | 2 +- mmif/__init__.py | 15 ++++++++ 9 files changed, 153 insertions(+), 66 deletions(-) create mode 100644 documentation-notes.md diff --git a/README.md b/README.md index 1cd1070a..1d786ea1 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,25 @@ ## MultiMedia Interchange Format -[MMIF](https://mmif.clams.ai) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications in [CLAMS project](https://clams.ai). + +[MMIF](https://mmif.clams.ai) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the [CLAMS project](https://clams.ai). 
## mmif-python -`mmif-python` is a Python implementation of the MMIF data format. -`mmif-python` provides various helper classes and functions to handle MMIF JSON in Python, -including ; -1. de-/serialization of MMIF internal data structures to/from JSON +`mmif-python` is a Python implementation of the MMIF data format. It provides various helper classes and functions to handle MMIF JSON in Python, including: + +1. serialization and de-serialization of MMIF internal data structures to/from JSON 2. validation of MMIF JSON 3. handling of CLAMS vocabulary types -4. navigation of MMIF object via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type))`) +4. navigation of MMIF objects via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type))`) + ## For more ... + * [Version history and patch notes](https://github.com/clamsproject/mmif-python/blob/main/CHANGELOG.md) -* [MMIF Python API documentation](https://clamsproject.github.io/mmif-python) +* [MMIF Python API documentation](https://clamsproject.github.io/mmif-python/latest) * [MMIF JSON specification and schema](https://clamsproject.github.io/mmif) + ## For devs ... -* Build documentation: `python build-tools/docs.py --help` + +* To build the documentation: `python build-tools/docs.py --help` diff --git a/documentation-notes.md b/documentation-notes.md new file mode 100644 index 00000000..668ceb16 --- /dev/null +++ b/documentation-notes.md @@ -0,0 +1,61 @@ +# Documentation notes + +Various temporary notes on the documentation. Parts of this should maybe be added to [issue #348](https://github.com/clamsproject/mmif-python/issues/348) or to a more general issue on mmif-python documentation. + +Do not keep this file here forever. + +-- + +In the [346-summarizer](https://github.com/clamsproject/mmif-python/tree/346-summarizer) branch I added one line trying to generate API documentation for the sumarizer: + +```rest +.. 
toctree::
+    :maxdepth: 4
+
+    autodoc/mmif.serialize
+    autodoc/mmif.vocabulary
+    autodoc/mmif.utils
+    autodoc/mmif.utils.summarizer
+```
+
+However, it looks like this needs to be done elsewhere since after `make doc` no `mmif.utils.summarizer.html` file is added to `doct-test/develop/autodoc` and we get a warning that the TOC cannot add the module.
+
+Also note that this doesn't work for the mmif.utils.cli package either.
+
+--
+
+At the moment `documentation/index.rst` imports the top-level readme file. Should probably revisit that because the goal of that file is different from what we are doing here.
+
+Update: I removed the include and wrote a shorter intro, but there is already something along those lines in `documentation/introduction.rst` so there is still some smoothing to be done here.
+
+--
+
+In the summarizer branch there is a markdown file in the mmif.utils.summary package, that should maybe be added here as `documentation/creating-clis.rst`.
+
+--
+
+All the source links in the generated documentation are dead. I thought that maybe editing `documentation/conf.py` and changing the line
+
+```python
+html_show_sourcelink = True  # Furo handles this well, no need to hide
+```
+
+by setting the variable to False might work, but that was a wild guess and it did not work.
+
+--
+
+When reading the changes for a version from the changelog file some of the typesetting does not translate well, for example in version 1.2.1 we get the raw text for the note:
+
+```
+[!NOTE] mmif describe (and the underlying mmif.utils.workflow_helper) is still experimental and subject to change in future releases without notice. Backward compatibility is not guaranteed.
+```
+
+--
+
+Some changes already made (but not necessarily pushed up yet):
+
+- Fixing some types and minor style errors.
+- Some type setting changes.
+- Refactored the way the "what's new in section X" is generated.
+- Removed the Search Page link from the main page. 
It was leading nowhere and there is a perfectly fine search box on the left anyway. +- Updated python requirement. \ No newline at end of file diff --git a/documentation/cli.rst b/documentation/cli.rst index 8a2f6836..e2e48a31 100644 --- a/documentation/cli.rst +++ b/documentation/cli.rst @@ -1,15 +1,15 @@ .. _cli: -``mmif`` shell command -====================== +The ``mmif`` shell command +========================== ``mmif-python`` comes with a command line interface (CLI) that allows you to handle MMIF files. Many of these commands are designed to handle MMIF files in the context of CLAMS workflows. -The CLI is installed as ``mmif`` shell command. To see the available commands, run +The CLI is installed as the ``mmif`` shell command. Run the following to see the available commands or the MMIF version: .. code-block:: bash - mmif --help + mmif (--help | --version) The following documentation is automatically generated from the CLI help messages. diff --git a/documentation/conf.py b/documentation/conf.py index 2b8a027d..3f5c0e0f 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -6,6 +6,7 @@ import inspect import textwrap import os +import re import sys from pathlib import Path @@ -64,7 +65,6 @@ "source_repository": "https://github.com/clamsproject/mmif-python", "source_branch": "main", # Default branch for "Edit on GitHub" links "source_directory": "documentation/", - # CLAMS brand colors "light_css_variables": { "color-brand-primary": "#008AFF", @@ -150,27 +150,24 @@ def update_target_versions(app): def generate_cli_rst(app): - from mmif import prep_argparser_and_subcmds, find_all_modules - - # Generate main help - os.environ['COLUMNS'] = '100' - parser, subparsers = prep_argparser_and_subcmds() - help_text = parser.format_help() content = [] - content.append('Main Command\n') - content.append('------------\n\n') - content.append('.. 
code-block:: text\n\n') - content.append(textwrap.indent(help_text, ' ')) - content.append('\n\n') - + # Generate main help + # os.environ['COLUMNS'] = '100' + # parser = mmif.prep_argparser_for_documentation() + # help_text = parser.format_help() + # content.append('Main Command\n') + # content.append('------------\n\n') + # content.append('.. code-block:: text\n\n') + # content.append(textwrap.indent(help_text, ' ')) + # content.append('\n\n') + # Generate subcommand help - for cli_module in find_all_modules('mmif.utils.cli'): + for cli_module in mmif.find_all_modules('mmif.utils.cli'): cli_module_name = cli_module.__name__.rsplit('.')[-1] subparser = cli_module.prep_argparser(prog=f'mmif {cli_module_name}') sub_help = subparser.format_help() - content.append(f'{cli_module_name}\n') content.append('-' * len(cli_module_name) + '\n\n') content.append('.. code-block:: text\n\n') @@ -182,6 +179,11 @@ def generate_cli_rst(app): def generate_whatsnew_rst(app): + """ + Create the documentation/whatsnew.md file by pulling out the changes for the + current version from the changelog file. + """ + changelog_path = proj_root_dir / 'CHANGELOG.md' output_path = proj_root_dir / 'documentation' / 'whatsnew.md' if not changelog_path.exists(): @@ -190,18 +192,15 @@ def generate_whatsnew_rst(app): f.write("") return - import re - content = [] found_version = False version_header_re = re.compile(r'^## releasing\s+([^\s]+)\s*(\(.*\))?') print(f"DEBUG: Looking for version '{version}' in CHANGELOG.md") - with open(changelog_path, 'r') as f: + with changelog_path.open() as f: lines = f.readlines() - - for line in lines: + for n, line in enumerate(lines): match = version_header_re.match(line) if match: header_version = match.group(1) @@ -211,18 +210,23 @@ def generate_whatsnew_rst(app): continue elif found_version: break - if found_version: + # Make the headers from the changelog mesh in properly with the headers + # in the documentation. 
+ if line.startswith('###'): + line = '#' + line content.append(line) - if not found_version: - print(f"NOTE: No changelog entry found for version {version}") - with open(output_path, 'w') as f: - f.write("") - else: - # Dump matched markdown content directly to whatsnew.md - with open(output_path, 'w') as f: - f.write(f"## What's New in {version}\n\n(Full changelog available in the [CHANGELOG.md]({blob_base_url}/main/CHANGELOG.md))\n") + with open(output_path, 'w') as f: + f.write(f"### What's New in {version}\n\n") + f.write( + "The full changelog is available in [CHANGELOG.md]" + f"({blob_base_url}/main/CHANGELOG.md).\n\n") + if not found_version: + print(f"NOTE: No changelog entry found for this version\n\n") + f.write("There are no changelog entries for this version\n\n") + else: + # Dump matched markdown content directly to whatsnew.md f.writelines(content) diff --git a/documentation/index.rst b/documentation/index.rst index ddbf0691..129eec3d 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -1,17 +1,17 @@ -Welcome to mmif-python's documentation! -======================================= +mmif-python +=========== -.. mdinclude:: ../README.md - ----- +This is the documentation for the mmif-python package, a Python implementation for the MMIF data format. MMIF (MultiMedia Interchange Format) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the CLAMS project. For descriptions of the CLAMS project and the MMIF format see https://clams.ai and https://mmif.clams.ai. The GitHub repository for the package is at https://github.com/clamsproject/mmif-python. .. mdinclude:: whatsnew.md ----- + +Contents +^^^^^^^^ .. toctree:: :maxdepth: 2 - :caption: Contents + :caption: General documentation introduction cli @@ -25,9 +25,8 @@ Welcome to mmif-python's documentation! 
modules -Indices and tables -================== +Indices +^^^^^^^ * :ref:`genindex` * :ref:`modindex` -* :ref:`search` diff --git a/documentation/introduction.rst b/documentation/introduction.rst index 95508f3c..acc49015 100644 --- a/documentation/introduction.rst +++ b/documentation/introduction.rst @@ -11,10 +11,12 @@ MultiMedia Interchange Format (MMIF) is a JSON(-LD)-based data format designed f This documentation focuses on Python implementation of the MMIF. To learn more about the data format specification, please visit the `MMIF website `_. ``mmif-python`` is a public, open source implementation of the MMIF data format. ``mmif-python`` supports serialization/deserialization of MMIF objects from/to Python objects, as well as many navigation and manipulation helpers for MMIF objects. + Prerequisites ------------- -* `Python `_: the latest ``mmif-python`` requires Python 3.8 or newer. We have no plan to support `Python 2.7 `_. +* `Python `_: the latest ``mmif-python`` requires Python 3.10 or newer. + Installation --------------- @@ -25,20 +27,20 @@ Package ``mmif-python`` is distributed via the official PyPI. Users are supposed pip install mmif-python -This will install a package `mmif` to local python. +This will install a package `mmif` to your local python library. The MMIF format and specification is evolving over time, and ``mmif-python`` package will be updated along with the changes in MMIF format. -.. note:: MMIF format is not always backward-compatible. To find out more about relations between MMIF specification versions and ``mmif-python`` versions, please take time to read our decision on the subject `here `_. If you need to know which python SDK supports which specification version, see :ref:`target-versions` page. +.. note:: The MMIF format is not always backward-compatible. To find out more about relations between MMIF specification versions and ``mmif-python`` versions, please take time to read our decision on the subject `here `_. 
If you need to know which python SDK supports which specification version, see :ref:`target-versions` page. + MMIF Serialization --------------------------- -:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. For subcomponents of the MMIF (view objects, annotation objects, metadata for each object) are all subclass of :class:`mmif.serialize.model.MmifObject`, including the :class:`mmif.serialize.mmif.Mmif`. To start with an existing MMIF :class:`str`, simple initiate a new ``Mmif`` object with the file. +:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. Subcomponents of the MMIF object (views, annotation objects and metadata for each object)and the MMIF object itself are all subclasses of :class:`mmif.serialize.model.MmifObject`. To start with an existing MMIF :class:`str`, simply initiate a new ``Mmif`` object with that string. .. code-block:: python - import mmif from mmif import Mmif mmif_str = """{ @@ -64,13 +66,14 @@ MMIF Serialization } ], "views": []}""" + mmif_obj = Mmif(mmif_str) Few notes; -#. MMIF does not carry the primary source files in it. -#. MMIF encode the specification version at the top. As not all MMIF versions are backward-compatible, a version ``mmif-python`` implementation of the MMIF might not be able to load an unsupported version of MMIF string. +#. MMIF objects do not carry the primary source files in it (although there are exceptions for text documents). +#. MMIF objects specify the MMIF version at the top. As not all MMIF versions are backward-compatible, a version of the ``mmif-python`` implementation might not be able to load an unsupported MMIF versions. When serializing back to :class:`str`, call :meth:`mmif.serialize.model.MmifObject.serialize` on the object. @@ -81,12 +84,13 @@ To get subcomponents, you can use various getters implemented in subclasses. 
For from mmif.vocabulary.document_types import DocumentTypes for video in mmif_obj.Mmif.get_documents_by_type(DocumentTypes.VideoDocument): - with open(video.location_path(), 'b') as in_video: - # do something with the video file + with open(video.location_path(), 'b') as in_video: + # do something with the video file For a full list of available helper methods, please refer to :ref:`the API documentation `. + MMIF usage in CLAMS Workflows ----------------------------- diff --git a/documentation/plugins.rst b/documentation/plugins.rst index 1af39426..55443aa1 100644 --- a/documentation/plugins.rst +++ b/documentation/plugins.rst @@ -1,7 +1,7 @@ .. _plugins: -Developing plugins for MMIF Python SDK -====================================== +Developing plugins for the MMIF Python SDK +========================================== Overview @@ -50,7 +50,7 @@ Here's a minimal example codebase that you refer to when you develop a ``docloc` ├── pyproject.toml └── setup.cfg - $ cat pyproject.toml + $ cat pyproject.toml [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" @@ -85,5 +85,5 @@ And the plugin code. Bulit-in Document Location Scheme Plugins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -At the moment, ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. -Take a look at :mod:`mmif_docloc_http` module for details. +At the moment, the ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. +Take a look at the :mod:`mmif_docloc_http` module for details. diff --git a/documentation/target-versions.rst b/documentation/target-versions.rst index 34216d3e..9d89717d 100644 --- a/documentation/target-versions.rst +++ b/documentation/target-versions.rst @@ -3,7 +3,7 @@ Target MMIF Versions ==================== -This article provides targeting MMIF specification versions of different versions of ``mmif-python`` SDK. 
+This page lists targeting MMIF specification versions for different versions of the ``mmif-python`` SDK. .. csv-table:: Target Specification Versions :file: target-versions.csv diff --git a/mmif/__init__.py b/mmif/__init__.py index 6fde82fe..18d10c14 100644 --- a/mmif/__init__.py +++ b/mmif/__init__.py @@ -33,6 +33,21 @@ def find_all_modules(pkgname): yield importlib.import_module(module) +def prep_argparser_for_documentation(): + """This is specifically for when building the CLI documentation with sphinxs-build, + which without the prog parameter would otherwise use sphinxs-build when printing + the command name.""" + # TODO: this does not add the subcommands, which was faulty anyway because the + # original code left us with an empty list + parser = argparse.ArgumentParser(prog='mmif') + parser.add_argument( + '-v', '--version', + action='version', + version=version_template.format(__version__, __specver__) + ) + return parser + + def prep_argparser_and_subcmds(): parser = argparse.ArgumentParser() parser.add_argument( From d803abca8ea595495926d9e295d6a8f9f9435567 Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Wed, 14 Jan 2026 05:22:00 -0500 Subject: [PATCH 13/48] Added CLI module to documentation --- documentation/autodoc/mmif.utils.cli.rst | 32 ++++++++++++++++++++++++ documentation/index.rst | 2 +- documentation/modules.rst | 2 ++ mmif/utils/cli/describe.py | 12 +++++++-- 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 documentation/autodoc/mmif.utils.cli.rst diff --git a/documentation/autodoc/mmif.utils.cli.rst b/documentation/autodoc/mmif.utils.cli.rst new file mode 100644 index 00000000..894e38e1 --- /dev/null +++ b/documentation/autodoc/mmif.utils.cli.rst @@ -0,0 +1,32 @@ +mmif.utils.cli package +====================== + +Package containing CLI modules. + +Submodules +---------- + +``describe`` module +------------------- + +.. 
automodule:: mmif.utils.cli.describe + :members: + :undoc-members: + :show-inheritance: + +``rewind`` module +----------------- + +.. automodule:: mmif.utils.cli.rewind + :members: + :undoc-members: + :show-inheritance: + +``source`` module +----------------- + +.. automodule:: mmif.utils.cli.source + :members: + :undoc-members: + :show-inheritance: + diff --git a/documentation/index.rst b/documentation/index.rst index 129eec3d..e40f03ca 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -1,7 +1,7 @@ mmif-python =========== -This is the documentation for the mmif-python package, a Python implementation for the MMIF data format. MMIF (MultiMedia Interchange Format) is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the CLAMS project. For descriptions of the CLAMS project and the MMIF format see https://clams.ai and https://mmif.clams.ai. The GitHub repository for the package is at https://github.com/clamsproject/mmif-python. +This is the documentation for the mmif-python package, a Python implementation for the MultiMedia Interchange Format (MMIF). MMIF is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the CLAMS project. For descriptions of the CLAMS project and the MMIF format see https://clams.ai and https://mmif.clams.ai. The GitHub repository for the package is at https://github.com/clamsproject/mmif-python. .. 
mdinclude:: whatsnew.md diff --git a/documentation/modules.rst b/documentation/modules.rst index e32ade5d..9db714db 100644 --- a/documentation/modules.rst +++ b/documentation/modules.rst @@ -9,8 +9,10 @@ mmif package autodoc/mmif.serialize autodoc/mmif.vocabulary autodoc/mmif.utils + autodoc/mmif.utils.cli autodoc/mmif.utils.summarizer + mmif_docloc_http package ======================== diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index eaf35856..f32eaf85 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -32,9 +32,17 @@ def describe_argparser(): `clams --help`, respectively. """ oneliner = ( - 'provides CLI to describe the workflow specification from a MMIF ' + 'Describe the workflow specification from a MMIF ' 'file or a collection of MMIF files.' ) + return oneliner, oneliner + + +def describe_rst(): + """ + Returns a restructured text string meant to be used in some sphinxs-generated + documention. + """ # get and clean docstrings single_doc = describe_single_mmif.__doc__.split(':param')[0] @@ -56,7 +64,7 @@ def describe_argparser(): =============================== {collection_doc} """) - return oneliner, oneliner + '\n\n' + additional.strip() + return additional.strip() def prep_argparser(**kwargs): From 86578c788dbeca873ca637a5b49d27a198378421 Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Wed, 21 Jan 2026 19:27:55 -0500 Subject: [PATCH 14/48] Various documentation updates, including for the summarizer --- .gitignore | 2 +- documentation-notes.md | 2 + documentation/autodoc/mmif.utils.cli.rst | 8 ++ .../autodoc/mmif.utils.summarizer.rst | 44 +++++++ documentation/cli.rst | 2 +- documentation/index.rst | 1 + documentation/modules.rst | 1 - documentation/summarizer.rst | 32 ++++++ mmif/utils/cli/describe.py | 4 +- mmif/utils/cli/summarize.py | 8 +- mmif/utils/summarizer/__init__.py | 46 +++++--- mmif/utils/summarizer/graph.py | 49 ++++++-- mmif/utils/summarizer/summary.py | 108 ++++-------------- 
mmif/utils/summarizer/utils.py | 4 +- 14 files changed, 189 insertions(+), 122 deletions(-) create mode 100644 documentation/autodoc/mmif.utils.summarizer.rst create mode 100644 documentation/summarizer.rst diff --git a/.gitignore b/.gitignore index f937b437..c9c81e78 100644 --- a/.gitignore +++ b/.gitignore @@ -80,7 +80,7 @@ VERSION # Documentation build artifacts documentation/cli_help.rst -documentation/whatsnew.rst +documentation/whatsnew.md docs-test # environments diff --git a/documentation-notes.md b/documentation-notes.md index 668ceb16..c0098043 100644 --- a/documentation-notes.md +++ b/documentation-notes.md @@ -42,6 +42,8 @@ html_show_sourcelink = True # Furo handles this well, no need to hide by setting the variable to False might work, but that was a wild guess and it did not work. +On second thought, this is probably because the source links go to pages that do not exist yet. + -- When reading the changes for a version frm the changelog file some of the typesetting does not translate well, for example in version 1.2.1 we get the raw text for the note: diff --git a/documentation/autodoc/mmif.utils.cli.rst b/documentation/autodoc/mmif.utils.cli.rst index 894e38e1..2d0aa521 100644 --- a/documentation/autodoc/mmif.utils.cli.rst +++ b/documentation/autodoc/mmif.utils.cli.rst @@ -30,3 +30,11 @@ Submodules :undoc-members: :show-inheritance: +``summarize`` module +-------------------- + +.. automodule:: mmif.utils.cli.summarize + :members: + :undoc-members: + :show-inheritance: + diff --git a/documentation/autodoc/mmif.utils.summarizer.rst b/documentation/autodoc/mmif.utils.summarizer.rst new file mode 100644 index 00000000..299322c1 --- /dev/null +++ b/documentation/autodoc/mmif.utils.summarizer.rst @@ -0,0 +1,44 @@ +mmif.utils.summarizer package +============================= + +Package containing the code to generate a summary from a MMIF file. + + +Submodules +---------- + + +``graph`` module +---------------- + +.. 
automodule:: mmif.utils.summarizer.graph + :members: + :undoc-members: + :show-inheritance: + + +``nodes`` module +---------------- + +.. automodule:: mmif.utils.summarizer.nodes + :members: + :undoc-members: + :show-inheritance: + + +``summary`` module +------------------ + +.. automodule:: mmif.utils.summarizer.summary + :members: + :undoc-members: + :show-inheritance: + + +``utils`` module +---------------- + +.. automodule:: mmif.utils.summarizer.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/documentation/cli.rst b/documentation/cli.rst index e2e48a31..d8fa33a2 100644 --- a/documentation/cli.rst +++ b/documentation/cli.rst @@ -5,7 +5,7 @@ The ``mmif`` shell command ``mmif-python`` comes with a command line interface (CLI) that allows you to handle MMIF files. Many of these commands are designed to handle MMIF files in the context of CLAMS workflows. -The CLI is installed as the ``mmif`` shell command. Run the following to see the available commands or the MMIF version: +The CLI scripts are installed as subcommands of the ``mmif`` shell command. Run the following to see the available commands or the MMIF version: .. code-block:: bash diff --git a/documentation/index.rst b/documentation/index.rst index e40f03ca..f1d270a7 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -15,6 +15,7 @@ Contents introduction cli + summarizer plugins target-versions diff --git a/documentation/modules.rst b/documentation/modules.rst index 9db714db..22838e39 100644 --- a/documentation/modules.rst +++ b/documentation/modules.rst @@ -12,7 +12,6 @@ mmif package autodoc/mmif.utils.cli autodoc/mmif.utils.summarizer - mmif_docloc_http package ======================== diff --git a/documentation/summarizer.rst b/documentation/summarizer.rst new file mode 100644 index 00000000..dacdffaa --- /dev/null +++ b/documentation/summarizer.rst @@ -0,0 +1,32 @@ +.. 
_summarizer: + + +MMIF Summarizer +=============== + +The Summarizer is a MMIF consumer that creates a JSON summary from a MMIF file. It +makes some simplifying assumptions, including: + +- There is one video in the MMIF documents list. All start and end properties + are pointing to that video. +- The time unit is assumed to be milliseconds. + + +The summarizer is accessable via the ``mmif`` command line script. To run the +summarizer over a MMIF file and write the JSON summary to OUTFILE: + +.. code-block:: bash + + mmif summarize -i INFILE -o OUTFILE + +In all cases, the summarizer summarizes only the information that is there, it +does not fix any mistakes and in general it does not add any information that is +not explicitly or implicitly in the MMIF file. In rare cases some information is +added, for example if an ASR tool does not group tokens in sentence-like objects +then the summarizer will do that, but then only by creating token groups of the +same length. + +The summary includes the MMIF version, the list of documents, a summary of the +metadata of all views (identifier, CLAMS app, timestamp, total number of +annotations and number of annotations per type, it does not show parameters and +application configuration), time frames, transcript, captions and entities. \ No newline at end of file diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index f32eaf85..bbaac4c0 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -99,9 +99,9 @@ def main(args): Main entry point for the describe CLI command. 
Reads a MMIF file and outputs a JSON summary containing: + - workflow_id: unique identifier for the source and app sequence - - stats: view counts, annotation counts (total/per-view/per-type), - and lists of error/warning/empty view IDs + - stats: view counts, annotation counts (total/per-view/per-type), and lists of error/warning/empty view IDs - views: map of view IDs to app configurations and profiling data :param args: Parsed command-line arguments diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index 06c1afae..29b94a60 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -17,6 +17,9 @@ def describe_argparser() -> tuple: def prep_argparser(**kwargs): + """ + Create the ArgumentParser instance for the summarizer. + """ parser = argparse.ArgumentParser( description=describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, @@ -26,6 +29,9 @@ def prep_argparser(**kwargs): return parser -def main(args): +def main(args: argparse.Namespace): + """ + The main summarizer command. + """ mmif_summary = Summary(args.i) mmif_summary.report(outfile=args.o) diff --git a/mmif/utils/summarizer/__init__.py b/mmif/utils/summarizer/__init__.py index 1122d449..c1afbb78 100644 --- a/mmif/utils/summarizer/__init__.py +++ b/mmif/utils/summarizer/__init__.py @@ -1,3 +1,33 @@ +""" + +MMIF consumer that creates a JSON summary from a MMIF file. + +Makes some simplifying assumptions, including: + +- There is one video in the MMIF documents list. All start and end properties + are pointing to that video. +- The time unit is assumed to be milliseconds. + +USAGE: + + $ mmif summarize -i INFILE -o OUTFILE + + Run the summarizer over a MMIF file and write the JSON summary to OUTFILE. + +In all cases, the summarizer summarizes the information that is there, it does +not fix any mistakes and in general it does not add any information that is not +explicitly or implicitly in the MMIF file. 
In rare cases some information is +added, for example if an ASR tool does not group tokens in sentence-like objects +then the summarizer will do that, but then only by creating token groups of the +same length. + +The summary includes the MMIF version, the list of documents, a summary of the +metadata of all views (identifier, CLAMS app, timestamp, total number of +annotations and number of annotations per type, it does not show parameters and +application configuration), time frames, transcript, captions and entities. + +""" + import argparse @@ -22,19 +52,3 @@ def main(): #pp_args(args) mmif_summary = Summary(args.i) mmif_summary.report(outfile=args.o) - - -""" -There used to be an option to process a whole directory, but I never used it and decided -that if needed it would better be done by an extra script or a separate function. - -The code for when there was a -d option is here just in case. - -if args.d: - for mmif_file in pathlib.Path(args.d).iterdir(): - if mmif_file.is_file() and mmif_file.name.endswith('.mmif'): - print(mmif_file) - json_file = str(mmif_file)[:-4] + 'json' - mmif_summary = Summary(mmif_file.read_text()) - mmif_summary.report(outfile=json_file) -""" \ No newline at end of file diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py index 55c38ffd..a5ce3edd 100644 --- a/mmif/utils/summarizer/graph.py +++ b/mmif/utils/summarizer/graph.py @@ -14,13 +14,22 @@ class Graph(object): - """Graph implementation for a MMIF document. Each node contains an annotation + """ + Graph implementation for a MMIF document. Each node contains an annotation or document. Alignments are stored separately. Edges between nodes are created from the alignments and added to the Node.targets property. The first edge added to Node.targets is the document that the Node points to (if there is one). 
The goal for the graph is to store all useful annotation and to have simple ways - to trace nodes all the way up to the primary data.""" + to trace nodes all the way up to the primary data. + + :var mmif: the MMIF document that we are creating a graph for + :var documents: list of the top-level documents + :var nodes: dictionary of nodes, indexed on node identifier + :var alignments: list of pairs + :var token_idx: an instance of TokenIndex + + """ def __init__(self, mmif: Any): # TODO: the type hint should really be "MMif | str", but pytype did not @@ -89,7 +98,8 @@ def add_edge(self, view, alignment): source.add_anchors_from_alignment(target) target.add_anchors_from_alignment(source) - def get_node(self, node_id) -> Node | None: + def get_node(self, node_id: str) -> Node | None: + """Return the Node instance from the node index.""" return self.nodes.get(node_id) # def get_nodes(self, short_at_type: str, view_id : str = None): @@ -101,17 +111,25 @@ def get_nodes(self, short_at_type: str, view_id=None): if (node.at_type.shortname == short_at_type and (view_id is None or node.view.id == view_id))] - def statistics(self): + def statistics(self) -> defaultdict: + """ + Collect counts for node types in each view. + """ stats = defaultdict(int) for node in self.nodes.values(): stats[f'{str(node.view_id):4} {node.at_type.shortname}'] += 1 return stats def trim(self, start: int, end: int): - """Trim the graph and keep only those nodes that are included in the graph + """ + :meta private: + + Trim the graph and keep only those nodes that are included in the graph between two timepoints (both in milliseconds). This assumes that all nodes are anchored on the time in the audio or video stream. At the moment it - keeps all nodes that are not explicitly anchored.""" + keeps all nodes that are not explicitly anchored. Private for now because + it is still useless. 
+ """ remove = set() for node_id, node in self.nodes.items(): if 'time-point' in node.anchors: @@ -125,6 +143,9 @@ def trim(self, start: int, end: int): self.nodes = { node.identifier: node for node in new_nodes } def pp(self, fname=None, skip_timepoints=False): + """ + :meta private: + """ fh = sys.stdout if fname is None else open(fname, 'w') fh.write("%s\n" % self) for view in self.mmif.views: @@ -137,6 +158,9 @@ def pp(self, fname=None, skip_timepoints=False): fh.write(' --> [%s]\n' % ' '.join(targets)) def pp_statistics(self): + """ + :meta private: + """ stats = self.statistics() for at_type in sorted(stats): print(f'{at_type:20} {stats[at_type]:>5}') @@ -148,11 +172,14 @@ class TokenIndex(object): The tokens are indexed on the identifier on the TextDocument that they occur in and for each text document we have a list of pairs - {'v_4:td1': [ - ((0, 5), ), - ((5, 6), ), - ... - } + .. code-block:: python + + {'v_4:td1': [ + ((0, 5), ), + ((5, 6), ), + ... + } + """ # TODO: diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py index a5c9bb07..7dd19adf 100644 --- a/mmif/utils/summarizer/summary.py +++ b/mmif/utils/summarizer/summary.py @@ -1,73 +1,6 @@ -"""MMIF Summarizer - -MMIF consumer that creates a JSON summary from a MMIF file. - -Makes some simplifying assumptions, including: - -- There is one video in the MMIF documents list. All start and end properties - are pointing to that video. -- The time unit is assumed to be milliseconds. - -Other assumptions are listed with the options below. - - -USAGE: - - $ python summary.py [OPTIONS] - - Reads the MMIF file and creates a JSON summary file with the document list - and any requested extra information. - -Example: - - $ python summary -i input.mmif -o output.json --transcript - - Reads input.mmif and creates output.json with just transcript - information added to the documents list and the views. 
- -In all cases, the summarizer will summarize what is there and use the information -that is there, if the output of CLAMS is bad, then the results of the summarizer -will be bad (although it may hide a lot of the badness). In some rare cases some -information is added. For example if the ASR tool does not group tokens then the -summarizer will do that, but then only by simply grouping in equal chunks and not -trying to infer sentence-like groupings. - -The summary always includes the MMIF version, the list of documents and a summary -of the metadata of all views (identifier, CLAMS app, timestamp, total number of -annotations and number of annotations per type, it does not show parameters and -application configuration). - - -OPTIONS: - --i INFILE -o OUTFILE - -Run the summarizer over a single MMIF file and write the JSON summary to OUTFILE. - --- timeframes - -Shows basic information of all timeframes. This groups the timeframes according to -the apps it was found in. - ---transcript - -Shows the text from the transcript in pseudo sentences. - -The transcript is taken from the last non-warning ASR view, so only the last added -transcript will be summarized. It is assumed that Tokens in the view are ordered on -text occurrence. - ---captions - -Shows captions from the Llava captioner app. - ---entities - -Include entities from spaCy or other NER. - ---full +""" -Include all the above. +Main classes for the summarizer. """ @@ -107,15 +40,15 @@ class Summary(object): """Implements the summary of a MMIF file. 
- fname - name of the input mmif file - mmif - instance of mmif.serialize.Mmif - graph - instance of graph.Graph - documents - instance of Documents - views - instance of Views - transcript - instance of Transcript - timeframes - instance of TimeFrames - entities - instance of Entities - captions - instance of get_captions_view + :var fname: name of the input mmif file + :var mmif: instance of mmif.serialize.Mmif + :var graph: instance of graph.Graph + :var documents: instance of Documents + :var views: instance of Views + :var transcript: instance of Transcript + :var timeframes: instance of TimeFrames + :var entities: instance of Entities + :var captions: instance of Captions """ @@ -393,17 +326,16 @@ def as_json(self): class Nodes(object): - """Abstract class to store instances of subclasses of graph.Node. The + """ + Abstract class to store instances of subclasses of graph.Node. The initialization methods of subclasses of Nodes can guard what nodes will be allowed in, for example, as of July 2022 the TimeFrames class only allowed time frames that had a frame type (thereby blocking the many timeframes from Kaldi). - Instance variables: - - summary - an instance of Summary - graph - an instance of graph.Graph, taken from the summary - nodes - list of instances of subclasses of graph.Node + :var summary: an instance of Summary + :var graph: an instance of graph.Graph, taken from the summary + :var nodes: list of instances of subclasses of graph.Node """ @@ -533,11 +465,11 @@ def _collect_other_morsels(self): class Entities(Nodes): - """Collecting instances of graph.EntityNode. + """ + This class collects instances of graph.EntityNode. 
- nodes_idx - lists of instances of graph.EntityNode, indexed on entity text - { entity-string ==> list of graph.EntityNode } - bins - an instance of Bins + :var nodes_idx: maps entity texts to lists of instances of graph.EntityNode + :var bins: an instance of Bins """ diff --git a/mmif/utils/summarizer/utils.py b/mmif/utils/summarizer/utils.py index 61c3bc8b..897a3830 100644 --- a/mmif/utils/summarizer/utils.py +++ b/mmif/utils/summarizer/utils.py @@ -1,4 +1,6 @@ -"""Utility methods +""" + +Utility methods for the summarizer. """ From d39f9f023daf0d439d84356d9710cb638ec03d0f Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 09:48:54 -0500 Subject: [PATCH 15/48] reverting incorrect heading level control in documentation --- documentation/conf.py | 28 ++++++++++++---------------- documentation/index.rst | 4 ++-- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/documentation/conf.py b/documentation/conf.py index 3f5c0e0f..7bfbdb54 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -198,9 +198,10 @@ def generate_whatsnew_rst(app): print(f"DEBUG: Looking for version '{version}' in CHANGELOG.md") - with changelog_path.open() as f: + with open(changelog_path, 'r') as f: lines = f.readlines() - for n, line in enumerate(lines): + + for line in lines: match = version_header_re.match(line) if match: header_version = match.group(1) @@ -210,23 +211,18 @@ def generate_whatsnew_rst(app): continue elif found_version: break + if found_version: - # Make the headers from the changelog mesh in properly with the headers - # in the documentation. 
- if line.startswith('###'): - line = '#' + line content.append(line) - with open(output_path, 'w') as f: - f.write(f"### What's New in {version}\n\n") - f.write( - "The full changelog is available in [CHANGELOG.md]" - f"({blob_base_url}/main/CHANGELOG.md).\n\n") - if not found_version: - print(f"NOTE: No changelog entry found for this version\n\n") - f.write("There are no changelog entries for this version\n\n") - else: - # Dump matched markdown content directly to whatsnew.md + if not found_version: + print(f"NOTE: No changelog entry found for version {version}") + with open(output_path, 'w') as f: + f.write("") + else: + # Dump matched markdown content directly to whatsnew.md + with open(output_path, 'w') as f: + f.write(f"## What's New in {version}\n\n(Full changelog available in the [CHANGELOG.md]({blob_base_url}/main/CHANGELOG.md))\n") f.writelines(content) diff --git a/documentation/index.rst b/documentation/index.rst index f1d270a7..9535636b 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -7,7 +7,7 @@ This is the documentation for the mmif-python package, a Python implementation f Contents -^^^^^^^^ +-------- .. toctree:: :maxdepth: 2 @@ -27,7 +27,7 @@ Contents Indices -^^^^^^^ +------- * :ref:`genindex` * :ref:`modindex` From 3548308daf5022069722b9afce66e295ba371e2c Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 09:52:40 -0500 Subject: [PATCH 16/48] minor changes in sphinx conf.py.. 
* removed version number from the side menu, the version will be rendered in the nav bar when deployed * replaced print lines with sphinx logger --- documentation/conf.py | 18 ++++++++++-------- documentation/plugins.rst | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/documentation/conf.py b/documentation/conf.py index 7bfbdb54..1ca1c7ec 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -9,6 +9,9 @@ import re import sys from pathlib import Path +from sphinx.util import logging + +logger = logging.getLogger(__name__) # -- Path setup -------------------------------------------------------------- # Add project root to sys.path so that autodoc can find the mmif package. @@ -23,14 +26,13 @@ project = 'mmif-python' blob_base_url = f'https://github.com/clamsproject/{project}/blob' -copyright = f'{datetime.date.today().year}, Brandeis LLC' author = 'Brandeis LLC' +copyright = f'{datetime.date.today().year}, {author}' try: version = open(proj_root_dir / 'VERSION').read().strip() except FileNotFoundError: - print("WARNING: VERSION file not found, using 'dev' as version.") + logger.warning("VERSION file not found, using 'dev' as version.") version = 'dev' -release = version # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -142,7 +144,7 @@ def update_target_versions(app): return # Insert new version - print(f"Updating target-versions.csv: {current_ver} -> {spec_ver}") + logger.info(f"Updating target-versions.csv: {current_ver} -> {spec_ver}") lines.insert(1, f'{current_ver},"{spec_ver}"\n') with open(csv_path, 'w') as f: @@ -187,7 +189,7 @@ def generate_whatsnew_rst(app): changelog_path = proj_root_dir / 'CHANGELOG.md' output_path = proj_root_dir / 'documentation' / 'whatsnew.md' if not changelog_path.exists(): - print(f"WARNING: CHANGELOG.md not found at {changelog_path}") + logger.warning(f"CHANGELOG.md not found at 
{changelog_path}") with open(output_path, 'w') as f: f.write("") return @@ -196,7 +198,7 @@ def generate_whatsnew_rst(app): found_version = False version_header_re = re.compile(r'^## releasing\s+([^\s]+)\s*(\(.*\))?') - print(f"DEBUG: Looking for version '{version}' in CHANGELOG.md") + logger.debug(f"Looking for version '{version}' in CHANGELOG.md") with open(changelog_path, 'r') as f: lines = f.readlines() @@ -216,7 +218,7 @@ def generate_whatsnew_rst(app): content.append(line) if not found_version: - print(f"NOTE: No changelog entry found for version {version}") + logger.info(f"No changelog entry found for version {version}") with open(output_path, 'w') as f: f.write("") else: @@ -232,4 +234,4 @@ def setup(app): app.connect('builder-inited', generate_cli_rst) app.connect('builder-inited', generate_whatsnew_rst) except ImportError: - print("WARNING: 'mmif' package not found. Skipping dynamic generation of parts of documentation.") + logger.warning("'mmif' package not found. Skipping dynamic generation of parts of documentation.") diff --git a/documentation/plugins.rst b/documentation/plugins.rst index 55443aa1..853b7b90 100644 --- a/documentation/plugins.rst +++ b/documentation/plugins.rst @@ -85,5 +85,5 @@ And the plugin code. Bulit-in Document Location Scheme Plugins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -At the moment, the ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. +At the moment, the ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that supports both ``http`` and ``https`` schemes. Take a look at the :mod:`mmif_docloc_http` module for details. From 02e1a9b81a2a14c8bb5a769fa9057e5cb3305c20 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 10:30:42 -0500 Subject: [PATCH 17/48] reverting "long" help msg for `mmif describe` command ... 
also removing duplicate entry in ignore --- .gitignore | 1 - mmif/utils/cli/describe.py | 47 ++++++++++++++++------------------- mmif/utils/workflow_helper.py | 10 ++++---- 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index c9c81e78..e7990191 100644 --- a/.gitignore +++ b/.gitignore @@ -75,7 +75,6 @@ mmif/ver mmif/res mmif/vocabulary ./VERSION* -VERSION .hypothesis # Documentation build artifacts diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index bbaac4c0..f8eb0e85 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -32,39 +32,36 @@ def describe_argparser(): `clams --help`, respectively. """ oneliner = ( - 'Describe the workflow specification from a MMIF ' - 'file or a collection of MMIF files.' + 'Describe the workflow specification from a MMIF file or a ' + 'collection of MMIF files.' ) - return oneliner, oneliner - - -def describe_rst(): - """ - Returns a restructured text string meant to be used in some sphinxs-generated - documention. - """ # get and clean docstrings - single_doc = describe_single_mmif.__doc__.split(':param')[0] - single_doc = textwrap.dedent(single_doc).strip() - collection_doc = describe_mmif_collection.__doc__.split(':param')[0] - collection_doc = textwrap.dedent(collection_doc).strip() + def _extract_describe_docstring(func): + doc = func.__doc__.split(':param')[0] + # then cut off all lines after `---` + doc = doc.split('---')[0] + return textwrap.dedent(doc).strip() + + single_doc = _extract_describe_docstring(describe_single_mmif) + collection_doc = _extract_describe_docstring(describe_mmif_collection) additional = textwrap.dedent(f""" This command extracts workflow information from a single MMIF file or - summarizes a directory of MMIF files. + summarizes a directory of MMIF files. 
The output is serialized as JSON and + includes: - ========================== - For a single MMIF file - ========================== - {single_doc} - - =============================== - For a directory of MMIF files - =============================== - {collection_doc} + ========================= + Single MMIF file as input + ========================= +{single_doc} + + ================================== + A directory of MMIF files as input + ================================== +{collection_doc} """) - return additional.strip() + return oneliner, additional def prep_argparser(**kwargs): diff --git a/mmif/utils/workflow_helper.py b/mmif/utils/workflow_helper.py index 7980eb89..ed305c7e 100644 --- a/mmif/utils/workflow_helper.py +++ b/mmif/utils/workflow_helper.py @@ -190,15 +190,15 @@ def describe_single_mmif(mmif_file: Union[str, Path]) -> dict: * ``stats`` A dictionary with the following keys: - ``appCount`` + * ``appCount`` Total number of identified app executions. - ``errorViews`` + * ``errorViews`` A list of view IDs that reported errors. - ``warningViews`` + * ``warningViews`` A list of view IDs that reported warnings. - ``emptyViews`` + * ``emptyViews`` A list of view IDs that contain no annotations. - ``annotationCountByType`` + * ``annotationCountByType`` A dictionary mapping each annotation type to its count, plus a ``total`` key for the sum of all annotations across all app executions. From 7895721e4e3f67d638be586af8ca5d93555c4c01 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 11:30:20 -0500 Subject: [PATCH 18/48] fix minor typos --- README.md | 3 +-- documentation/introduction.rst | 2 +- documentation/summarizer.rst | 4 ++-- mmif/utils/summarizer/graph.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1d786ea1..188d061c 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,7 @@ 1. serialization and de-serialization of MMIF internal data structures to/from JSON 2. validation of MMIF JSON 3. 
handling of CLAMS vocabulary types -4. navigation of MMIF objects via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type))`) - +4. navigation of MMIF objects via various "search" methods (e.g. `mmif.get_all_views_contain(vocab_type)`) ## For more ... diff --git a/documentation/introduction.rst b/documentation/introduction.rst index acc49015..f2ed9f90 100644 --- a/documentation/introduction.rst +++ b/documentation/introduction.rst @@ -37,7 +37,7 @@ The MMIF format and specification is evolving over time, and ``mmif-python`` pac MMIF Serialization --------------------------- -:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. Subcomponents of the MMIF object (views, annotation objects and metadata for each object)and the MMIF object itself are all subclasses of :class:`mmif.serialize.model.MmifObject`. To start with an existing MMIF :class:`str`, simply initiate a new ``Mmif`` object with that string. +:class:`mmif.serialize.mmif.Mmif` represents the top-level MMIF object. Subcomponents of the MMIF object (views, annotation objects and metadata for each object) and the MMIF object itself are all subclasses of :class:`mmif.serialize.model.MmifObject`. To start with an existing MMIF :class:`str`, simply initiate a new ``Mmif`` object with that string. .. code-block:: python diff --git a/documentation/summarizer.rst b/documentation/summarizer.rst index dacdffaa..eaef8970 100644 --- a/documentation/summarizer.rst +++ b/documentation/summarizer.rst @@ -12,12 +12,12 @@ makes some simplifying assumptions, including: - The time unit is assumed to be milliseconds. -The summarizer is accessable via the ``mmif`` command line script. To run the +The summarizer is accessible via the ``mmif`` command line script. To run the summarizer over a MMIF file and write the JSON summary to OUTFILE: .. 
code-block:: bash - mmif summarize -i INFILE -o OUTFILE + mmif summarize -i INFILE -o OUTFILE In all cases, the summarizer summarizes only the information that is there, it does not fix any mistakes and in general it does not add any information that is diff --git a/mmif/utils/summarizer/graph.py b/mmif/utils/summarizer/graph.py index a5ce3edd..b5ea40a2 100644 --- a/mmif/utils/summarizer/graph.py +++ b/mmif/utils/summarizer/graph.py @@ -177,7 +177,7 @@ class TokenIndex(object): {'v_4:td1': [ ((0, 5), ), ((5, 6), ), - ... + ...] } """ From 6b11dcdc5e35b5479c59459a14e4f426140f7508 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 11:31:05 -0500 Subject: [PATCH 19/48] update cli.rst, removing not-so-helpful, not-so-readable help msg snippets --- documentation/cli.rst | 8 ++++---- documentation/conf.py | 32 ++++++++++++-------------------- mmif/__init__.py | 31 ++++++++----------------------- 3 files changed, 24 insertions(+), 47 deletions(-) diff --git a/documentation/cli.rst b/documentation/cli.rst index d8fa33a2..481df49d 100644 --- a/documentation/cli.rst +++ b/documentation/cli.rst @@ -7,10 +7,10 @@ The ``mmif`` shell command The CLI scripts are installed as subcommands of the ``mmif`` shell command. Run the following to see the available commands or the MMIF version: -.. code-block:: bash +.. include:: cli_help.rst - mmif (--help | --version) +Please take a look at the individual command documentation for more details on each command: -The following documentation is automatically generated from the CLI help messages. +.. code-block:: text -.. 
include:: cli_help.rst + $ mmif --help diff --git a/documentation/conf.py b/documentation/conf.py index 1ca1c7ec..8f95b603 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -152,29 +152,21 @@ def update_target_versions(app): def generate_cli_rst(app): + from mmif import prep_argparser_and_subcmds + + # Generate main help + os.environ['COLUMNS'] = '100' + parser, _, _ = prep_argparser_and_subcmds() + help_text = parser.format_help() content = [] - # Generate main help - # os.environ['COLUMNS'] = '100' - # parser = mmif.prep_argparser_for_documentation() - # help_text = parser.format_help() - # content.append('Main Command\n') - # content.append('------------\n\n') - # content.append('.. code-block:: text\n\n') - # content.append(textwrap.indent(help_text, ' ')) - # content.append('\n\n') - - # Generate subcommand help - for cli_module in mmif.find_all_modules('mmif.utils.cli'): - cli_module_name = cli_module.__name__.rsplit('.')[-1] - subparser = cli_module.prep_argparser(prog=f'mmif {cli_module_name}') - sub_help = subparser.format_help() - content.append(f'{cli_module_name}\n') - content.append('-' * len(cli_module_name) + '\n\n') - content.append('.. code-block:: text\n\n') - content.append(textwrap.indent(sub_help, ' ')) - content.append('\n\n') + content.append('.. 
code-block:: text\n\n') + content.append(' $ mmif --help\n') + content.append(textwrap.indent(help_text, ' ')) + content.append('\n\n') + + # No longer generate subcommand help with open(proj_root_dir / 'documentation' / 'cli_help.rst', 'w') as f: f.write(''.join(content)) diff --git a/mmif/__init__.py b/mmif/__init__.py index 18d10c14..513ecd86 100644 --- a/mmif/__init__.py +++ b/mmif/__init__.py @@ -33,44 +33,29 @@ def find_all_modules(pkgname): yield importlib.import_module(module) -def prep_argparser_for_documentation(): - """This is specifically for when building the CLI documentation with sphinxs-build, - which without the prog parameter would otherwise use sphinxs-build when printing - the command name.""" - # TODO: this does not add the subcommands, which was faulty anyway because the - # original code left us with an empty list - parser = argparse.ArgumentParser(prog='mmif') - parser.add_argument( - '-v', '--version', - action='version', - version=version_template.format(__version__, __specver__) - ) - return parser - - def prep_argparser_and_subcmds(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(prog='mmif') parser.add_argument( '-v', '--version', action='version', version=version_template.format(__version__, __specver__) ) subparsers = parser.add_subparsers(title='sub-command', dest='subcmd') - return parser, subparsers - - -def cli(): - parser, subparsers = prep_argparser_and_subcmds() - cli_modules = {} + subcmds = {} for cli_module in find_all_modules('mmif.utils.cli'): cli_module_name = cli_module.__name__.rsplit('.')[-1] - cli_modules[cli_module_name] = cli_module + subcmds[cli_module_name] = cli_module subcmd_parser = cli_module.prep_argparser(add_help=False) subparsers.add_parser(cli_module_name, parents=[subcmd_parser], help=cli_module.describe_argparser()[0], description=cli_module.describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, ) + return parser, subparsers, subcmds + + +def cli(): + 
parser, subparsers, cli_modules = prep_argparser_and_subcmds() if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) From 9aac163610aa6409de6b6f323eefb2c64a2a6e9b Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 11:39:43 -0500 Subject: [PATCH 20/48] merged developer documents into CONTRIBUTING file --- CONTRIBUTING.md | 40 +++++++++++++++++++++- README.md | 6 +--- mmif/utils/cli/README.md | 71 ---------------------------------------- 3 files changed, 40 insertions(+), 77 deletions(-) delete mode 100644 mmif/utils/cli/README.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0c7f166b..312a168c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,43 @@ # Contributing to mmif-python +## CLI Scripts + +The `mmif` command-line interface supports subcommands (e.g., `mmif source`, `mmif describe`). These are implemented as Python modules in `mmif/utils/cli/`. + +### Adding a New CLI Script + +To add a new CLI subcommand, create a Python module in `mmif/utils/cli/` with these three required functions: + +1. **`prep_argparser(**kwargs)`** - Define and return an `argparse.ArgumentParser` instance for your subcommand. + +2. **`describe_argparser()`** - Return a tuple of two strings: + - A one-line description (shown in `mmif --help`) + - A more verbose description (shown in `mmif --help`) + +3. **`main(args)`** - Execute the subcommand logic with the parsed arguments. + +See existing modules like `summarize.py` or `describe.py` for examples. + +### How CLI Discovery Works + +The CLI system automatically discovers subcommands at runtime. The entry point is configured in `setup.py`: + +```python +entry_points={ + 'console_scripts': [ + 'mmif = mmif.__init__:cli', + ], +}, +``` + +The `cli()` function in `mmif/__init__.py` delegates to `prep_argparser_and_subcmds()`, which uses `find_all_modules('mmif.utils.cli')` to locate all modules in the CLI package. For each module found, it: + +1. 
Calls `prep_argparser()` to get the argument parser +2. Calls `describe_argparser()` for help text +3. Registers the module name as a subcommand + +This means adding a properly structured module is all that's needed - no modifications to `setup.py` or other configuration files are required. + ## Documentation The documentation for `mmif-python` is built using Sphinx and published to the [CLAMS documentation hub](https://github.com/clamsproject/website-test). @@ -14,7 +52,7 @@ make doc python3 build-tools/docs.py ``` -The output will be in `documentation/_build/html`. +The output will be in `docs-test` For more options, run `python build-tools/docs.py --help`. ### Building Documentation for Old Versions diff --git a/README.md b/README.md index 188d061c..a2e57601 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,4 @@ * [Version history and patch notes](https://github.com/clamsproject/mmif-python/blob/main/CHANGELOG.md) * [MMIF Python API documentation](https://clamsproject.github.io/mmif-python/latest) * [MMIF JSON specification and schema](https://clamsproject.github.io/mmif) - - -## For devs ... - -* To build the documentation: `python build-tools/docs.py --help` +* [Contributing guide](CONTRIBUTING.md) diff --git a/mmif/utils/cli/README.md b/mmif/utils/cli/README.md deleted file mode 100644 index 6d04438d..00000000 --- a/mmif/utils/cli/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# MMIF CLI Scripts - -This directory contains CLI scripts like `source` and `rewind` that can be called from the command line. These scripts are called as subcommands of the `mmif` CLI script, for example `mmif source --help`. - - -## Adding another CLI script - -To add a CLI script all you need to do is add a python module to `mmif/utils/cli` and make sure it has the following three methods: - -1. `prep_argparser(**kwargs)` to define and return an instance of `argparse.ArgumentParser`. - -2. `describe_argparser()` to return a pair of strings that describe the script. 
The first string is a one-line description of the argument parser and the second a more verbose description. These will be shown for `mmif --help` and `mmif subcommand --help` respectively. - -3. `main(args)` to do the actual work of running the code - -See the current CLI scripts for examples. - - -## Some background - -The mmif-python package has a particular way to deal with CLI utility scripts. All scripts live in the mmif.utils.cli package. The `mmif/__init__.py` module has the `cli()` function which illustrates the requirements on utility scripts: - -```python -def cli(): - parser, subparsers = prep_argparser_and_subcmds() - cli_modules = {} - for cli_module in find_all_modules('mmif.utils.cli'): - cli_module_name = cli_module.__name__.rsplit('.')[-1] - cli_modules[cli_module_name] = cli_module - subcmd_parser = cli_module.prep_argparser(add_help=False) - subparsers.add_parser(cli_module_name, parents=[subcmd_parser], - help=cli_module.describe_argparser()[0], - description=cli_module.describe_argparser()[1], - formatter_class=argparse.RawDescriptionHelpFormatter) - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - args = parser.parse_args() - if args.subcmd not in cli_modules: - parser.print_help(sys.stderr) - else: - cli_modules[args.subcmd].main(args) -``` - - - -You can see the invocations of the three functions mentioned above. - -The `prep_argparser()` function uses `find_all_modules()`, which finds modules in the top-level of the cli package. That module could have all the code needed for the CLI to work, but it could refer to other modules as well. For example, the `summary.py` script is in `cli`, but it imports the summary utility from `mmif.utls`. 
- -In the setup.py script there is this passage towards the end of the file: - -```python - entry_points={ - 'console_scripts': [ - 'mmif = mmif.__init__:cli', - ], - }, -``` - -This leaves it up to the `cli()` method to find the scripts and this is why just adding a submodule as mentioned above works. Note that the initialization file of the cli package imports two of the commandline related scripts: - -```python -from mmif.utils.cli import rewind -from mmif.utils.cli import source -``` - -These may be used somewhere, but they are not necessary to run MMIF CLI scripts. - From 99c8338104c5d1f0d414a9bd07bd0761dd90afb1 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 14:36:54 -0500 Subject: [PATCH 21/48] added apidoc to automate package/module discovery for sphinx docs gen --- .gitignore | 1 + CONTRIBUTING.md | 19 ++++++- documentation/autodoc/mmif.serialize.rst | 37 -------------- documentation/autodoc/mmif.utils.cli.rst | 40 --------------- documentation/autodoc/mmif.utils.rst | 49 ------------------- .../autodoc/mmif.utils.summarizer.rst | 44 ----------------- documentation/autodoc/mmif.vocabulary.rst | 28 ----------- documentation/autodoc/mmif_docloc_http.rst | 11 ----- documentation/conf.py | 48 ++++++++++++++++++ documentation/index.rst | 1 - documentation/modules.rst | 18 ++----- mmif/serialize/__init__.py | 4 ++ mmif/utils/__init__.py | 4 ++ mmif/utils/cli/__init__.py | 4 ++ mmif/utils/summarizer/__init__.py | 28 +---------- mmif_docloc_http/__init__.py | 8 +++ 16 files changed, 91 insertions(+), 253 deletions(-) delete mode 100644 documentation/autodoc/mmif.serialize.rst delete mode 100644 documentation/autodoc/mmif.utils.cli.rst delete mode 100644 documentation/autodoc/mmif.utils.rst delete mode 100644 documentation/autodoc/mmif.utils.summarizer.rst delete mode 100644 documentation/autodoc/mmif.vocabulary.rst delete mode 100644 documentation/autodoc/mmif_docloc_http.rst diff --git a/.gitignore b/.gitignore index e7990191..7588b933 100644 --- 
a/.gitignore +++ b/.gitignore @@ -80,6 +80,7 @@ mmif/vocabulary # Documentation build artifacts documentation/cli_help.rst documentation/whatsnew.md +documentation/autodoc docs-test # environments diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 312a168c..01e69817 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,7 +52,24 @@ make doc python3 build-tools/docs.py ``` -The output will be in `docs-test` For more options, run `python build-tools/docs.py --help`. +The output will be in `docs-test`. For more options, run `python build-tools/docs.py --help`. + +### API Documentation (autodoc) + +As of 2026 (since the next version of 1.2.1), API documentation is **automatically generated** using `sphinx-apidoc`. When you run the documentation build: + +1. The `run_apidoc()` function in `documentation/conf.py` runs automatically +2. It scans packages listed in `apidoc_package_names` (currently `mmif` and `mmif_docloc_http`) +3. RST files are generated in `documentation/autodoc/` +4. These files are **not tracked in git** - they're regenerated on each build + +**When you add a new module or subpackage**, it will be automatically documented on the next build. No manual updates required. + +**To add a new top-level package** (like `mmif_docloc_http`), add it to `apidoc_package_names` in `documentation/conf.py`. + +**To exclude a subpackage** from documentation (like `mmif.res` or `mmif.ver`), add it to `apidoc_exclude_paths`. + +**Module docstrings** in `__init__.py` files are used as package descriptions in the documentation. Keep them concise and informative. ### Building Documentation for Old Versions diff --git a/documentation/autodoc/mmif.serialize.rst b/documentation/autodoc/mmif.serialize.rst deleted file mode 100644 index e58e0c24..00000000 --- a/documentation/autodoc/mmif.serialize.rst +++ /dev/null @@ -1,37 +0,0 @@ -mmif.serialize package -====================== - -Core package to provide serialization and deserialization of MMIF format. 
- -``model`` module ---------------------------- - -.. automodule:: mmif.serialize.model - :members: - :undoc-members: - :show-inheritance: - -``mmif`` module --------------------------- - -.. automodule:: mmif.serialize.mmif - :members: - :undoc-members: - :show-inheritance: - -``view`` module --------------------------- - -.. automodule:: mmif.serialize.view - :members: - :undoc-members: - :show-inheritance: - -``annotation`` module --------------------------------- - -.. automodule:: mmif.serialize.annotation - :members: - :undoc-members: - :show-inheritance: - diff --git a/documentation/autodoc/mmif.utils.cli.rst b/documentation/autodoc/mmif.utils.cli.rst deleted file mode 100644 index 2d0aa521..00000000 --- a/documentation/autodoc/mmif.utils.cli.rst +++ /dev/null @@ -1,40 +0,0 @@ -mmif.utils.cli package -====================== - -Package containing CLI modules. - -Submodules ----------- - -``describe`` module -------------------- - -.. automodule:: mmif.utils.cli.describe - :members: - :undoc-members: - :show-inheritance: - -``rewind`` module ------------------ - -.. automodule:: mmif.utils.cli.rewind - :members: - :undoc-members: - :show-inheritance: - -``source`` module ------------------ - -.. automodule:: mmif.utils.cli.source - :members: - :undoc-members: - :show-inheritance: - -``summarize`` module --------------------- - -.. automodule:: mmif.utils.cli.summarize - :members: - :undoc-members: - :show-inheritance: - diff --git a/documentation/autodoc/mmif.utils.rst b/documentation/autodoc/mmif.utils.rst deleted file mode 100644 index 8bd90cfd..00000000 --- a/documentation/autodoc/mmif.utils.rst +++ /dev/null @@ -1,49 +0,0 @@ -mmif.utils package -================== - -Package containing utility modules for handling different types of source -documents, and general implementation of common data structures and -algorithms. - -Submodules ----------- - -``video_document_helper`` module --------------------------------- - -.. 
automodule:: mmif.utils.video_document_helper - :members: - :undoc-members: - :show-inheritance: - -``text_document_helper`` module -------------------------------- - -.. automodule:: mmif.utils.text_document_helper - :members: - :undoc-members: - :show-inheritance: - -``timeunit_helper`` module -------------------------------- - -.. automodule:: mmif.utils.timeunit_helper - :members: - :undoc-members: - :show-inheritance: - -``sequence_helper`` module --------------------------- - -.. automodule:: mmif.utils.sequence_helper - :members: - :undoc-members: - :show-inheritance: - -``workflow_helper`` module --------------------------- - -.. automodule:: mmif.utils.workflow_helper - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/documentation/autodoc/mmif.utils.summarizer.rst b/documentation/autodoc/mmif.utils.summarizer.rst deleted file mode 100644 index 299322c1..00000000 --- a/documentation/autodoc/mmif.utils.summarizer.rst +++ /dev/null @@ -1,44 +0,0 @@ -mmif.utils.summarizer package -============================= - -Package containing the code to generate a summary from a MMIF file. - - -Submodules ----------- - - -``graph`` module ----------------- - -.. automodule:: mmif.utils.summarizer.graph - :members: - :undoc-members: - :show-inheritance: - - -``nodes`` module ----------------- - -.. automodule:: mmif.utils.summarizer.nodes - :members: - :undoc-members: - :show-inheritance: - - -``summary`` module ------------------- - -.. automodule:: mmif.utils.summarizer.summary - :members: - :undoc-members: - :show-inheritance: - - -``utils`` module ----------------- - -.. 
automodule:: mmif.utils.summarizer.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/documentation/autodoc/mmif.vocabulary.rst b/documentation/autodoc/mmif.vocabulary.rst deleted file mode 100644 index 0eb985b5..00000000 --- a/documentation/autodoc/mmif.vocabulary.rst +++ /dev/null @@ -1,28 +0,0 @@ -mmif.vocabulary package -======================= - -Package contains Enum-like classes for CLAMS vocabulary. - -.. autoclass:: mmif.vocabulary.ThingTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.ThingType - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: mmif.vocabulary.ClamsTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.AnnotationTypesBase - :show-inheritance: -.. autoclass:: mmif.vocabulary.DocumentTypesBase - :show-inheritance: - -.. autoclass:: mmif.vocabulary.AnnotationTypes - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: mmif.vocabulary.DocumentTypes - :members: - :undoc-members: - :show-inheritance: diff --git a/documentation/autodoc/mmif_docloc_http.rst b/documentation/autodoc/mmif_docloc_http.rst deleted file mode 100644 index b76c8df5..00000000 --- a/documentation/autodoc/mmif_docloc_http.rst +++ /dev/null @@ -1,11 +0,0 @@ -mmif_docloc_http package -======================== - -MMIF document location helper module for `http` and `https` schemes. -If you want to write your own docloc scheme handler, please use the source code of this module as a reference. -See this :ref:`plug-in section ` for more information. - -.. 
automodule:: mmif_docloc_http - :members: - :undoc-members: - :show-inheritance: diff --git a/documentation/conf.py b/documentation/conf.py index 8f95b603..121054ef 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -21,6 +21,20 @@ # At this point, `pip install -e .` should have been run, so mmif is importable import mmif +# apidoc settings +apidoc_package_names = ['mmif', 'mmif_docloc_http'] +apidoc_exclude_paths = [ + proj_root_dir / 'mmif' / 'res', + proj_root_dir / 'mmif' / 'ver', +] +# this is used by sphinx.ext.autodoc +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'show-inheritance': True, +} + + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information @@ -220,8 +234,42 @@ def generate_whatsnew_rst(app): f.writelines(content) +def run_apidoc(app): + """ + Run sphinx-apidoc to auto-generate RST files for all modules. + This ensures new modules are automatically documented without manual updates. + """ + from sphinx.ext.apidoc import main as apidoc_main + + docs_dir = Path(__file__).parent + output_dir = docs_dir / 'autodoc' + + exclude_paths = map(str, apidoc_exclude_paths) + + # Run sphinx-apidoc for each package specified in package_names + # apidoc_main() accepts argv-style arguments (without the program name) + for package_name in apidoc_package_names: + package_dir = proj_root_dir / package_name + if not package_dir.exists(): + logger.warning(f"Package directory {package_dir} does not exist. 
" + f"Skipping apidoc for {package_name}.") + continue + + args = [ + '-o', str(output_dir), + str(package_dir), + *exclude_paths, + '--force', # Overwrite existing files + '--module-first', # Put module docs before submodule docs + '--no-toc', # Don't create modules.rst (we maintain our own) + ] + logger.info(f"Running sphinx-apidoc with args: {args}") + apidoc_main(args) + + def setup(app): try: + app.connect('builder-inited', run_apidoc) app.connect('builder-inited', update_target_versions) app.connect('builder-inited', generate_cli_rst) app.connect('builder-inited', generate_whatsnew_rst) diff --git a/documentation/index.rst b/documentation/index.rst index 9535636b..218271ba 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -21,7 +21,6 @@ Contents .. toctree:: :maxdepth: 2 - :caption: API documentation: modules diff --git a/documentation/modules.rst b/documentation/modules.rst index 22838e39..882db997 100644 --- a/documentation/modules.rst +++ b/documentation/modules.rst @@ -1,22 +1,10 @@ .. _apidoc: -mmif package -============ +API Documentation +================= .. toctree:: :maxdepth: 4 - autodoc/mmif.serialize - autodoc/mmif.vocabulary - autodoc/mmif.utils - autodoc/mmif.utils.cli - autodoc/mmif.utils.summarizer - -mmif_docloc_http package -======================== - -.. toctree:: - :maxdepth: 2 - + autodoc/mmif autodoc/mmif_docloc_http - diff --git a/mmif/serialize/__init__.py b/mmif/serialize/__init__.py index 18523bac..06964253 100644 --- a/mmif/serialize/__init__.py +++ b/mmif/serialize/__init__.py @@ -1,3 +1,7 @@ +""" +Core package to provide serialization and deserialization of MMIF format. 
+""" + from .annotation import * from .annotation import __all__ as anno_all from .mmif import * diff --git a/mmif/utils/__init__.py b/mmif/utils/__init__.py index e69de29b..fe8aea5d 100644 --- a/mmif/utils/__init__.py +++ b/mmif/utils/__init__.py @@ -0,0 +1,4 @@ +""" +Package containing utility modules for handling different types of source +documents, and general implementation of common data structures and algorithms. +""" diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 24855994..4993ea8e 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -1,3 +1,7 @@ +""" +Package containing CLI modules. +""" + from mmif.utils.cli import describe from mmif.utils.cli import rewind from mmif.utils.cli import source diff --git a/mmif/utils/summarizer/__init__.py b/mmif/utils/summarizer/__init__.py index c1afbb78..bbbd9cb8 100644 --- a/mmif/utils/summarizer/__init__.py +++ b/mmif/utils/summarizer/__init__.py @@ -1,31 +1,5 @@ """ - -MMIF consumer that creates a JSON summary from a MMIF file. - -Makes some simplifying assumptions, including: - -- There is one video in the MMIF documents list. All start and end properties - are pointing to that video. -- The time unit is assumed to be milliseconds. - -USAGE: - - $ mmif summarize -i INFILE -o OUTFILE - - Run the summarizer over a MMIF file and write the JSON summary to OUTFILE. - -In all cases, the summarizer summarizes the information that is there, it does -not fix any mistakes and in general it does not add any information that is not -explicitly or implicitly in the MMIF file. In rare cases some information is -added, for example if an ASR tool does not group tokens in sentence-like objects -then the summarizer will do that, but then only by creating token groups of the -same length. 
- -The summary includes the MMIF version, the list of documents, a summary of the -metadata of all views (identifier, CLAMS app, timestamp, total number of -annotations and number of annotations per type, it does not show parameters and -application configuration), time frames, transcript, captions and entities. - +Package containing the code to generate a summary from a MMIF file. """ diff --git a/mmif_docloc_http/__init__.py b/mmif_docloc_http/__init__.py index 9bdf9f22..c360f401 100644 --- a/mmif_docloc_http/__init__.py +++ b/mmif_docloc_http/__init__.py @@ -1,3 +1,11 @@ +""" +MMIF document location helper module for ``http`` and ``https`` schemes. + +If you want to write your own docloc scheme handler, please use the source +code of this module as a reference. See the :ref:`plug-in section ` +for more information. +""" + import urllib.request import urllib.error From 72bfc32b9b6b1f7ab485cf532b42f9d51bfdd1cb Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Fri, 30 Jan 2026 14:33:43 -0500 Subject: [PATCH 22/48] removed temporary note file --- documentation-notes.md | 63 ------------------------------------------ 1 file changed, 63 deletions(-) delete mode 100644 documentation-notes.md diff --git a/documentation-notes.md b/documentation-notes.md deleted file mode 100644 index c0098043..00000000 --- a/documentation-notes.md +++ /dev/null @@ -1,63 +0,0 @@ -# Documentation notes - -Various temporary notes on the documentation. Parts of this should maybe be added to [issue #348](https://github.com/clamsproject/mmif-python/issues/348) or to a more general issue on mmif-python documentation. - -Do not keep this file here forever. - --- - -In the [346-summarizer](https://github.com/clamsproject/mmif-python/tree/346-summarizer) branch I added one line trying to generate API documentation for the sumarizer: - -```rest -.. 
toctree:: - :maxdepth: 4 - - autodoc/mmif.serialize - autodoc/mmif.vocabulary - autodoc/mmif.utils - autodoc/mmif.utils.summarizer -``` - -However, it looks like this needs to be done elsewhere since after `make doc` no `mmif.utils.summarizer.html` file is added to `doct-test/develop/autodoc` and we get a warning that the TOC cannot add the module. - -Also note that this doesn't work for the mmif.utils.cli package either. - --- - -At the moment `documentation/index.rst` imports the top-level readme file. Should probably revisit that because the goal of that file is different from what we are doing here. - -Update: I removed the include and wrote a shorter intro, but there is already something along those lines in `documentation/introduction.rst` so there is still some smoothing to be done here. - --- - -In the summarizer branch there is a markdown file in the mmif.utils.summary package, that should maby be added here as `documentation/creating-clis.rst`/ - --- - -All the source links in the generated documentation are dead. I thought that maybe editing `documentation/conf.py` and changing the line - -```python -html_show_sourcelink = True # Furo handles this well, no need to hide -``` - -by setting the variable to False might work, but that was a wild guess and it did not work. - -On second thought, this is probably because the source links go to pages that do not exist yet. - --- - -When reading the changes for a version frm the changelog file some of the typesetting does not translate well, for example in version 1.2.1 we get the raw text for the note: - -``` -[!NOTE] mmif describe (and the underlying mmif.utils.workflow_helper) is still experimental and subject to change in future releases without notice. Backward compatibility is not guaranteed. -``` - --- - -Some changes already made (but not necessarily pushed up yet): - -- Fixing some types and minor style errors. -- Some type setting changes. -- Refactored the way the "what's new in section X" is generated. 
-- Removed the Search Page link from the main page. It was leading nowhere and there is a perfectly fine search box on the left anyway. -- Updated python requirement. \ No newline at end of file From 0a26924204347838050dff06770b12140d37ea7c Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sun, 1 Feb 2026 14:53:03 -0500 Subject: [PATCH 23/48] added caching local resolved path from http:// URI for speed --- documentation/plugins.rst | 40 +++++++++++++++++++++++++++++++----- mmif_docloc_http/__init__.py | 12 ++++++++--- tests/test_serialize.py | 15 ++++++++++++++ 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/documentation/plugins.rst b/documentation/plugins.rst index 1af39426..f3131224 100644 --- a/documentation/plugins.rst +++ b/documentation/plugins.rst @@ -3,7 +3,6 @@ Developing plugins for MMIF Python SDK ====================================== - Overview -------- @@ -80,10 +79,41 @@ And the plugin code. def help(): return "location format: `.video`" - - -Bulit-in Document Location Scheme Plugins +Built-in Document Location Scheme Plugins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -At the moment, ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. +At the moment, ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. This plugin implements caching as described above, so repeated access to the same URL will not trigger multiple downloads. Take a look at :mod:`mmif_docloc_http` module for details. + +Caching for Remote File Access +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When developing plugins that resolve remote document locations (e.g., ``http``, ``s3``, or custom schemes), it is highly recommended to implement caching to avoid repeated network requests or file downloads. 
Since ``mmif-python`` may call the ``resolve`` function multiple times for the same document location during processing, caching can significantly improve performance. + +A simple and effective approach is to use a module-level dictionary as a cache. Because Python modules are singletons (loaded once and cached in ``sys.modules``), this cache persists for the entire lifetime of the Python process, across multiple MMIF files and Document objects. + +Here's an example of how to implement caching in a plugin: + +.. code-block:: python + + # mmif_docloc_myscheme/__init__.py + + _cache = {} + + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] + + # ... your resolution logic here ... + resolved_path = do_actual_resolution(docloc) + + _cache[docloc] = resolved_path + return resolved_path + +This pattern ensures that: + +* The first call to ``resolve`` performs the actual resolution (download, API call, etc.) +* Subsequent calls for the same location return the cached result immediately +* The cache is shared across all MMIF objects processed within the same Python process + +See :mod:`mmif_docloc_http` for a concrete example of this caching strategy in action. diff --git a/mmif_docloc_http/__init__.py b/mmif_docloc_http/__init__.py index 9bdf9f22..f92c87ba 100644 --- a/mmif_docloc_http/__init__.py +++ b/mmif_docloc_http/__init__.py @@ -1,16 +1,22 @@ import urllib.request import urllib.error +_cache = {} + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] try: if docloc.startswith('http://') or docloc.startswith('https://'): - return urllib.request.urlretrieve(docloc)[0] + path = urllib.request.urlretrieve(docloc)[0] + _cache[docloc] = path + return path else: raise ValueError(f'cannot handle document location scheme: {docloc}') except urllib.error.URLError as e: raise e - - + + def help(): return "location must be a URL string." 
diff --git a/tests/test_serialize.py b/tests/test_serialize.py index b0836c5a..9e857a00 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -269,6 +269,21 @@ def test_document_location_helpers_http(self): # round_trip = Document(new_doc.serialize()) self.assertEqual(Document(new_doc.serialize()).serialize(), new_doc.serialize()) + def test_document_location_http_caching(self): + import mmif_docloc_http + mmif_docloc_http._cache.clear() + test_url = "https://example.com/" + self.assertNotIn(test_url, mmif_docloc_http._cache) + new_doc = Document() + new_doc.id = "d1" + new_doc.location = test_url + new_doc.location_path() + self.assertIn(test_url, mmif_docloc_http._cache) + # second call should use cache (same path returned) + cached_path = mmif_docloc_http._cache[test_url] + second_path = new_doc.location_path() + self.assertEqual(cached_path, second_path) + def test_get_documents_locations(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) self.assertEqual(1, len(mmif_obj.get_documents_locations(DocumentTypes.VideoDocument))) From aa8b7d89a74a8c95b681f4ec95322bb40bcf210e Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 2 Feb 2026 12:58:03 -0500 Subject: [PATCH 24/48] minor documentation fix --- mmif/serialize/annotation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 6527f482..672fa0bb 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -374,7 +374,7 @@ def add_property(self, name: str, With the former method, the SDK will record the added property as a `Annotation` annotation object, separate from the original `Document` - object. See :meth:`.Mmif.generate_capital_annotations()` for more. + object. See :meth:`mmif.serialize.Mmif.generate_capital_annotations()` for more. 
A few notes to keep in mind: From 39bbf420a4ba9387fb95fbb7bb6a00b7659723d0 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 2 Feb 2026 13:04:26 -0500 Subject: [PATCH 25/48] removed "source" media counts from workflow ID prefix ... - addressing https://github.com/clamsproject/mmif-python/issues/326#issuecomment-3675345689 --- mmif/utils/workflow_helper.py | 8 +++----- tests/test_utils.py | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/mmif/utils/workflow_helper.py b/mmif/utils/workflow_helper.py index 08920292..74206db6 100644 --- a/mmif/utils/workflow_helper.py +++ b/mmif/utils/workflow_helper.py @@ -93,7 +93,9 @@ def _read_mmif_from_path(mmif_input: Union[str, Path, Mmif]) -> Mmif: ) -def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], return_param_dicts=False) -> Union[str, Tuple[str, List[dict]]]: +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts=False) \ + -> Union[str, Tuple[str, List[dict]]]: """ Generate a workflow identifier string from a MMIF file or object. 
@@ -111,10 +113,6 @@ def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], return_para data = _read_mmif_from_path(mmif_input) segments = [] - # First prefix is source information, sorted by document type - sources = Counter(doc.at_type.shortname for doc in data.documents) - segments.append('-'.join([f'{k}-{sources[k]}' for k in sorted(sources.keys())])) - # Group views into runs grouped_apps = group_views_by_app(data.views) diff --git a/tests/test_utils.py b/tests/test_utils.py index 1fb97696..5f29b9d2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -276,9 +276,9 @@ def test_generate_workflow_identifier_grouped(self): try: workflow_id = workflow_helper.generate_workflow_identifier(tmp_file) segments = workflow_id.split('/') - self.assertEqual(len(segments), 7) - self.assertIn('app1', segments[1]) - self.assertIn('app2', segments[4]) + self.assertEqual(len(segments), 6) + self.assertIn('app1', segments[0]) + self.assertIn('app2', segments[3]) finally: os.unlink(tmp_file) From 854739dcbf4d0bdd73325dafed0bf74b81094542 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 3 Feb 2026 18:16:39 -0500 Subject: [PATCH 26/48] documented git workflow for developers --- CONTRIBUTING.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 01e69817..ca8fdb9d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,22 @@ # Contributing to mmif-python +## Git Workflow + +We follow a Gitflow-inspired branching model to maintain a stable `main` branch and a dynamic `develop` branch. + +1. **Branch Roles**: + - `main`: Reserved for stable, production-ready releases. + - `develop`: The primary branch for ongoing development, feature integration, and bug fixes. This serves as the "staging" area for the next release. +2. **Issue Tracking**: Every contribution (bug fix or feature) must first be reported as a [GitHub Issue](https://github.com/clamsproject/mmif-python/issues). 
Issues should clearly define goals and, preferably, include an implementation plan. +3. **Branch Naming**: Create a dedicated working branch for each issue. Branches must be named using the format `NUM-short-description`, where `NUM` is the issue number (e.g., `113-fix-file-loading`). +4. **Pull Requests (PRs)**: + - Once work is complete, open a PR targeting the `develop` branch. + - **Communication**: High-level discussion and planning should occur in the issue thread. The PR conversation is strictly for code review and implementation-specific feedback. +5. **Releases**: + - When `develop` is ready for a new release, open a PR from `develop` to `main` using the "release" PR template. + - After merging the release candidate into `main`, manually tag the commit with the version number. This tag triggers the automated CI/CD pipeline for publishing. +6. **Branch Protection**: Both `main` and `develop` are protected branches. Direct pushes are disabled; all changes must be introduced via Pull Requests. + ## CLI Scripts The `mmif` command-line interface supports subcommands (e.g., `mmif source`, `mmif describe`). These are implemented as Python modules in `mmif/utils/cli/`. From b5a32f9229a9074b09bce472bfc5d98bc55a5070 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 9 Feb 2026 17:38:00 -0500 Subject: [PATCH 27/48] some clarification regarding CLI subcmd auto-discovery --- CONTRIBUTING.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ca8fdb9d..93baf8f7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -25,7 +25,7 @@ The `mmif` command-line interface supports subcommands (e.g., `mmif source`, `mm To add a new CLI subcommand, create a Python module in `mmif/utils/cli/` with these three required functions: -1. **`prep_argparser(**kwargs)`** - Define and return an `argparse.ArgumentParser` instance for your subcommand. +1. 
**`prep_argparser(**kwargs)`** - Define and return an `argparse.ArgumentParser` instance for your subcommand. When called during discovery, the main CLI will pass `add_help=False` to this function to avoid duplicate help flags. 2. **`describe_argparser()`** - Return a tuple of two strings: - A one-line description (shown in `mmif --help`) @@ -33,11 +33,12 @@ To add a new CLI subcommand, create a Python module in `mmif/utils/cli/` with th 3. **`main(args)`** - Execute the subcommand logic with the parsed arguments. -See existing modules like `summarize.py` or `describe.py` for examples. +[!NOTE] +> CLI modules should typically act as thin wrappers. It is recommended to implement the core utility logic in other packages (e.g., `mmif.utils`) and import it into the CLI module. See existing modules like `summarize.py` (which imports from `mmif.utils.summarizer`) or `describe.py` for examples. ### How CLI Discovery Works -The CLI system automatically discovers subcommands at runtime. The entry point is configured in `setup.py`: +The CLI system automatically discovers subcommands at runtime. The entry point is configured in the build script (currently `setup.py`) as follows: ```python entry_points={ @@ -47,13 +48,12 @@ entry_points={ }, ``` -The `cli()` function in `mmif/__init__.py` delegates to `prep_argparser_and_subcmds()`, which uses `find_all_modules('mmif.utils.cli')` to locate all modules in the CLI package. For each module found, it: +The `cli()` function in `mmif/__init__.py` handles discovery and delegation. It uses `pkgutil.walk_packages` to find all modules within the top-level of the `mmif.utils.cli` package. For the discovery logic to work, a "cli module" should implement the requirements outlined above. -1. Calls `prep_argparser()` to get the argument parser -2. Calls `describe_argparser()` for help text -3. 
Registers the module name as a subcommand +This means adding a properly structured module within the CLI package is all that's needed—the module name will automatically be registered as a subcommand. No modifications to `setup.py` or other configuration files are required. -This means adding a properly structured module is all that's needed - no modifications to `setup.py` or other configuration files are required. +> [!NOTE] +> Any "client" code (not shell CLI) wants to use a module in `cli` package should be able to directrly `from mmif.utils.cli import a_module`. However, for historical reasons, some CLI modules are manually imported in `mmif/__init__.py` (e.g., `source.py`) for backward compatibility for clients predateing the discovery system. ## Documentation From 39ef2befd4f2c868a9abc1d64db619351dd19126 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 9 Feb 2026 17:42:38 -0500 Subject: [PATCH 28/48] local test build for documentation no longer requires VERSION file --- Makefile | 15 +++++---------- build-tools/docs.py | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index ec63ccf1..0f3d3386 100644 --- a/Makefile +++ b/Makefile @@ -36,17 +36,12 @@ publish: distclean version package test $(generatedcode): dist/$(sdistname)*.tar.gz docs: - @echo "WARNING: The 'docs' target is deprecated and will be removed." - @echo "The 'docs' directory is no longer used. Documentation is now hosted in the central CLAMS documentation hub." - @echo "Use 'make doc' for local builds or 'make doc-version' for specific versions." - @echo "Nothing is done." + @echo "The 'docs' target is deprecated and will be removed." + @echo "Documentation is now managed by 'build-tools/docs.py'." + @echo "Please run 'python3 build-tools/docs.py --help' for usage." 
-doc: # for single version sphinx - builds current source - python3 build-tools/docs.py - -doc-version: # interactive build for specific version - @read -p "Enter version/tag to build (e.g., v1.0.0): " ver; \ - [ -n "$$ver" ] && python3 build-tools/docs.py --build-ver $$ver +doc: docs +doc-version: docs package: VERSION dist/$(sdistname)*.tar.gz diff --git a/build-tools/docs.py b/build-tools/docs.py index ee5d4550..eaea47e3 100644 --- a/build-tools/docs.py +++ b/build-tools/docs.py @@ -40,6 +40,19 @@ def run_sphinx_build(self, *args, cwd=None, check=True): return run_command([self.sphinx_build, *args], cwd=cwd, check=check) +def get_dummy_version(): + """Returns a dummy version based on current git branch and dirty status. + Falls back to 'unknown' if not in a git repository.""" + try: + branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], + stderr=subprocess.DEVNULL, text=True).strip() + dirty = subprocess.run(["git", "diff", "--quiet"], + stderr=subprocess.DEVNULL, check=False).returncode != 0 + return f"{branch}{'+dirty' if dirty else ''}" + except (subprocess.CalledProcessError, FileNotFoundError): + return "unknown" + + def build_docs_local(source_dir: Path, output_dir: Path): """ Builds documentation for the provided source directory. @@ -47,6 +60,18 @@ def build_docs_local(source_dir: Path, output_dir: Path): """ print("--- Running in Local Build Mode ---") + # Warning for user as VERSION file is critical + if sys.stdin.isatty(): + import select + print("\nWARNING: The 'VERSION' file will be overwritten with a dummy version for this local build.") + print("Pausing for 3 seconds (press Enter to continue immediately)...") + select.select([sys.stdin], [], [], 3) + + # Overwrite VERSION file with dummy version for local builds + version = get_dummy_version() + print(f"Generating dummy VERSION for local build: {version}") + (source_dir / "VERSION").write_text(version) + # 1. Generate source code and install in editable mode. 
print("\n--- Step 1: Generating source code and installing in editable mode ---") try: From 583174d9a2f8ec14c1261e2dfb46dd399a698fca Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 9 Feb 2026 17:44:55 -0500 Subject: [PATCH 29/48] fixed ambiguous fn references in docstring --- mmif/serialize/annotation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 6527f482..b7f002cd 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -374,7 +374,7 @@ def add_property(self, name: str, With the former method, the SDK will record the added property as a `Annotation` annotation object, separate from the original `Document` - object. See :meth:`.Mmif.generate_capital_annotations()` for more. + object. See :meth:`mmif.serialize.mmif.Mmif.generate_capital_annotations` for more. A few notes to keep in mind: @@ -442,7 +442,7 @@ def get(self, prop_name, default=None): See Also -------- add_property : Add a new property to the document - Mmif.generate_capital_annotations : How pending properties are serialized + mmif.serialize.mmif.Mmif.generate_capital_annotations : How pending properties are serialized """ if prop_name == 'id': # because all three dicts have `id` key as required field, we need From 5eba9f6fdeaf108e86af52c6c47efcc91081c499 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 10 Feb 2026 11:50:14 -0500 Subject: [PATCH 30/48] removing advertiseing `make docs` in dev doc --- CONTRIBUTING.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 93baf8f7..49d47d49 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -64,8 +64,6 @@ The documentation for `mmif-python` is built using Sphinx and published to the [ To build the documentation for the current checkout: ```bash -make doc -# OR python3 build-tools/docs.py ``` From 5d36df646298edc5ca7312ab7fb5ca6fb563e25a Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 10 Feb 
2026 19:57:28 -0500 Subject: [PATCH 31/48] fixing issues in pytest config and `make test` cmd --- Makefile | 6 +++--- pytest.ini | 3 +++ tests/test_utils_cli.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 pytest.ini diff --git a/Makefile b/Makefile index 0f3d3386..bac1919e 100644 --- a/Makefile +++ b/Makefile @@ -80,15 +80,15 @@ version: VERSION; cat VERSION # since the GH api will return tags in chronological order, we can just grab the last one without sorting AUTH_ARG := $(if $(GITHUB_TOKEN),-H "Authorization: token $(GITHUB_TOKEN)") -VERSION.dev: devver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif-python/git/refs/tags" | grep '"ref":' | sed -E 's/.+refs\/tags\/([0-9.]+)",/\1/g' | tail -n 1) -VERSION.dev: specver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif/git/refs/tags" | grep '"ref":' | grep -v 'py-' | sed -E 's/.+refs\/tags\/(spec-)?([0-9.]+)",/\2/g' | tail -n 1) +VERSION.dev: devver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif-python/git/refs/tags" | grep '"ref":' | sed -E 's/.+refs\/tags\/([0-9.]+)",/\1/g' | sort -V | tail -n 1) +VERSION.dev: specver := $(shell curl --silent $(AUTH_ARG) "https://api.github.com/repos/clamsproject/mmif/git/refs/tags" | grep '"ref":' | grep -v 'py-' | sed -E 's/.+refs\/tags\/(spec-)?([0-9.]+)",/\2/g' | sort -V | tail -n 1) VERSION.dev: @echo DEVVER: $(devver) @echo SPECVER: $(specver) @if [ $(call macro,$(devver)) = $(call macro,$(specver)) ] && [ $(call micro,$(devver)) = $(call micro,$(specver)) ] ; \ then \ if [[ $(devver) == *.dev* ]]; then echo $(call increase_dev,$(devver)) ; else echo $(call add_dev,$(call increase_patch, $(devver))); fi \ - else echo $(call add_dev,$(specver)) ; fi \ + else if [[ $(devver) == *.dev* ]]; then echo $(call increase_dev,$(devver)) ; else echo $(call add_dev,$(call increase_patch, $(devver))); fi ; fi \ > VERSION.dev VERSION: version := 
$(shell git tag | sort -t. -k 1,1nr -k 2,2nr -k 3,3nr -k 4,4nr | head -n 1) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..07055628 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = mmif tests +python_files = test_*.py *_test.py diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index fa0f8906..d51014de 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -16,7 +16,7 @@ class TestCli(unittest.TestCase): def setUp(self) -> None: - self.parser, _ = mmif.prep_argparser_and_subcmds() + self.parser, _, _ = mmif.prep_argparser_and_subcmds() def test_primary_cli(self): stdout = io.StringIO() From 5342210c08c0cc258ee91a34c64213a06ec2058d Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 11 Feb 2026 17:41:34 -0500 Subject: [PATCH 32/48] updated `summarize` flag for consistency, remove redundant docstrings (see CONTRIBUTING.md) --- mmif/utils/cli/__init__.py | 1 + mmif/utils/cli/describe.py | 5 --- mmif/utils/cli/rewind.py | 4 --- mmif/utils/cli/source.py | 4 --- mmif/utils/cli/summarize.py | 62 +++++++++++++++++++++++--------- mmif/utils/summarizer/summary.py | 7 ++-- 6 files changed, 52 insertions(+), 31 deletions(-) diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 4993ea8e..8a59e210 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -5,4 +5,5 @@ from mmif.utils.cli import describe from mmif.utils.cli import rewind from mmif.utils.cli import source +from mmif.utils.cli import summarize diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index aaa8245d..8e8c5171 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -26,11 +26,6 @@ def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: def describe_argparser(): - """ - Returns two strings: one-line description of the argparser, and - additional material, which will be shown in `clams --help` and - `clams --help`, respectively. 
- """ oneliner = ( 'Describe the workflow specification from a MMIF file or a ' 'collection of MMIF files.' diff --git a/mmif/utils/cli/rewind.py b/mmif/utils/cli/rewind.py index 1e038180..9be494de 100644 --- a/mmif/utils/cli/rewind.py +++ b/mmif/utils/cli/rewind.py @@ -55,10 +55,6 @@ def rewind_mmif(mmif_obj: mmif.Mmif, choice: int, choice_is_viewnum: bool = True def describe_argparser(): - """ - returns two strings: one-line description of the argparser, and addition material, - which will be shown in `clams --help` and `clams --help`, respectively. - """ oneliner = 'provides CLI to rewind a MMIF from a CLAMS workflow.' additional = textwrap.dedent(""" MMIF rewinder rewinds a MMIF by deleting the last N views. diff --git a/mmif/utils/cli/source.py b/mmif/utils/cli/source.py index 3abd2e1f..67e24015 100644 --- a/mmif/utils/cli/source.py +++ b/mmif/utils/cli/source.py @@ -214,10 +214,6 @@ def generate_source_mmif_from_file(documents, prefix=None, scheme='file', **igno def describe_argparser(): - """ - returns two strings: one-line description of the argparser, and addition material, - which will be shown in `clams --help` and `clams --help`, respectively. - """ oneliner = 'provides CLI to create a "source" MMIF json.' additional = textwrap.dedent(""" A source MMIF is a MMIF with a list of source documents but empty views. diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index 29b94a60..c89b98fa 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -1,31 +1,38 @@ -import sys import argparse +import json +import pathlib +import sys +import tempfile from mmif.utils.summarizer.summary import Summary - def describe_argparser() -> tuple: """ - Returns two strings: a one-line description of the argparser and additional - material, which will be shown for `mmif --help` and `mmif summarize --help`, - respectively. For now they return the same string. The retun value should - still be a tuple because mmif.cli() depends on it. 
+ returns two strings: one-line description of the argparser, and addition material, + which will be shown in `clams --help` and `clams --help`, respectively. """ - oneliner = 'Create a JSON Summary for a MMIF file' - return oneliner, oneliner + oneliner = 'Create a JSON Summary for a MMIF file.' + additional = 'The output is serialized as JSON and includes various statistics and summaries of the MMIF content.' + return oneliner, oneliner + '\n\n' + additional def prep_argparser(**kwargs): """ Create the ArgumentParser instance for the summarizer. """ - parser = argparse.ArgumentParser( - description=describe_argparser()[1], - formatter_class=argparse.RawDescriptionHelpFormatter, - **kwargs) - parser.add_argument("-i", metavar='MMIF_FILE', help='input MMIF file', required=True) - parser.add_argument("-o", metavar='OUTPUT_FILE', help='output JSON summary file', required=True) + parser = argparse.ArgumentParser(description=describe_argparser()[1], + formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) + parser.add_argument("MMIF_FILE", + nargs="?", type=argparse.FileType("r"), + default=None if sys.stdin.isatty() else sys.stdin, + help='input MMIF file path, or STDIN if `-` or not provided.') + parser.add_argument("-o", "--output", + type=argparse.FileType("w"), + default=sys.stdout, + help='output file path, or STDOUT if not provided.') + parser.add_argument("-p", "--pretty", action="store_true", + help="Pretty-print JSON output") return parser @@ -33,5 +40,28 @@ def main(args: argparse.Namespace): """ The main summarizer command. 
""" - mmif_summary = Summary(args.i) - mmif_summary.report(outfile=args.o) + if args.MMIF_FILE is None: + raise ValueError("No input MMIF provided.") + + mmif_content = args.MMIF_FILE.read() + + tmp_path = None + try: + with tempfile.NamedTemporaryFile( + mode='w', suffix='.mmif', delete=False + ) as tmp: + tmp.write(mmif_content) + tmp_path = pathlib.Path(tmp.name) + mmif_summary = Summary(tmp_path) + output = mmif_summary.to_dict() + finally: + if tmp_path and tmp_path.exists(): + tmp_path.unlink() + + json.dump(output, args.output, indent=2 if args.pretty else None) + + +if __name__ == "__main__": + parser = prep_argparser() + args = parser.parse_args() + main(args) diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py index 7dd19adf..b6ea4984 100644 --- a/mmif/utils/summarizer/summary.py +++ b/mmif/utils/summarizer/summary.py @@ -83,8 +83,8 @@ def validate(self): def video_documents(self): return self.mmif.get_documents_by_type(DocumentTypes.VideoDocument) - def report(self, outfile=None): - json_obj = { + def to_dict(self): + return { 'mmif_version': self.mmif.metadata.mmif, 'document': self.document.data, 'documents': self.documents.data, @@ -96,6 +96,9 @@ def report(self, outfile=None): 'timeframe_stats': self.timeframe_stats.data, 'entities': self.entities.as_json() } + + def report(self, outfile=None): + json_obj = self.to_dict() report = json.dumps(json_obj, indent=2) if outfile is None: return report From c166f700952d1424a295dfed7bf30a18f5fc9602 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 11 Feb 2026 15:07:39 -0500 Subject: [PATCH 33/48] added tests for summarize command --- mmif/utils/cli/summarize.py | 8 +-- mmif/utils/summarizer/summary.py | 3 + tests/test_utils_cli.py | 102 +++++++++++++++++++++++++------ 3 files changed, 89 insertions(+), 24 deletions(-) diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index c89b98fa..8c91bb5f 100644 --- a/mmif/utils/cli/summarize.py +++ 
b/mmif/utils/cli/summarize.py @@ -8,10 +8,6 @@ def describe_argparser() -> tuple: - """ - returns two strings: one-line description of the argparser, and addition material, - which will be shown in `clams --help` and `clams --help`, respectively. - """ oneliner = 'Create a JSON Summary for a MMIF file.' additional = 'The output is serialized as JSON and includes various statistics and summaries of the MMIF content.' return oneliner, oneliner + '\n\n' + additional @@ -41,8 +37,8 @@ def main(args: argparse.Namespace): The main summarizer command. """ if args.MMIF_FILE is None: - raise ValueError("No input MMIF provided.") - + print("error: No input MMIF provided.", file=sys.stderr) + sys.exit(2) mmif_content = args.MMIF_FILE.read() tmp_path = None diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py index b6ea4984..fd6d548a 100644 --- a/mmif/utils/summarizer/summary.py +++ b/mmif/utils/summarizer/summary.py @@ -102,6 +102,9 @@ def report(self, outfile=None): report = json.dumps(json_obj, indent=2) if outfile is None: return report + # Support both file-like objects and path-like values for outfile. 
+ if hasattr(outfile, "write"): + outfile.write(report) else: with open(outfile, 'w') as fh: fh.write(report) diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index d51014de..01fedb44 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -9,11 +9,15 @@ from mmif.utils.cli import rewind from mmif.utils.cli import source from mmif.utils.cli import describe +from mmif.utils.cli import summarize from mmif.serialize import Mmif from mmif.vocabulary import DocumentTypes, AnnotationTypes +BASIC_MMIF_STRING = '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' + + class TestCli(unittest.TestCase): def setUp(self) -> None: self.parser, _, _ = mmif.prep_argparser_and_subcmds() @@ -121,23 +125,11 @@ def test_generate_mixed_scheme(self): class TestRewind(unittest.TestCase): def setUp(self): # mmif we add views to - self.mmif_one = Mmif( - { - "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, - "documents": [], - "views": [], - } - ) + self.mmif_one = Mmif(BASIC_MMIF_STRING) # baseline empty mmif for comparison - self.empty_mmif = Mmif( - { - "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, - "documents": [], - "views": [], - } - ) - + self.empty_mmif = Mmif(BASIC_MMIF_STRING) + @staticmethod def add_dummy_view(mmif: Mmif, appname: str, timestamp: str = None): v = mmif.new_view() @@ -192,9 +184,7 @@ def setUp(self): """Create test MMIF structures.""" self.parser = describe.prep_argparser() self.maxDiff = None - self.basic_mmif = Mmif( - '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' - ) + self.basic_mmif = Mmif(BASIC_MMIF_STRING) def create_temp_mmif_file(self, mmif_obj): """Helper 
to create a temporary MMIF file.""" @@ -302,5 +292,81 @@ def test_describe_collection_empty(self): os.rmdir(dummy_dir) +class TestSummarize(unittest.TestCase): + """Test suite for the summarize CLI module.""" + + def setUp(self): + """Create test MMIF structures.""" + self.parser = summarize.prep_argparser() + self.basic_mmif = Mmif(BASIC_MMIF_STRING) + + def create_temp_mmif_file(self, mmif_obj): + """Helper to create a temporary MMIF file.""" + tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) + tmp.write(mmif_obj.serialize(pretty=False)) + tmp.close() + return tmp.name + + def test_summarize_positional_input(self): + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + stdout = io.StringIO() + try: + args = self.parser.parse_args([tmp_file]) + args.output = stdout + summarize.main(args) + output = json.loads(stdout.getvalue()) + self.assertIn('mmif_version', output) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") + finally: + os.unlink(tmp_file) + + def test_summarize_output_file(self): + tmp_input = self.create_temp_mmif_file(self.basic_mmif) + tmp_output = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) + tmp_output.close() + try: + args = self.parser.parse_args([tmp_input, "-o", tmp_output.name]) + summarize.main(args) + args.output.close() + with open(tmp_output.name, 'r') as f: + output = json.load(f) + self.assertIn('mmif_version', output) + finally: + os.unlink(tmp_input) + os.unlink(tmp_output.name) + + def test_summarize_pretty_print(self): + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + stdout_pretty = io.StringIO() + stdout_compact = io.StringIO() + try: + # Pretty + args_pretty = self.parser.parse_args([tmp_file, "--pretty"]) + args_pretty.output = stdout_pretty + summarize.main(args_pretty) + + # Compact + args_compact = self.parser.parse_args([tmp_file]) + args_compact.output = stdout_compact + summarize.main(args_compact) + + 
self.assertNotEqual(stdout_pretty.getvalue(), stdout_compact.getvalue()) + self.assertIn('\n ', stdout_pretty.getvalue()) # Check for indentation + finally: + os.unlink(tmp_file) + + def test_summarize_stdin(self): + mmif_str = self.basic_mmif.serialize() + import argparse + stdout = io.StringIO() + stdin = io.StringIO(mmif_str) + + args = argparse.Namespace(MMIF_FILE=stdin, output=stdout, pretty=False) + summarize.main(args) + + output = json.loads(stdout.getvalue()) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") + + if __name__ == '__main__': unittest.main() From 4df2d3f1e6df30583655f7035d1747cf89546812 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 11 Feb 2026 16:13:00 -0500 Subject: [PATCH 34/48] replaced deprecated `argparser.FileType` with native implementation --- CONTRIBUTING.md | 12 ++++++ mmif/utils/cli/__init__.py | 79 +++++++++++++++++++++++++++++++++++++ mmif/utils/cli/describe.py | 22 +++++------ mmif/utils/cli/rewind.py | 13 +++--- mmif/utils/cli/source.py | 7 ++-- mmif/utils/cli/summarize.py | 18 ++++----- tests/mmif_examples.py | 2 +- tests/test_utils_cli.py | 2 +- 8 files changed, 122 insertions(+), 33 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 49d47d49..73cc01fd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,6 +33,18 @@ To add a new CLI subcommand, create a Python module in `mmif/utils/cli/` with th 3. **`main(args)`** - Execute the subcommand logic with the parsed arguments. +### Standard I/O Argument Pattern + +To ensure a consistent user experience and avoid resource leaks, all CLI subcommands should adhere to the following I/O argument patterns using the `mmif.utils.cli.open_cli_io_arg` context manager (which replaces the deprecated `argparse.FileType`): + +1. **Input**: Use a positional argument (usually named `MMIF_FILE`) that supports both file paths and STDIN. + - In `prep_argparser`, use `nargs='?'`, `type=str`, and `default=None`. 
+ - In `main`, use `with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file:`. +2. **Output**: Use the `-o`/`--output` flag for the output destination. + - In `prep_argparser`, use `type=str` and `default=None`. + - In `main`, use `with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file:`. +3. **Formatting**: Use the `-p`/`--pretty` flag as a boolean switch (`action='store_true'`) to toggle between compact and pretty-printed JSON/MMIF output. + [!NOTE] > CLI modules should typically act as thin wrappers. It is recommended to implement the core utility logic in other packages (e.g., `mmif.utils`) and import it into the CLI module. See existing modules like `summarize.py` (which imports from `mmif.utils.summarizer`) or `describe.py` for examples. diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 8a59e210..fdb7c457 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -2,6 +2,85 @@ Package containing CLI modules. """ +import sys +import os +from contextlib import contextmanager +from typing import IO, ContextManager, Optional, Union + + +def open_cli_io_arg( + path_or_dash: Optional[Union[str, IO]], + mode: str = 'r', + encoding: Optional[str] = None, + errors: Optional[str] = None, + default_stdin: bool = False, +) -> ContextManager[IO]: + """ + Context manager for opening files with stdin/stdout support. + + This is a native replacement for argparse.FileType which is deprecated as of Python 3.14 + due to resource leak issues. Unlike FileType, this defers file opening until actually needed + and ensures proper cleanup via context manager. 
+ + Handles the common CLI pattern where: + - '-' means stdin (read mode) or stdout (write mode) + - None means "argument not provided"; when default_stdin=True, it falls back to stdin/stdout + - Regular paths open actual files with proper resource management + + :param path_or_dash: File path, '-' for stdin/stdout, None for no argument, or a file-like object + :param mode: File mode ('r' for reading, 'w' for writing) + :param encoding: Optional file encoding + :param errors: Optional error handling strategy for encoding + :param default_stdin: If True and path_or_dash is None, default to stdin (mode 'r') or stdout (mode 'w') + :return: Context manager yielding file handle + """ + + def _requires_read(requested_mode: str) -> bool: + return 'r' in requested_mode or '+' in requested_mode + + def _requires_write(requested_mode: str) -> bool: + return any(flag in requested_mode for flag in ('w', 'a', 'x', '+')) + + @contextmanager + def _open(): + # Determine if we should use stdin/stdout + use_std = path_or_dash == '-' or (path_or_dash is None and default_stdin) + needs_read = _requires_read(mode) + needs_write = _requires_write(mode) + + if path_or_dash is None and default_stdin and needs_read and sys.stdin.isatty(): + raise SystemExit("error: No input MMIF provided.") + + if use_std: + if needs_read and needs_write: + raise ValueError(f"Mode '{mode}' not supported with stdin/stdout (use read or write only)") + if needs_read: + yield sys.stdin + elif needs_write: + yield sys.stdout + else: + raise ValueError(f"Mode '{mode}' not supported with stdin/stdout (use 'r' or 'w')") + elif hasattr(path_or_dash, 'read') or hasattr(path_or_dash, 'write'): + if needs_read and not hasattr(path_or_dash, 'read'): + raise ValueError(f"Mode '{mode}' requires a readable file-like object") + if needs_write and not hasattr(path_or_dash, 'write'): + raise ValueError(f"Mode '{mode}' requires a writable file-like object") + yield path_or_dash + else: + # Open actual file with proper 
cleanup + if path_or_dash is None: + raise ValueError("File path cannot be None when not using stdin/stdout") + if needs_read and not os.path.exists(path_or_dash): + raise FileNotFoundError(f"Input path does not exist: {path_or_dash}") + f = open(path_or_dash, mode, encoding=encoding, errors=errors) + try: + yield f + finally: + f.close() + + return _open() + + from mmif.utils.cli import describe from mmif.utils.cli import rewind from mmif.utils.cli import source diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index 8e8c5171..1021e52d 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -1,10 +1,12 @@ import argparse import json +import os import sys import textwrap from pathlib import Path from typing import Union, cast +from mmif.utils.cli import open_cli_io_arg from mmif.utils.workflow_helper import generate_workflow_identifier, describe_single_mmif, \ describe_mmif_collection # gen_param_hash is imported for backward compatibility @@ -74,8 +76,7 @@ def prep_argparser(**kwargs): ) parser.add_argument( "-o", "--output", - type=argparse.FileType("w"), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.' 
) parser.add_argument( @@ -100,16 +101,13 @@ def main(args): """ output = {} # if input is a directory - if isinstance(args.MMIF_FILE, str) and Path(args.MMIF_FILE).is_dir(): + if isinstance(args.MMIF_FILE, (str, os.PathLike)) and Path(args.MMIF_FILE).is_dir(): output = describe_mmif_collection(args.MMIF_FILE) # if input is a file or stdin else: # Read MMIF content - if hasattr(args.MMIF_FILE, 'read'): - mmif_content = args.MMIF_FILE.read() - else: - with open(args.MMIF_FILE, 'r') as f: - mmif_content = f.read() + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_content = input_file.read() # For file input, we need to handle the path # If input is from stdin, create a temp file @@ -127,11 +125,9 @@ def main(args): tmp_path.unlink() if output: - if args.pretty: - json.dump(output, args.output, indent=2) - else: - json.dump(output, args.output) - args.output.write('\n') + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + json.dump(output, output_file, indent=2 if args.pretty else None) + output_file.write('\n') if __name__ == "__main__": diff --git a/mmif/utils/cli/rewind.py b/mmif/utils/cli/rewind.py index 9be494de..8dccc63f 100644 --- a/mmif/utils/cli/rewind.py +++ b/mmif/utils/cli/rewind.py @@ -3,6 +3,7 @@ import textwrap import mmif +from mmif.utils.cli import open_cli_io_arg from mmif.utils.workflow_helper import group_views_by_app @@ -66,12 +67,10 @@ def prep_argparser(**kwargs): parser = argparse.ArgumentParser(description=describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) parser.add_argument("MMIF_FILE", - nargs="?", type=argparse.FileType("r"), - default=None if sys.stdin.isatty() else sys.stdin, + nargs="?", type=str, default=None, help='input MMIF file path, or STDIN if `-` or not provided.') parser.add_argument("-o", "--output", - type=argparse.FileType("w"), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not 
provided.') parser.add_argument("-p", '--pretty', action='store_true', help="Pretty-print rewound MMIF") @@ -84,7 +83,8 @@ def prep_argparser(**kwargs): def main(args): - mmif_obj = mmif.Mmif(args.MMIF_FILE.read()) + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_obj = mmif.Mmif(input_file.read()) if args.number == 0: # If user doesn't know how many views to rewind, give them choices. choice = prompt_user(mmif_obj) @@ -93,7 +93,8 @@ def main(args): if not isinstance(choice, int) or choice <= 0: raise ValueError(f"Only can rewind by a positive number of views. Got {choice}.") - args.output.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + output_file.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) if __name__ == "__main__": diff --git a/mmif/utils/cli/source.py b/mmif/utils/cli/source.py index 67e24015..6c3b4b86 100644 --- a/mmif/utils/cli/source.py +++ b/mmif/utils/cli/source.py @@ -9,6 +9,7 @@ from mmif import Mmif, Document, DocumentTypes, __specver__ from mmif.serialize.mmif import MmifMetadata +from mmif.utils.cli import open_cli_io_arg __all__ = ['WorkflowSource'] @@ -254,8 +255,7 @@ def prep_argparser(**kwargs): ) parser.add_argument( '-o', '--output', - type=argparse.FileType('w'), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.' ) scheme_help = 'A scheme to associate with the document location URI. When not given, the default scheme is `file://`.' 
@@ -275,7 +275,8 @@ def prep_argparser(**kwargs): def main(args): mmif = generate_source_mmif_from_file(windows_path=False, **vars(args)) - args.output.write(mmif) + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + output_file.write(mmif) return mmif if __name__ == '__main__': diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index 8c91bb5f..d604d2fa 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -4,6 +4,7 @@ import sys import tempfile +from mmif.utils.cli import open_cli_io_arg from mmif.utils.summarizer.summary import Summary @@ -20,12 +21,10 @@ def prep_argparser(**kwargs): parser = argparse.ArgumentParser(description=describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) parser.add_argument("MMIF_FILE", - nargs="?", type=argparse.FileType("r"), - default=None if sys.stdin.isatty() else sys.stdin, + nargs="?", type=str, default=None, help='input MMIF file path, or STDIN if `-` or not provided.') parser.add_argument("-o", "--output", - type=argparse.FileType("w"), - default=sys.stdout, + type=str, default=None, help='output file path, or STDOUT if not provided.') parser.add_argument("-p", "--pretty", action="store_true", help="Pretty-print JSON output") @@ -36,10 +35,10 @@ def main(args: argparse.Namespace): """ The main summarizer command. 
""" - if args.MMIF_FILE is None: - print("error: No input MMIF provided.", file=sys.stderr) - sys.exit(2) - mmif_content = args.MMIF_FILE.read() + # Check if stdin is available when no file is provided + + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_content = input_file.read() tmp_path = None try: @@ -54,7 +53,8 @@ def main(args: argparse.Namespace): if tmp_path and tmp_path.exists(): tmp_path.unlink() - json.dump(output, args.output, indent=2 if args.pretty else None) + with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: + json.dump(output, output_file, indent=2 if args.pretty else None) if __name__ == "__main__": diff --git a/tests/mmif_examples.py b/tests/mmif_examples.py index b19f9d9a..8a8f4c6f 100644 --- a/tests/mmif_examples.py +++ b/tests/mmif_examples.py @@ -55,7 +55,7 @@ def _load_from_url_or_git(url): old_mmif_w_short_id_url = f"https://raw.githubusercontent.com/clamsproject/mmif/1.0.5/specifications/samples/everything/raw.json" EVERYTHING_JSON = _load_from_url_or_git(everything_file_url) OLD_SHORTID_JSON = _load_from_url_or_git(old_mmif_w_short_id_url) -SWT_1_0_JSON = open('tests/samples/1.0/swt.mmif').read() +SWT_1_0_JSON = (Path(__file__).resolve().parent / 'samples' / '1.0' / 'swt.mmif').read_text() # for keys and values in chain all typevers in mmif.vocabulary.*_types modules # merge into a single dict diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index 01fedb44..a54cbcf5 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -327,7 +327,7 @@ def test_summarize_output_file(self): try: args = self.parser.parse_args([tmp_input, "-o", tmp_output.name]) summarize.main(args) - args.output.close() + # args.output is a path string now; no file handle to close. 
with open(tmp_output.name, 'r') as f: output = json.load(f) self.assertIn('mmif_version', output) From d85526f2b92beb1b8b726c56e20249aac2ea19e6 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 11 Feb 2026 16:33:58 -0500 Subject: [PATCH 35/48] `summarize`'s log msgs are now using standard logging lib --- mmif/utils/cli/summarize.py | 1 - mmif/utils/summarizer/summary.py | 39 +++++++++++++------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index d604d2fa..447d1b36 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -1,7 +1,6 @@ import argparse import json import pathlib -import sys import tempfile from mmif.utils.cli import open_cli_io_arg diff --git a/mmif/utils/summarizer/summary.py b/mmif/utils/summarizer/summary.py index fd6d548a..f4fcc1c1 100644 --- a/mmif/utils/summarizer/summary.py +++ b/mmif/utils/summarizer/summary.py @@ -8,28 +8,22 @@ # - For the time unit we should really update get_start(), get_end() and other methods. 
-import os, sys, io, json, argparse, pathlib +import json +import logging +import os +import pathlib from collections import defaultdict from mmif.serialize import Mmif -from mmif.vocabulary import DocumentTypes - from mmif.utils.summarizer import config -from mmif.utils.summarizer.utils import CharacterList -from mmif.utils.summarizer.utils import get_aligned_tokens, timestamp -from mmif.utils.summarizer.utils import get_transcript_view, get_last_segmenter_view, get_captions_view from mmif.utils.summarizer.graph import Graph +from mmif.utils.summarizer.utils import CharacterList +from mmif.utils.summarizer.utils import get_transcript_view, get_captions_view +from mmif.utils.summarizer.utils import timestamp +from mmif.vocabulary import DocumentTypes - -VERSION = '0.2.0' - - -DEBUG = False - -def debug(*texts): - if DEBUG: - for text in texts: - sys.stderr.write(f'{text}\n') +VERSION = '0.2.0' # why there's a version inside a subpackage??? +logger = logging.getLogger(__name__) class SummaryException(Exception): @@ -56,7 +50,7 @@ def __init__(self, mmif_file): self.fname = mmif_file #self.mmif = mmif if type(mmif) is Mmif else Mmif(mmif) self.mmif = Mmif(pathlib.Path(mmif_file).read_text()) - self.warnings = [] + self.warnings: list[str] = [] self.graph = Graph(self.mmif) self.mmif_version = self.mmif.metadata['mmif'] self.documents = Documents(self) @@ -111,7 +105,7 @@ def report(self, outfile=None): def print_warnings(self): for warning in self.warnings: - print(f'WARNING: {warning}') + logger.warning(warning) def pp(self): self.documents.pp() @@ -560,11 +554,10 @@ def __init__(self, summary): if view is not None: for doc in self.graph.get_nodes(config.TEXT_DOCUMENT, view_id=view.id): text = doc.properties['text']['@value'].split('[/INST]')[-1] - debug( - f'>>> DOC {doc}', - f'>>> PROPS {list(doc.properties.keys())}', - f'>>> TEXT ' + text.replace("\n", "")[:100], - f'>>> ANCHORS {doc.anchors}') + logger.debug('>>> DOC %s', doc) + logger.debug('>>> PROPS %s', 
list(doc.properties.keys())) + logger.debug('>>> TEXT %s', text.replace("\n", "")[:100]) + logger.debug('>>> ANCHORS %s', doc.anchors) if 'time-offsets' in doc.anchors and 'representatives' in doc.anchors: # For older LLava-style captions # http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97 From ad5d0d3e8ee9f5c49fb1ba50d0e2dbadd9a21fdf Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 11 Feb 2026 17:56:53 -0500 Subject: [PATCH 36/48] fixed type hints in native CLI-IO hanlder --- mmif/utils/cli/__init__.py | 77 +++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index fdb7c457..62fce501 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -2,37 +2,45 @@ Package containing CLI modules. """ -import sys import os +import sys from contextlib import contextmanager -from typing import IO, ContextManager, Optional, Union +from typing import ContextManager, Optional, Union, TextIO -def open_cli_io_arg( - path_or_dash: Optional[Union[str, IO]], - mode: str = 'r', - encoding: Optional[str] = None, - errors: Optional[str] = None, - default_stdin: bool = False, -) -> ContextManager[IO]: +def open_cli_io_arg(path_or_dash: Optional[Union[str, TextIO]], + mode: str = 'r', + encoding: Optional[str] = None, + errors: Optional[str] = None, + default_stdin: bool = False, + ) -> ContextManager[TextIO]: """ Context manager for opening files with stdin/stdout support. - This is a native replacement for argparse.FileType which is deprecated as of Python 3.14 - due to resource leak issues. Unlike FileType, this defers file opening until actually needed - and ensures proper cleanup via context manager. + This function is intended for plain text streams (e.g. JSON/MMIF) and does + not support binary modes (e.g., 'rb', 'wb'). + + This is a native replacement for argparse.FileType which is deprecated as + of Python 3.14 due to resource leak issues. 
Unlike FileType, this defers + file opening until actually needed and ensures proper cleanup via context + manager. Handles the common CLI pattern where: - '-' means stdin (read mode) or stdout (write mode) - - None means "argument not provided"; when default_stdin=True, it falls back to stdin/stdout + - None means "argument not provided"; when default_stdin=True, it falls back + to stdin/stdout - Regular paths open actual files with proper resource management - :param path_or_dash: File path, '-' for stdin/stdout, None for no argument, or a file-like object - :param mode: File mode ('r' for reading, 'w' for writing) + :param path_or_dash: File path, '-' for stdin/stdout, None for no argument, + or a file-like object + :param mode: File mode ('r' for reading, 'w' for writing). Binary modes are + not supported. :param encoding: Optional file encoding :param errors: Optional error handling strategy for encoding - :param default_stdin: If True and path_or_dash is None, default to stdin (mode 'r') or stdout (mode 'w') - :return: Context manager yielding file handle + :param default_stdin: If True and path_or_dash is None, default to stdin + (mode 'r') or stdout (mode 'w') + :returns: Context manager yielding text-mode file handle + :rtype: ContextManager[TextIO] """ def _requires_read(requested_mode: str) -> bool: @@ -48,30 +56,49 @@ def _open(): needs_read = _requires_read(mode) needs_write = _requires_write(mode) - if path_or_dash is None and default_stdin and needs_read and sys.stdin.isatty(): + if ( + path_or_dash is None + and default_stdin + and needs_read + and sys.stdin.isatty() + ): raise SystemExit("error: No input MMIF provided.") if use_std: if needs_read and needs_write: - raise ValueError(f"Mode '{mode}' not supported with stdin/stdout (use read or write only)") + raise ValueError( + f"Mode '{mode}' not supported with stdin/stdout " + "(use read or write only)" + ) if needs_read: yield sys.stdin elif needs_write: yield sys.stdout else: - raise 
ValueError(f"Mode '{mode}' not supported with stdin/stdout (use 'r' or 'w')") + raise ValueError( + f"Mode '{mode}' not supported with stdin/stdout " + "(use 'r' or 'w')" + ) elif hasattr(path_or_dash, 'read') or hasattr(path_or_dash, 'write'): if needs_read and not hasattr(path_or_dash, 'read'): - raise ValueError(f"Mode '{mode}' requires a readable file-like object") + raise ValueError( + f"Mode '{mode}' requires a readable file-like object" + ) if needs_write and not hasattr(path_or_dash, 'write'): - raise ValueError(f"Mode '{mode}' requires a writable file-like object") + raise ValueError( + f"Mode '{mode}' requires a writable file-like object" + ) yield path_or_dash else: # Open actual file with proper cleanup if path_or_dash is None: - raise ValueError("File path cannot be None when not using stdin/stdout") + raise ValueError( + "File path cannot be None when not using stdin/stdout" + ) if needs_read and not os.path.exists(path_or_dash): - raise FileNotFoundError(f"Input path does not exist: {path_or_dash}") + raise FileNotFoundError( + f"Input path does not exist: {path_or_dash}" + ) f = open(path_or_dash, mode, encoding=encoding, errors=errors) try: yield f @@ -81,8 +108,8 @@ def _open(): return _open() +# keep CLI modules here to avoid circular imports from mmif.utils.cli import describe from mmif.utils.cli import rewind from mmif.utils.cli import source from mmif.utils.cli import summarize - From 3d214d9143a7b6deb8669c4390196175a62edcc2 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 11 Feb 2026 18:02:14 -0500 Subject: [PATCH 37/48] adding some minor fixes to main page of docs --- documentation/conf.py | 4 ++-- documentation/index.rst | 16 +++++++++------- documentation/introduction.rst | 2 +- documentation/modules.rst | 10 ---------- 4 files changed, 12 insertions(+), 20 deletions(-) delete mode 100644 documentation/modules.rst diff --git a/documentation/conf.py b/documentation/conf.py index 121054ef..f6a2a221 100644 --- a/documentation/conf.py +++ 
b/documentation/conf.py @@ -226,7 +226,7 @@ def generate_whatsnew_rst(app): if not found_version: logger.info(f"No changelog entry found for version {version}") with open(output_path, 'w') as f: - f.write("") + f.write(f"### nothing new in {version}\nDid you locally build for testing?") else: # Dump matched markdown content directly to whatsnew.md with open(output_path, 'w') as f: @@ -261,7 +261,7 @@ def run_apidoc(app): *exclude_paths, '--force', # Overwrite existing files '--module-first', # Put module docs before submodule docs - '--no-toc', # Don't create modules.rst (we maintain our own) + '--no-toc', # Don't create modules.rst, will be overwriting each other's ] logger.info(f"Running sphinx-apidoc with args: {args}") apidoc_main(args) diff --git a/documentation/index.rst b/documentation/index.rst index 218271ba..05d93a48 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -1,17 +1,17 @@ -mmif-python -=========== +MMIF Python SDK +=============== This is the documentation for the mmif-python package, a Python implementation for the MultiMedia Interchange Format (MMIF). MMIF is a JSON(-LD)-based data format designed for transferring annotation data between computational analysis applications of the CLAMS project. For descriptions of the CLAMS project and the MMIF format see https://clams.ai and https://mmif.clams.ai. The GitHub repository for the package is at https://github.com/clamsproject/mmif-python. -.. mdinclude:: whatsnew.md +---- +.. mdinclude:: whatsnew.md -Contents --------- +---- .. toctree:: :maxdepth: 2 - :caption: General documentation + :caption: Contents introduction cli @@ -21,8 +21,10 @@ Contents .. 
toctree:: :maxdepth: 2 + :caption: API Documentation - modules + autodoc/mmif + autodoc/mmif_docloc_http Indices diff --git a/documentation/introduction.rst b/documentation/introduction.rst index f2ed9f90..c9f63e98 100644 --- a/documentation/introduction.rst +++ b/documentation/introduction.rst @@ -88,7 +88,7 @@ To get subcomponents, you can use various getters implemented in subclasses. For # do something with the video file -For a full list of available helper methods, please refer to :ref:`the API documentation `. +For a full list of available helper methods, please refer to the API documentation pages (See left sidebar). MMIF usage in CLAMS Workflows diff --git a/documentation/modules.rst b/documentation/modules.rst deleted file mode 100644 index 882db997..00000000 --- a/documentation/modules.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. _apidoc: - -API Documentation -================= - -.. toctree:: - :maxdepth: 4 - - autodoc/mmif - autodoc/mmif_docloc_http From a9893de30eeae55d86aa2fa02716cd341b7f581b Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Thu, 12 Feb 2026 16:34:13 -0500 Subject: [PATCH 38/48] reflecting review comments --- mmif/utils/cli/__init__.py | 32 ++++++++++++++++++----------- mmif/utils/cli/summarize.py | 35 +++++++++++++++++--------------- mmif/utils/summarizer/summary.py | 1 - tests/test_utils_cli.py | 7 +++++-- 4 files changed, 44 insertions(+), 31 deletions(-) diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 62fce501..0ba97f4a 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -5,7 +5,7 @@ import os import sys from contextlib import contextmanager -from typing import ContextManager, Optional, Union, TextIO +from typing import ContextManager, Generator, Optional, Union, TextIO, cast def open_cli_io_arg(path_or_dash: Optional[Union[str, TextIO]], @@ -50,7 +50,14 @@ def _requires_write(requested_mode: str) -> bool: return any(flag in requested_mode for flag in ('w', 'a', 'x', '+')) @contextmanager 
- def _open(): + def _open() -> Generator[TextIO, None, None]: + # Validate that binary modes are not used + if 'b' in mode: + raise ValueError( + f"Binary mode '{mode}' is not supported. " + "Use text modes ('r', 'w', 'a', 'x') instead." + ) + # Determine if we should use stdin/stdout use_std = path_or_dash == '-' or (path_or_dash is None and default_stdin) needs_read = _requires_read(mode) @@ -79,7 +86,7 @@ def _open(): f"Mode '{mode}' not supported with stdin/stdout " "(use 'r' or 'w')" ) - elif hasattr(path_or_dash, 'read') or hasattr(path_or_dash, 'write'): + elif isinstance(path_or_dash, TextIO): if needs_read and not hasattr(path_or_dash, 'read'): raise ValueError( f"Mode '{mode}' requires a readable file-like object" @@ -89,27 +96,28 @@ def _open(): f"Mode '{mode}' requires a writable file-like object" ) yield path_or_dash - else: + elif isinstance(path_or_dash, str): # Open actual file with proper cleanup - if path_or_dash is None: - raise ValueError( - "File path cannot be None when not using stdin/stdout" - ) if needs_read and not os.path.exists(path_or_dash): raise FileNotFoundError( f"Input path does not exist: {path_or_dash}" ) - f = open(path_or_dash, mode, encoding=encoding, errors=errors) + f = cast(TextIO, open(path_or_dash, mode, encoding=encoding, errors=errors)) try: yield f finally: f.close() + else: + # there should be no other valid types at this point + raise ValueError( + f"Invalid type for path_or_dash: {type(path_or_dash)}. " + "Expected str of file path or text-based IO stream (TextIO)." 
+ ) return _open() -# keep CLI modules here to avoid circular imports -from mmif.utils.cli import describe +# keep imports of CLI modules for historical reasons +# keep them here in the bottom to avoid circular imports from mmif.utils.cli import rewind from mmif.utils.cli import source -from mmif.utils.cli import summarize diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index 447d1b36..53dc7513 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -34,23 +34,26 @@ def main(args: argparse.Namespace): """ The main summarizer command. """ - # Check if stdin is available when no file is provided - - with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: - mmif_content = input_file.read() - - tmp_path = None - try: - with tempfile.NamedTemporaryFile( - mode='w', suffix='.mmif', delete=False - ) as tmp: - tmp.write(mmif_content) - tmp_path = pathlib.Path(tmp.name) - mmif_summary = Summary(tmp_path) + # If a real file path is provided (not None and not '-'), pass it directly to Summary + if args.MMIF_FILE is not None and args.MMIF_FILE != "-": + mmif_summary = Summary(pathlib.Path(args.MMIF_FILE)) output = mmif_summary.to_dict() - finally: - if tmp_path and tmp_path.exists(): - tmp_path.unlink() + else: + # Fallback: read from stdin (or default input), write to a temporary file, and summarize that + with open_cli_io_arg(args.MMIF_FILE, 'r', default_stdin=True) as input_file: + mmif_content = input_file.read() + tmp_path = None + try: + with tempfile.NamedTemporaryFile( + mode='w', suffix='.mmif', delete=False + ) as tmp: + tmp.write(mmif_content) + tmp_path = pathlib.Path(tmp.name) + mmif_summary = Summary(tmp_path) + output = mmif_summary.to_dict() + finally: + if tmp_path and tmp_path.exists(): + tmp_path.unlink() with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: json.dump(output, output_file, indent=2 if args.pretty else None) diff --git a/mmif/utils/summarizer/summary.py 
b/mmif/utils/summarizer/summary.py index f4fcc1c1..28339fad 100644 --- a/mmif/utils/summarizer/summary.py +++ b/mmif/utils/summarizer/summary.py @@ -22,7 +22,6 @@ from mmif.utils.summarizer.utils import timestamp from mmif.vocabulary import DocumentTypes -VERSION = '0.2.0' # why there's a version inside a subpackage??? logger = logging.getLogger(__name__) diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index a54cbcf5..2b1a4b11 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -124,11 +124,14 @@ def test_generate_mixed_scheme(self): class TestRewind(unittest.TestCase): def setUp(self): + empty_mmif_str = ('{"metadata": {"mmif": ' + '"http://mmif.clams.ai/1.0.0"}, "documents": [], ' + '"views": []}') # mmif we add views to - self.mmif_one = Mmif(BASIC_MMIF_STRING) + self.mmif_one = Mmif(empty_mmif_str) # baseline empty mmif for comparison - self.empty_mmif = Mmif(BASIC_MMIF_STRING) + self.empty_mmif = Mmif(empty_mmif_str) @staticmethod def add_dummy_view(mmif: Mmif, appname: str, timestamp: str = None): From c5c3d29fbe5150c93fba0bc9e7b57b01a139c562 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Thu, 12 Feb 2026 18:06:48 -0500 Subject: [PATCH 39/48] re-wrote CLI IO handler, simplifying input types --- mmif/utils/cli/__init__.py | 123 +++++++++++++++++++----------------- mmif/utils/cli/describe.py | 2 +- mmif/utils/cli/summarize.py | 2 +- tests/test_utils_cli.py | 55 ++++++---------- 4 files changed, 85 insertions(+), 97 deletions(-) diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 0ba97f4a..9b91b60c 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -2,18 +2,20 @@ Package containing CLI modules. 
""" +import contextlib +import io import os import sys -from contextlib import contextmanager -from typing import ContextManager, Generator, Optional, Union, TextIO, cast +from typing import Iterator, Optional, TextIO, cast -def open_cli_io_arg(path_or_dash: Optional[Union[str, TextIO]], +@contextlib.contextmanager +def open_cli_io_arg(path_or_dash: Optional[str], mode: str = 'r', encoding: Optional[str] = None, errors: Optional[str] = None, default_stdin: bool = False, - ) -> ContextManager[TextIO]: + ) -> Iterator[TextIO]: """ Context manager for opening files with stdin/stdout support. @@ -31,8 +33,7 @@ def open_cli_io_arg(path_or_dash: Optional[Union[str, TextIO]], to stdin/stdout - Regular paths open actual files with proper resource management - :param path_or_dash: File path, '-' for stdin/stdout, None for no argument, - or a file-like object + :param path_or_dash: File path, '-' for stdin/stdout, or None for no argument :param mode: File mode ('r' for reading, 'w' for writing). Binary modes are not supported. :param encoding: Optional file encoding @@ -40,81 +41,87 @@ def open_cli_io_arg(path_or_dash: Optional[Union[str, TextIO]], :param default_stdin: If True and path_or_dash is None, default to stdin (mode 'r') or stdout (mode 'w') :returns: Context manager yielding text-mode file handle - :rtype: ContextManager[TextIO] - """ + :rtype: Iterator[TextIO] - def _requires_read(requested_mode: str) -> bool: - return 'r' in requested_mode or '+' in requested_mode + Example usage:: - def _requires_write(requested_mode: str) -> bool: - return any(flag in requested_mode for flag in ('w', 'a', 'x', '+')) + # Read from file or stdin + with open_cli_io_arg(args.input, 'r', default_stdin=True) as f: + content = f.read() - @contextmanager - def _open() -> Generator[TextIO, None, None]: - # Validate that binary modes are not used - if 'b' in mode: - raise ValueError( - f"Binary mode '{mode}' is not supported. " - "Use text modes ('r', 'w', 'a', 'x') instead." 
- ) + # Write to file or stdout + with open_cli_io_arg(args.output, 'w', default_stdin=True) as f: + f.write(content) + """ + # Valid text modes for file operations + _READ_FLAGS = frozenset({'r', '+'}) + _WRITE_FLAGS = frozenset({'w', 'a', 'x', '+'}) + + if 'b' in mode: + raise ValueError( + f"Binary mode '{mode}' is not supported. " + "Use text modes ('r', 'w', 'a', 'x') instead." + ) - # Determine if we should use stdin/stdout - use_std = path_or_dash == '-' or (path_or_dash is None and default_stdin) - needs_read = _requires_read(mode) - needs_write = _requires_write(mode) + needs_read = bool(set(mode) & _READ_FLAGS) + needs_write = bool(set(mode) & _WRITE_FLAGS) - if ( - path_or_dash is None - and default_stdin - and needs_read - and sys.stdin.isatty() - ): - raise SystemExit("error: No input MMIF provided.") + should_use_stdio = path_or_dash == '-' or ( + path_or_dash is None and default_stdin + ) - if use_std: + file_handle: Optional[TextIO] = None + should_close = False + + try: + if should_use_stdio: if needs_read and needs_write: raise ValueError( f"Mode '{mode}' not supported with stdin/stdout " "(use read or write only)" ) + if needs_read: - yield sys.stdin + # Check for missing input when stdin is a terminal + if ( + path_or_dash is None + and default_stdin + and sys.stdin.isatty() + ): + raise SystemExit("error: No input provided.") + file_handle = sys.stdin + elif needs_write: - yield sys.stdout + file_handle = sys.stdout + else: raise ValueError( f"Mode '{mode}' not supported with stdin/stdout " "(use 'r' or 'w')" ) - elif isinstance(path_or_dash, TextIO): - if needs_read and not hasattr(path_or_dash, 'read'): - raise ValueError( - f"Mode '{mode}' requires a readable file-like object" - ) - if needs_write and not hasattr(path_or_dash, 'write'): - raise ValueError( - f"Mode '{mode}' requires a writable file-like object" - ) - yield path_or_dash + elif isinstance(path_or_dash, str): - # Open actual file with proper cleanup if needs_read and not 
os.path.exists(path_or_dash): - raise FileNotFoundError( - f"Input path does not exist: {path_or_dash}" - ) - f = cast(TextIO, open(path_or_dash, mode, encoding=encoding, errors=errors)) - try: - yield f - finally: - f.close() - else: - # there should be no other valid types at this point + raise FileNotFoundError(f"Input path does not exist: {path_or_dash}") + file_handle = cast(TextIO, io.open(path_or_dash, mode, encoding=encoding, errors=errors)) + should_close = True + + elif path_or_dash is None: + # None without default_stdin means no file specified raise ValueError( - f"Invalid type for path_or_dash: {type(path_or_dash)}. " - "Expected str of file path or text-based IO stream (TextIO)." + "No file path provided. Use '-' for stdin/stdout or set default_stdin=True." ) + else: + raise TypeError( + f"Invalid type for path_or_dash: {type(path_or_dash).__name__}. " + "Expected str or None." + ) + + yield file_handle - return _open() + finally: + if should_close and file_handle is not None: + file_handle.close() # keep imports of CLI modules for historical reasons diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index 1021e52d..0bbd49a8 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -71,7 +71,7 @@ def prep_argparser(**kwargs): "MMIF_FILE", nargs="?", type=str, - default=None if sys.stdin.isatty() else sys.stdin, + default=None, help='input MMIF file, a directory of MMIF files, or STDIN if `-` or not provided.' 
) parser.add_argument( diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py index 53dc7513..17fe3d5d 100644 --- a/mmif/utils/cli/summarize.py +++ b/mmif/utils/cli/summarize.py @@ -47,8 +47,8 @@ def main(args: argparse.Namespace): with tempfile.NamedTemporaryFile( mode='w', suffix='.mmif', delete=False ) as tmp: - tmp.write(mmif_content) tmp_path = pathlib.Path(tmp.name) + tmp.write(mmif_content) mmif_summary = Summary(tmp_path) output = mmif_summary.to_dict() finally: diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index 2b1a4b11..10270525 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -54,9 +54,8 @@ def generate_source_mmif(self): # to suppress output (otherwise, set to stdout by default) args = self.parser.parse_args(self.get_params()) - with open(os.devnull, 'w') as devnull: - args.output = devnull - return source.main(args) + args.output = os.devnull + return source.main(args) def test_accept_file_paths(self): self.docs.append("video:/a/b/c.mp4") @@ -312,14 +311,14 @@ def create_temp_mmif_file(self, mmif_obj): def test_summarize_positional_input(self): tmp_file = self.create_temp_mmif_file(self.basic_mmif) - stdout = io.StringIO() try: - args = self.parser.parse_args([tmp_file]) - args.output = stdout - summarize.main(args) - output = json.loads(stdout.getvalue()) - self.assertIn('mmif_version', output) - self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = self.parser.parse_args([tmp_file]) + # args.output is None by default, which means stdout in open_cli_io_arg + summarize.main(args) + output = json.loads(stdout.getvalue()) + self.assertIn('mmif_version', output) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") finally: os.unlink(tmp_file) @@ -338,37 +337,19 @@ def test_summarize_output_file(self): os.unlink(tmp_input) os.unlink(tmp_output.name) - def 
test_summarize_pretty_print(self): - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - stdout_pretty = io.StringIO() - stdout_compact = io.StringIO() - try: - # Pretty - args_pretty = self.parser.parse_args([tmp_file, "--pretty"]) - args_pretty.output = stdout_pretty - summarize.main(args_pretty) - - # Compact - args_compact = self.parser.parse_args([tmp_file]) - args_compact.output = stdout_compact - summarize.main(args_compact) - - self.assertNotEqual(stdout_pretty.getvalue(), stdout_compact.getvalue()) - self.assertIn('\n ', stdout_pretty.getvalue()) # Check for indentation - finally: - os.unlink(tmp_file) - def test_summarize_stdin(self): mmif_str = self.basic_mmif.serialize() import argparse - stdout = io.StringIO() - stdin = io.StringIO(mmif_str) - - args = argparse.Namespace(MMIF_FILE=stdin, output=stdout, pretty=False) - summarize.main(args) + + with unittest.mock.patch('sys.stdin', io.StringIO(mmif_str)), \ + unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + # MMIF_FILE defaults to None -> stdin + # output defaults to None -> stdout + args = argparse.Namespace(MMIF_FILE=None, output=None, pretty=False) + summarize.main(args) - output = json.loads(stdout.getvalue()) - self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") + output = json.loads(stdout.getvalue()) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") if __name__ == '__main__': From 398eb21277ad7783d5ac05818711d0f21d437316 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sat, 14 Feb 2026 17:09:29 -0500 Subject: [PATCH 40/48] Updated `mmif describe` implementation to be based on pydantic for better documentation --- build-tools/requirements.docs.txt | 3 +- documentation/conf.py | 9 + mmif/utils/cli/__init__.py | 4 +- mmif/utils/cli/describe.py | 88 ++++--- mmif/utils/workflow_helper.py | 414 ++++++++++++++++++------------ requirements.txt | 1 + 6 files changed, 312 insertions(+), 207 deletions(-) diff --git 
a/build-tools/requirements.docs.txt b/build-tools/requirements.docs.txt index 8d9ee33d..db2d03d8 100644 --- a/build-tools/requirements.docs.txt +++ b/build-tools/requirements.docs.txt @@ -1,3 +1,4 @@ -sphinx>=7.0,<8.0 +sphinx furo m2r2 +autodoc-pydantic diff --git a/documentation/conf.py b/documentation/conf.py index 121054ef..f309f548 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -33,6 +33,7 @@ 'undoc-members': True, 'show-inheritance': True, } +autodoc_member_order = 'bysource' # -- Project information ----------------------------------------------------- @@ -55,8 +56,16 @@ 'sphinx.ext.autodoc', 'sphinx.ext.linkcode', 'm2r2', + 'sphinxcontrib.autodoc_pydantic', ] +autodoc_pydantic_model_show_json = True +autodoc_pydantic_model_show_field_summary = True +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_field_list_validators = False + templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # dynamically generated files diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 9b91b60c..935ab0a7 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -28,6 +28,7 @@ def open_cli_io_arg(path_or_dash: Optional[str], manager. Handles the common CLI pattern where: + - '-' means stdin (read mode) or stdout (write mode) - None means "argument not provided"; when default_stdin=True, it falls back to stdin/stdout @@ -117,7 +118,8 @@ def open_cli_io_arg(path_or_dash: Optional[str], "Expected str or None." 
) - yield file_handle + if file_handle is not None: + yield file_handle finally: if should_close and file_handle is not None: diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index 0bbd49a8..d921b329 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -1,17 +1,29 @@ import argparse import json -import os import sys import textwrap from pathlib import Path -from typing import Union, cast +from typing import Dict, Type, Union, cast + +from pydantic import BaseModel from mmif.utils.cli import open_cli_io_arg -from mmif.utils.workflow_helper import generate_workflow_identifier, describe_single_mmif, \ - describe_mmif_collection -# gen_param_hash is imported for backward compatibility -from mmif.utils.workflow_helper import generate_param_hash +# gen_param_hash is imported for backward compatibility +from mmif.utils.workflow_helper import ( + CollectionMmifDesc, + SingleMmifDesc, + describe_mmif_collection, + describe_single_mmif, + generate_workflow_identifier, +) + +models_to_help = [SingleMmifDesc, CollectionMmifDesc] +model_modules = set(model.__module__ for model in models_to_help) +def get_all_models() -> Dict[str, Type[BaseModel]]: + return { + name: cls for name, cls in models_to_help + } def get_pipeline_specs(mmif_file: Union[str, Path]): import warnings @@ -33,30 +45,11 @@ def describe_argparser(): 'collection of MMIF files.' ) - # get and clean docstrings - def _extract_describe_docstring(func): - doc = func.__doc__.split(':param')[0] - # then cut off all lines after `---` - doc = doc.split('---')[0] - return textwrap.dedent(doc).strip() - - single_doc = _extract_describe_docstring(describe_single_mmif) - collection_doc = _extract_describe_docstring(describe_mmif_collection) - additional = textwrap.dedent(f""" This command extracts workflow information from a single MMIF file or - summarizes a directory of MMIF files. The output is serialized as JSON and - includes: + a directory of MMIF files. 
The output is serialized as JSON. - ========================= - Single MMIF file as input - ========================= -{single_doc} - - ================================== - A directory of MMIF files as input - ================================== -{collection_doc} + Use `--help-schemas` to inspect the structure of the JSON output. """) return oneliner, additional @@ -67,6 +60,7 @@ def prep_argparser(**kwargs): formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs ) + parser.add_argument( "MMIF_FILE", nargs="?", @@ -84,24 +78,43 @@ def prep_argparser(**kwargs): action="store_true", help="Pretty-print JSON output" ) + parser.add_argument( + "--help-schemas", + nargs="*", + choices=["all"] + [m.__name__ for m in models_to_help], + metavar="SCHEMA_NAME", + help=f"Print the JSON schema for the output. For human-readable documentation, " + f"visit https://clams.ai/mmif-python and see the following modules: " + f"{', '.join(model_modules)}.\nOptions: all, {', '.join([m.__name__ for m in models_to_help])}." + ) return parser def main(args): """ - Main entry point for the describe CLI command. - - Reads a MMIF file and outputs a JSON summary containing: - - - workflow_id: unique identifier for the source and app sequence - - stats: view counts, annotation counts (total/per-view/per-type), and lists of error/warning/empty view IDs - - views: map of view IDs to app configurations and profiling data - - :param args: Parsed command-line arguments + Main block for the describe CLI command. + This function basically works as a wrapper around + :func:`describe_single_mmif` (for single file input) or + :func:`describe_mmif_collection` (for directory input). 
""" + if hasattr(args, 'help_schemas') and args.help_schemas is not None: + models_map = {m.__name__: m for m in models_to_help} + to_show = [] + if len(args.help_schemas) == 0 or 'all' in args.help_schemas: + to_show = models_to_help + else: + to_show = args.help_schemas + + for name in to_show: + model_cls = models_map[name] + schema = model_cls.model_json_schema() + print(json.dumps(schema, indent=2)) + print() + sys.exit(0) + output = {} # if input is a directory - if isinstance(args.MMIF_FILE, (str, os.PathLike)) and Path(args.MMIF_FILE).is_dir(): + if Path(str(args.MMIF_FILE)).is_dir(): output = describe_mmif_collection(args.MMIF_FILE) # if input is a file or stdin else: @@ -125,6 +138,7 @@ def main(args): tmp_path.unlink() if output: + # Convert Pydantic models to dicts with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: json.dump(output, output_file, indent=2 if args.pretty else None) output_file.write('\n') diff --git a/mmif/utils/workflow_helper.py b/mmif/utils/workflow_helper.py index c73c0cd2..bdde664a 100644 --- a/mmif/utils/workflow_helper.py +++ b/mmif/utils/workflow_helper.py @@ -1,13 +1,16 @@ import datetime import hashlib -from collections import Counter, defaultdict -from pathlib import Path -from typing import List, Any, Tuple, Optional, Union import itertools -from mmif import Mmif +from collections import Counter +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Tuple, Union, overload + +from pydantic import BaseModel, ConfigDict, Field +from mmif.serialize.mmif import Mmif, ViewsList -def group_views_by_app(views: List[Any]) -> List[List[Any]]: + +def group_views_by_app(views: ViewsList) -> List[List[Any]]: """ Groups views into app executions based on app and timestamp. 
@@ -93,9 +96,21 @@ def _read_mmif_from_path(mmif_input: Union[str, Path, Mmif]) -> Mmif: ) +@overload +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: Literal[True] + ) -> Tuple[str, List[dict]]: ... + + +@overload def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], - return_param_dicts=False) \ - -> Union[str, Tuple[str, List[dict]]]: + return_param_dicts: Literal[False] = False + ) -> str: ... + + +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: bool = False + ) -> Union[str, Tuple[str, List[dict]]]: """ Generate a workflow identifier string from a MMIF file or object. @@ -149,7 +164,53 @@ def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], return '/'.join(segments) -def _get_profile_data(view) -> dict: +## single MMIF summarization + +class SingleMmifStats(BaseModel): + """ + Aggregated statistics for a single MMIF file. + """ + model_config = ConfigDict(populate_by_name=True) + + app_count: int = Field(..., alias="appCount", description="Total number of app executions identified.") + error_views: List[str] = Field(default_factory=list, alias="errorViews", description="List of view IDs that contain errors.") + warning_views: List[str] = Field(default_factory=list, alias="warningViews", description="List of view IDs that contain warnings.") + empty_views: List[str] = Field(default_factory=list, alias="emptyViews", description="List of view IDs that contain no annotations.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the file.") + +class AppProfiling(BaseModel): + """ + Profiling data for a single app execution. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + running_time_ms: Optional[int] = Field(default=None, alias="runningTimeMS", description="Execution time in milliseconds.") + +class AppExecution(BaseModel): + """ + Represents a single execution of an app, which may produce multiple views. + """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + view_ids: List[str] = Field(..., alias="viewIds", description="List of view IDs generated by this execution.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Configuration parameters used for this execution.") + app_profiling: AppProfiling = Field(default_factory=lambda: AppProfiling(), alias="appProfiling", description="Profiling data for this execution.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Counts of annotations produced, grouped by type.") + + +class SingleMmifDesc(BaseModel): + """ + Description of a workflow extracted from a single MMIF file. + """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow structure.") + stats: SingleMmifStats = Field(..., description="Statistics about the views and annotations.") + apps: List[AppExecution] = Field(..., description="Sequence of app executions in the workflow.") + + +def _get_profile_data(view) -> AppProfiling: """ Extract profiling data from a view's metadata. @@ -168,13 +229,13 @@ def _get_profile_data(view) -> dict: running_time_str = profiling.get("runningTime") if running_time_str is None: - return {} + return AppProfiling(runningTimeMS=None) # the format is datetime.timedelta string, e.g. 
'0:00:02.345678' # need to convert to milliseconds integer time_obj = datetime.datetime.strptime(running_time_str, "%H:%M:%S.%f").time() milliseconds = (time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second) * 1000 + time_obj.microsecond // 1000 - return {"runningTimeMS": milliseconds} + return AppProfiling(runningTimeMS=milliseconds) def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: @@ -188,53 +249,24 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: a single logical "app execution". .. note:: - For MMIF files generated by ``clams-python`` <= 1.3.3, all views - are independently timestamped. This means that even if multiple views - were generated by a single execution of an app, their + For MMIF files generated by apps based on ``clams-python`` <= 1.3.3, all + views are independently timestamped. This means that even if multiple + views were generated by a single execution of an app, their ``metadata.timestamp`` values will be unique. As a result, the grouping logic will treat each view as a separate app execution. The change that aligns timestamps for views from a single app execution is implemented in `clams-python PR #271 `_. - The output format is a dictionary with the following keys: - - * ``workflowId`` - A unique identifier for the workflow, based on the - sequence of app executions (app, version, parameter hashes). App - executions with errors are excluded from this identifier. App - executions with warnings are still considered successful for the purpose - of this identifier. - * ``stats`` - A dictionary with the following keys: - - * ``appCount`` - Total number of identified app executions. - * ``errorViews`` - A list of view IDs that reported errors. - * ``warningViews`` - A list of view IDs that reported warnings. - * ``emptyViews`` - A list of view IDs that contain no annotations. 
- * ``annotationCountByType`` - A dictionary mapping each annotation type to its count, plus a - ``total`` key for the sum of all annotations across all app - executions. - * ``apps`` - A list of objects, where each object represents one app - execution. It includes metadata, profiling, and aggregated statistics - for all views generated by that execution. A special entry for views - that could not be assigned to an execution will be at the end of the list. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. + The output is a serialized :class:`~SingleMmifDesc` object. + .. pydantic_model:: SingleMmifDesc + :noindex: + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object :return: A dictionary containing the workflow specification. """ mmif = _read_mmif_from_path(mmif_input) - workflow_id = generate_workflow_identifier(mmif) error_view_ids = [] warning_view_ids = [] @@ -259,17 +291,21 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: execution_view_ids = [v.id for v in group] processed_view_ids.update(execution_view_ids) - app_data = { - "app": first_view.metadata.app, - "viewIds": execution_view_ids, - "appConfiguration": first_view.metadata.get("appConfiguration", {}), - "appProfiling": _get_profile_data(first_view), - } + # Prepare annotation counts total_annotations_in_exec = sum(execution_ann_counter.values()) if total_annotations_in_exec > 0: - app_data['annotationCountByType'] = dict(execution_ann_counter) - app_data['annotationCountByType']['total'] = total_annotations_in_exec - grouped_apps.append(app_data) + count_dict = dict(execution_ann_counter) + count_dict['total'] = total_annotations_in_exec + else: + count_dict = {} + + grouped_apps.append(AppExecution( + app=first_view.metadata.app, + viewIds=execution_view_ids, + appConfiguration=first_view.metadata.get("appConfiguration", {}), + appProfiling=_get_profile_data(first_view), + 
annotationCountByType=count_dict + )) # Handle unassigned and problematic views all_view_ids = set(v.id for v in mmif.views) @@ -289,19 +325,23 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: app_count = len(grouped_apps) if unassigned_view_ids: - grouped_apps.append({ - "app": "http://apps.clams.ai/non-existing-app/v1", - "viewIds": sorted(list(unassigned_view_ids)) - }) + grouped_apps.append(AppExecution( + app="http://apps.clams.ai/non-existing-app/v1", + viewIds=sorted(list(unassigned_view_ids)), + appConfiguration={}, + appProfiling=AppProfiling(runningTimeMS=None), + annotationCountByType={} + )) # aggregate total annotation counts total_annotations_by_type = Counter() for execution in grouped_apps: # Only aggregate from actual apps, not the special unassigned entry - if execution.get('app') != "http://apps.clams.ai/non-existing-app/v1": - if 'annotationCountByType' in execution: - exec_counts = execution['annotationCountByType'].copy() - del exec_counts['total'] + if execution.app != "http://apps.clams.ai/non-existing-app/v1": + if execution.annotation_count_by_type: + exec_counts = execution.annotation_count_by_type.copy() + if 'total' in exec_counts: + del exec_counts['total'] total_annotations_by_type.update(Counter(exec_counts)) final_total_annotations = sum(total_annotations_by_type.values()) @@ -309,17 +349,79 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: if final_total_annotations > 0: final_annotation_counts['total'] = final_total_annotations - return { - "workflowId": workflow_id, - "stats": { - "appCount": app_count, - "errorViews": error_view_ids, - "warningViews": warning_view_ids, - "emptyViews": empty_view_ids, - "annotationCountByType": final_annotation_counts - }, - "apps": grouped_apps - } + return SingleMmifDesc( + workflowId=generate_workflow_identifier(mmif, return_param_dicts=False), + stats=SingleMmifStats( + appCount=app_count, + errorViews=error_view_ids, + 
warningViews=warning_view_ids, + emptyViews=empty_view_ids, + annotationCountByType=final_annotation_counts + ), + apps=grouped_apps + ).model_dump(by_alias=True) + + +## MMIF collection summarization + +class AppProfilingStats(BaseModel): + """ + Aggregated profiling statistics for an app across a workflow. + """ + model_config = ConfigDict(populate_by_name=True) + + avg_running_time_ms: Optional[float] = Field(default=None, alias="avgRunningTimeMS", description="Average execution time in milliseconds.") + min_running_time_ms: Optional[float] = Field(default=None, alias="minRunningTimeMS", description="Minimum execution time in milliseconds.") + max_running_time_ms: Optional[float] = Field(default=None, alias="maxRunningTimeMS", description="Maximum execution time in milliseconds.") + stdev_running_time_ms: Optional[float] = Field(default=None, alias="stdevRunningTimeMS", description="Standard deviation of execution time.") + + + + +class WorkflowAppExecution(BaseModel): + """ + Aggregated information about an app's usage within a specific workflow across multiple files. + """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Representative configuration (usually from the first occurrence).") + app_profiling: AppProfilingStats = Field(default_factory=lambda: AppProfilingStats(), alias="appProfiling", description="Aggregated profiling statistics.") + + +class WorkflowCollectionEntry(BaseModel): + """ + Summary of a unique workflow found within a collection. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow.") + mmifs: List[str] = Field(..., description="List of filenames belonging to this workflow.") + mmif_count: int = Field(..., alias="mmifCount", description="Number of MMIF files matching this workflow.") + apps: List[WorkflowAppExecution] = Field(..., description="Sequence of apps in this workflow with aggregated stats.") + +class MmifCountByStatus(BaseModel): + """ + Breakdown of MMIF files in a collection by their processing status. + """ + model_config = ConfigDict(populate_by_name=True) + + total: int = Field(..., description="Total number of MMIF files found.") + successful: int = Field(..., description="Number of files processed without errors.") + with_errors: int = Field(..., alias="withErrors", description="Number of files containing error views.") + with_warnings: int = Field(..., alias="withWarnings", description="Number of files containing warning views.") + invalid: int = Field(..., description="Number of files that failed to parse as valid MMIF.") + + +class CollectionMmifDesc(BaseModel): + """ + Summary of a collection of MMIF files. + """ + model_config = ConfigDict(populate_by_name=True) + + mmif_count_by_status: MmifCountByStatus = Field(..., alias="mmifCountByStatus", description="Counts of MMIF files by status.") + workflows: List[WorkflowCollectionEntry] = Field(..., description="List of unique workflows identified in the collection.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the entire collection.") def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: @@ -329,139 +431,115 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: This function provides an overview of a collection of MMIF files, aggregating statistics across multiple files. 
- The output format is a dictionary with the following keys: - - * ``mmifCountByStatus`` - A dictionary summarizing the processing status of all MMIF files in the - collection. It includes: - - ``total`` - Total number of MMIF files found. - ``successful`` - Number of MMIF files processed without errors (may contain warnings). - ``withErrors`` - Number of MMIF files containing app executions that reported errors. - ``withWarnings`` - Number of MMIF files containing app executions that reported warnings. - ``invalid`` - Number of files that failed to be parsed as valid MMIF. - * ``workflows`` - A list of "workflow" objects found in the "successful" MMIF files (files - with errors are excluded), where each object contains: - - ``workflowId`` - The unique identifier for the workflow. - ``apps`` - A list of app objects, each with ``app`` (name+ver identifier), - ``appConfiguration``, and ``appProfiling`` statistics (avg, min, max, - stdev running times) aggregated per workflow. - ``mmifs`` - A list of MMIF file basenames belonging to this workflow. - ``mmifCount`` - The number of MMIF files in this workflow. - * ``annotationCountByType`` - A dictionary aggregating annotation counts across the entire collection. - It includes a ``total`` key for the grand total, plus integer counts for - each individual annotation type. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. + The output is a serialized :class:`~CollectionMmifDesc` object. + + .. pydantic_model:: CollectionMmifDesc + :noindex: :param mmif_dir: Path to the directory containing MMIF files. :return: A dictionary containing the summarized collection specification. 
""" import statistics - from collections import defaultdict, Counter + from collections import Counter mmif_files = list(Path(mmif_dir).glob('*.mmif')) - status_summary = defaultdict(int) - status_summary['total'] = len(mmif_files) - status_summary['successful'] = 0 - status_summary['withErrors'] = 0 - status_summary['withWarnings'] = 0 - status_summary['invalid'] = 0 + status_summary = MmifCountByStatus( + total=len(mmif_files), + successful=0, + withErrors=0, + withWarnings=0, + invalid=0 + ) aggregated_counts = Counter() - workflows_data = defaultdict(lambda: { - 'mmifs': [], - 'apps': defaultdict(lambda: { - 'appConfiguration': None, # Store the first config here - 'execution_times': [] - }) - }) + # Structure: {workflow_id: {'mmifs': [...], 'apps': {app_uri: {'appConfiguration': ..., 'execution_times': [...]}}}} + workflows_data: Dict[str, Dict] = {} for mmif_file in mmif_files: try: - single_report = describe_single_mmif(mmif_file) - except Exception as e: - status_summary['invalid'] += 1 + single_report = SingleMmifDesc.model_validate(describe_single_mmif(mmif_file)) + except Exception: + status_summary.invalid += 1 continue - if single_report['stats']['errorViews']: - status_summary['withErrors'] += 1 + if single_report.stats.error_views: + status_summary.with_errors += 1 continue # Exclude from all other stats # If we get here, the MMIF has no errors and is considered "successful" - status_summary['successful'] += 1 - if single_report['stats']['warningViews']: - status_summary['withWarnings'] += 1 - - wf_id = single_report['workflowId'] + status_summary.successful += 1 + if single_report.stats.warning_views: + status_summary.with_warnings += 1 + + wf_id = single_report.workflow_id + # Initialize workflow entry if not exists + if wf_id not in workflows_data: + workflows_data[wf_id] = {'mmifs': [], 'apps': {}} workflows_data[wf_id]['mmifs'].append(Path(mmif_file).name) # Aggregate annotation counts for successful mmifs - report_counts = 
single_report['stats'].get('annotationCountByType', {}) + report_counts = single_report.stats.annotation_count_by_type.copy() if 'total' in report_counts: del report_counts['total'] # don't add the sub-total to the main counter aggregated_counts.update(report_counts) - for app_exec in single_report.get('apps', []): - app_uri = app_exec.get('app') + for app_exec in single_report.apps: + app_uri = app_exec.app # skip the special "unassigned" app if app_uri and app_uri != "http://apps.clams.ai/non-existing-app/v1": - running_time = app_exec.get('appProfiling', {}).get('runningTimeMS') + # Initialize app entry if not exists + if app_uri not in workflows_data[wf_id]['apps']: + workflows_data[wf_id]['apps'][app_uri] = { + 'appConfiguration': None, + 'execution_times': [] + } + + running_time = app_exec.app_profiling.running_time_ms if running_time is not None: workflows_data[wf_id]['apps'][app_uri]['execution_times'].append(running_time) # Store the first non-empty app configuration we find for this app in this workflow if workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] is None: - config = app_exec.get('appConfiguration', {}) + config = app_exec.app_configuration if config: workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] = config # Process collected data into the final output format final_workflows_list = [] for wf_id, wf_data in sorted(workflows_data.items()): - workflow_object = { - 'workflowId': wf_id, - 'mmifs': sorted(wf_data['mmifs']), - 'mmifCount': len(wf_data['mmifs']), - 'apps': [] - } + workflow_apps = [] for app_uri, app_data in sorted(wf_data['apps'].items()): times = app_data['execution_times'] if times: - profiling_stats = { - 'avgRunningTimeMS': statistics.mean(times), - 'minRunningTimeMS': min(times), - 'maxRunningTimeMS': max(times), - 'stdevRunningTimeMS': statistics.stdev(times) if len(times) > 1 else 0 - } + profiling_stats = AppProfilingStats( + avgRunningTimeMS=statistics.mean(times), + minRunningTimeMS=min(times), + 
maxRunningTimeMS=max(times), + stdevRunningTimeMS=statistics.stdev(times) if len(times) > 1 else 0 + ) else: - profiling_stats = {} - - app_object = { - 'app': app_uri, - 'appConfiguration': app_data['appConfiguration'] or {}, # Default to empty dict - 'appProfiling': profiling_stats - } - workflow_object['apps'].append(app_object) - - final_workflows_list.append(workflow_object) + profiling_stats = AppProfilingStats( + avgRunningTimeMS=None, + minRunningTimeMS=None, + maxRunningTimeMS=None, + stdevRunningTimeMS=None + ) + + workflow_apps.append(WorkflowAppExecution( + app=app_uri, + appConfiguration=app_data['appConfiguration'] or {}, + appProfiling=profiling_stats + )) + + final_workflows_list.append(WorkflowCollectionEntry( + workflowId=wf_id, + mmifs=sorted(wf_data['mmifs']), + mmifCount=len(wf_data['mmifs']), + apps=workflow_apps + )) # Finalize annotation counts final_annotation_counts = dict(aggregated_counts) @@ -469,8 +547,8 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: if grand_total > 0: final_annotation_counts['total'] = grand_total - return { - 'mmifCountByStatus': dict(status_summary), - 'workflows': final_workflows_list, - 'annotationCountByType': final_annotation_counts - } + return CollectionMmifDesc( + mmifCountByStatus=status_summary, + workflows=final_workflows_list, + annotationCountByType=final_annotation_counts + ).model_dump(by_alias=True) diff --git a/requirements.txt b/requirements.txt index a97c214e..c3e9d722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ orderly-set==5.3.* # 5.4 drops py38 support jsonschema +pydantic>=2.0 From 8266a2edfcf9846f3d98baa3ab4e0e7c635074dc Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sun, 15 Feb 2026 08:51:34 -0500 Subject: [PATCH 41/48] updated test cases for utils and clis --- mmif/utils/cli/describe.py | 2 +- tests/test_utils.py | 203 +++++++++++++++++++---- tests/test_utils_cli.py | 328 ++++++++++++++++++------------------- 3 files changed, 333 insertions(+), 
200 deletions(-) diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index d921b329..bb226a81 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -101,7 +101,7 @@ def main(args): models_map = {m.__name__: m for m in models_to_help} to_show = [] if len(args.help_schemas) == 0 or 'all' in args.help_schemas: - to_show = models_to_help + to_show = [m.__name__ for m in models_to_help] else: to_show = args.help_schemas diff --git a/tests/test_utils.py b/tests/test_utils.py index 5f29b9d2..1aa4fdaf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,17 +1,25 @@ +import json +import os import pathlib -import unittest import tempfile -import json +import unittest +from pathlib import Path import pytest - -from mmif import Mmif, Document, AnnotationTypes +from hypothesis import given +from hypothesis import strategies as st + +from mmif import ( + AnnotationTypes, + Document, + Mmif +) from mmif.utils import sequence_helper as sqh from mmif.utils import text_document_helper as tdh from mmif.utils import timeunit_helper as tuh from mmif.utils import video_document_helper as vdh -from tests.mmif_examples import * -from hypothesis import given, strategies as st +from mmif.utils import workflow_helper as wfh +from tests import mmif_examples class TestTimeunitHelper(unittest.TestCase): @@ -205,7 +213,7 @@ def test_width_based_smoothing(self): class TestTextDocHelper(unittest.TestCase): - mmif_obj = Mmif(MMIF_EXAMPLES['everything']) + mmif_obj = Mmif(mmif_examples.MMIF_EXAMPLES['everything']) @pytest.mark.skip("The only valid test cases come from kaldi app which annotates wrong property") def test_slice_text(self): @@ -232,8 +240,6 @@ def setUp(self) -> None: def create_temp_mmif_file(self, mmif_obj): """Helper to create a temporary MMIF file.""" - import tempfile - import json tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) if isinstance(mmif_obj, Mmif): content_to_write = 
mmif_obj.serialize(pretty=False) @@ -244,24 +250,20 @@ def create_temp_mmif_file(self, mmif_obj): return tmp.name def test_split_appname_appversion(self): - from mmif.utils.workflow_helper import _split_appname_appversion - app_name, app_version = _split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") + app_name, app_version = wfh._split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") self.assertEqual(app_name, "test-app") self.assertEqual(app_version, "v1.0.0") def test_generate_param_hash(self): - from mmif.utils.workflow_helper import generate_param_hash params = {"param1": "value1", "param2": 42} - hash1 = generate_param_hash(params) - hash2 = generate_param_hash(params) + hash1 = wfh.generate_param_hash(params) + hash2 = wfh.generate_param_hash(params) self.assertEqual(hash1, hash2) params_reversed = {"param2": 42, "param1": "value1"} - hash3 = generate_param_hash(params_reversed) + hash3 = wfh.generate_param_hash(params_reversed) self.assertEqual(hash1, hash3) def test_generate_workflow_identifier_grouped(self): - from mmif.vocabulary import AnnotationTypes - from mmif.utils import workflow_helper view1 = self.basic_mmif.new_view() view1.metadata.app = "http://apps.clams.ai/app1/v1.0.0" view1.metadata.timestamp = "2024-01-01T12:00:00Z" @@ -274,7 +276,7 @@ def test_generate_workflow_identifier_grouped(self): tmp_file = self.create_temp_mmif_file(self.basic_mmif) import os try: - workflow_id = workflow_helper.generate_workflow_identifier(tmp_file) + workflow_id = wfh.generate_workflow_identifier(tmp_file) segments = workflow_id.split('/') self.assertEqual(len(segments), 6) self.assertIn('app1', segments[0]) @@ -284,39 +286,35 @@ def test_generate_workflow_identifier_grouped(self): def test_generate_workflow_identifier_with_mmif_object(self): """Test that generate_workflow_identifier accepts Mmif objects directly.""" - from mmif.utils import workflow_helper import os # Test with Mmif object directly - workflow_id_from_obj = 
workflow_helper.generate_workflow_identifier(self.basic_mmif) + workflow_id_from_obj = wfh.generate_workflow_identifier(self.basic_mmif) # Test with file path - should produce the same result tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - workflow_id_from_file = workflow_helper.generate_workflow_identifier(tmp_file) + workflow_id_from_file = wfh.generate_workflow_identifier(tmp_file) self.assertEqual(workflow_id_from_obj, workflow_id_from_file) finally: os.unlink(tmp_file) def test_read_mmif_from_path(self): """Test the _read_mmif_from_path helper function.""" - from mmif.utils.workflow_helper import _read_mmif_from_path - from pathlib import Path - import os # Test with Mmif object - should return as-is - result = _read_mmif_from_path(self.basic_mmif) + result = wfh._read_mmif_from_path(self.basic_mmif) self.assertIs(result, self.basic_mmif) # Test with file path string tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result_from_str = _read_mmif_from_path(tmp_file) + result_from_str = wfh._read_mmif_from_path(tmp_file) self.assertIsInstance(result_from_str, Mmif) self.assertEqual(result_from_str.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) # Test with Path object - result_from_path = _read_mmif_from_path(Path(tmp_file)) + result_from_path = wfh._read_mmif_from_path(Path(tmp_file)) self.assertIsInstance(result_from_path, Mmif) self.assertEqual(result_from_path.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) finally: @@ -324,27 +322,164 @@ def test_read_mmif_from_path(self): # Test with invalid input with pytest.raises(ValueError): - _read_mmif_from_path(12345) + wfh._read_mmif_from_path(12345) def test_describe_single_mmif_with_mmif_object(self): """Test that describe_single_mmif accepts Mmif objects directly.""" - from mmif.utils.workflow_helper import describe_single_mmif import os # Test with Mmif object directly - result_from_obj = describe_single_mmif(self.basic_mmif) + result_from_obj = 
wfh.describe_single_mmif(self.basic_mmif) # Test with file path - should produce the same result tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result_from_file = describe_single_mmif(tmp_file) + result_from_file = wfh.describe_single_mmif(tmp_file) self.assertEqual(result_from_obj, result_from_file) - self.assertIn('workflowId', result_from_obj) - self.assertIn('stats', result_from_obj) - self.assertIn('apps', result_from_obj) + + # Validate that the output conforms to the SingleMmifDesc Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.SingleMmifDesc.model_validate(result_from_obj) + # Can assert on the validated object's attributes if needed + self.assertIsNotNone(validated.workflow_id) + self.assertIsNotNone(validated.stats) + self.assertIsNotNone(validated.apps) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_empty(self): + """Test describe_single_mmif with an empty MMIF (no views).""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(validated.stats.annotation_count_by_type, {}) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app(self): + """Test describe_single_mmif with a single app execution.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.appProfiling = {"runningTime": "0:00:01.234"} + view.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + 
self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.app, view.metadata.app) + self.assertEqual(app_exec.view_ids, [view.id]) + self.assertEqual(app_exec.app_profiling.running_time_ms, 1234) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app_two_views(self): + """Test describe_single_mmif with one app execution producing two views.""" + view1 = self.basic_mmif.new_view() + view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view1.metadata.timestamp = "2024-01-01T12:00:00Z" + view1.new_annotation(AnnotationTypes.TimeFrame) + view2 = self.basic_mmif.new_view() + view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view2.metadata.timestamp = "2024-01-01T12:00:00Z" + view2.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.view_ids, [view1.id, view2.id]) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_error_view(self): + """Test describe_single_mmif with a view containing an error.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.error = {"message": "Something went wrong"} + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(len(validated.stats.error_views), 1) finally: os.unlink(tmp_file) + def 
test_describe_single_mmif_with_unassigned_views(self): + """Test describe_single_mmif with views that cannot be grouped.""" + import unittest.mock + raw_mmif = json.loads(self.basic_mmif.serialize()) + raw_mmif['views'].append({'id': 'v1', 'metadata': {'app': 'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) + tmp_file = self.create_temp_mmif_file(raw_mmif) + try: + with unittest.mock.patch('jsonschema.validators.validate'): + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 2) + special_entry = validated.apps[-1] + self.assertEqual(special_entry.app, 'http://apps.clams.ai/non-existing-app/v1') + self.assertEqual(len(special_entry.view_ids), 2) + self.assertIn('v2', special_entry.view_ids) + self.assertIn('v3', special_entry.view_ids) + finally: + os.unlink(tmp_file) + + def test_describe_collection_empty(self): + """Test describe_mmif_collection with an empty directory.""" + dummy_dir = 'dummy_mmif_collection' + os.makedirs(dummy_dir, exist_ok=True) + try: + output = wfh.describe_mmif_collection(dummy_dir) + # Validate using Pydantic model + validated = wfh.CollectionMmifDesc.model_validate(output) + self.assertEqual(validated.mmif_count_by_status.total, 0) + self.assertEqual(len(validated.workflows), 0) + finally: + os.rmdir(dummy_dir) + + def test_describe_collection_with_files(self): + """Test describe_mmif_collection with MMIF files.""" + dummy_dir = 'dummy_mmif_collection_with_files' + os.makedirs(dummy_dir, exist_ok=True) + try: + # Create two MMIF files in the directory + for i in range(2): + tmp_file = 
os.path.join(dummy_dir, f'{i}.mmif') + with open(tmp_file, 'w') as f: + f.write(self.basic_mmif.serialize()) + + output = wfh.describe_mmif_collection(dummy_dir) + + # Validate structure using Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.CollectionMmifDesc.model_validate(output) + + # Verify counts using validated object attributes + self.assertEqual(validated.mmif_count_by_status.total, 2) + self.assertIsInstance(validated.workflows, list) + finally: + import shutil + shutil.rmtree(dummy_dir) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index 10270525..66c77c38 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -1,3 +1,4 @@ +import argparse import contextlib import io import json @@ -6,18 +7,121 @@ import unittest.mock import mmif -from mmif.utils.cli import rewind -from mmif.utils.cli import source -from mmif.utils.cli import describe -from mmif.utils.cli import summarize - from mmif.serialize import Mmif -from mmif.vocabulary import DocumentTypes, AnnotationTypes - +from mmif.utils.cli import describe, rewind, source, summarize +from mmif.vocabulary import AnnotationTypes BASIC_MMIF_STRING = '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' +class BaseCliTestCase(unittest.TestCase): + """Base class for CLI module tests with common utilities.""" + + cli_module = None # Override in subclass + + def setUp(self): + """Set up common test fixtures.""" + if self.cli_module: + self.parser = self.cli_module.prep_argparser() + self.basic_mmif = Mmif(BASIC_MMIF_STRING) + self.maxDiff = None + + @staticmethod + def create_temp_mmif_file(mmif_obj): + """Create a temporary MMIF file for testing. 
+ + Args: + mmif_obj: Either a Mmif object or a dict/string to serialize + + Returns: + str: Path to the temporary file (caller must unlink) + """ + tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) + if isinstance(mmif_obj, Mmif): + content = mmif_obj.serialize(pretty=False) + else: + content = json.dumps(mmif_obj) if isinstance(mmif_obj, dict) else mmif_obj + tmp.write(content) + tmp.close() + return tmp.name + + def run_cli_capture_stdout(self, args_namespace): + """Run CLI module and capture stdout as parsed JSON. + + Args: + args_namespace: Namespace object with CLI arguments + + Returns: + dict: Parsed JSON output from stdout + """ + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + self.cli_module.main(args_namespace) + return json.loads(stdout.getvalue()) + + +class IOTestMixin: + """Mixin providing common I/O tests for CLI modules. + + Requires the test class to have: + - cli_module attribute + - basic_mmif attribute + - create_temp_mmif_file method + - run_cli_capture_stdout method + - expected_output_keys attribute (list of keys to check in output) + """ + + def test_file_input_stdout_output(self): + """Test reading from file and outputting to stdout.""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + args = argparse.Namespace( + MMIF_FILE=tmp_file, + output=None, + pretty=False, + help_schemas=None # For describe module + ) + output = self.run_cli_capture_stdout(args) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_file) + + def test_file_input_file_output(self): + """Test reading from file and outputting to file.""" + tmp_input = self.create_temp_mmif_file(self.basic_mmif) + tmp_output = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) + tmp_output.close() + try: + args = self.parser.parse_args([tmp_input, '-o', tmp_output.name]) + self.cli_module.main(args) + with open(tmp_output.name, 
'r') as f: + output = json.load(f) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_input) + os.unlink(tmp_output.name) + + def test_stdin_input_stdout_output(self): + """Test reading from stdin and outputting to stdout.""" + mmif_str = self.basic_mmif.serialize() + with unittest.mock.patch('sys.stdin', io.StringIO(mmif_str)), \ + unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace( + MMIF_FILE=None, + output=None, + pretty=False, + help_schemas=None # For describe module + ) + self.cli_module.main(args) + output = json.loads(stdout.getvalue()) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + + class TestCli(unittest.TestCase): def setUp(self) -> None: self.parser, _, _ = mmif.prep_argparser_and_subcmds() @@ -179,178 +283,72 @@ def test_app_rewind(self): self.assertIn('dummy_app_two', remaining_apps) -class TestDescribe(unittest.TestCase): +class TestDescribe(BaseCliTestCase, IOTestMixin): """Test suite for the describe CLI module.""" - - def setUp(self): - """Create test MMIF structures.""" - self.parser = describe.prep_argparser() - self.maxDiff = None - self.basic_mmif = Mmif(BASIC_MMIF_STRING) - - def create_temp_mmif_file(self, mmif_obj): - """Helper to create a temporary MMIF file.""" - tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) - if isinstance(mmif_obj, Mmif): - content_to_write = mmif_obj.serialize(pretty=False) - else: - content_to_write = json.dumps(mmif_obj) - tmp.write(content_to_write) - tmp.close() - return tmp.name - - def test_describe_single_mmif_empty(self): - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - 
self.assertEqual(result["stats"]["annotationCountByType"], {}) - finally: - os.unlink(tmp_file) - - def test_describe_single_mmif_one_app(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.appProfiling = {"runningTime": "0:00:01.234"} - view.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["app"], view.metadata.app) - self.assertEqual(app_exec["viewIds"], [view.id]) - self.assertEqual(app_exec["appProfiling"]["runningTimeMS"], 1234) - finally: - os.unlink(tmp_file) - - def test_describe_single_mmif_one_app_two_views(self): - view1 = self.basic_mmif.new_view() - view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view1.metadata.timestamp = "2024-01-01T12:00:00Z" - view1.new_annotation(AnnotationTypes.TimeFrame) - view2 = self.basic_mmif.new_view() - view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view2.metadata.timestamp = "2024-01-01T12:00:00Z" - view2.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["viewIds"], [view1.id, view2.id]) - finally: - os.unlink(tmp_file) - - def test_describe_single_mmif_error_view(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.error = {"message": "Something went wrong"} + + cli_module = describe + expected_output_keys = 
['workflowId', 'stats', 'apps'] + + def test_help_schemas_all(self): + """Test --help-schemas all""" + from mmif.utils.cli.describe import models_to_help + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace(help_schemas=['all'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + for m in models_to_help: + self.assertIn(m.__name__, output) + self.assertIn("$defs", output) + + def test_describe_main_directory(self): + """Test describe.main with a directory input""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Create two mmif files + with open(os.path.join(tmp_dir, '1.mmif'), 'w') as f: + f.write(self.basic_mmif.serialize()) + with open(os.path.join(tmp_dir, '2.mmif'), 'w') as f: + f.write(self.basic_mmif.serialize()) + + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + # MMIF_FILE argument expects a string path + args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schemas=None) + describe.main(args) + output_json = json.loads(stdout.getvalue()) + # Just verify valid JSON output was produced + self.assertIsInstance(output_json, dict) + self.assertTrue(len(output_json) > 0) + + def test_deprecated_functions(self): + """Test backward compatibility wrapper functions""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - self.assertEqual(len(result["stats"]["errorViews"]), 1) - finally: - os.unlink(tmp_file) - - @unittest.mock.patch('jsonschema.validators.validate') - def test_describe_single_mmif_with_unassigned_views(self, mock_validate): - raw_mmif = json.loads(self.basic_mmif.serialize()) - raw_mmif['views'].append({'id': 'v1', 'metadata': {'app': 
'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) - tmp_file = self.create_temp_mmif_file(raw_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result['stats']['appCount'], 1) - self.assertEqual(len(result['apps']), 2) - special_entry = result['apps'][-1] - self.assertEqual(special_entry['app'], 'http://apps.clams.ai/non-existing-app/v1') - self.assertEqual(len(special_entry['viewIds']), 2) - self.assertIn('v2', special_entry['viewIds']) - self.assertIn('v3', special_entry['viewIds']) + with self.assertWarns(DeprecationWarning): + describe.get_pipeline_specs(tmp_file) + with self.assertWarns(DeprecationWarning): + describe.generate_pipeline_identifier(tmp_file) finally: os.unlink(tmp_file) - def test_describe_collection_empty(self): - dummy_dir = 'dummy_mmif_collection' - os.makedirs(dummy_dir, exist_ok=True) - try: - output = mmif.utils.workflow_helper.describe_mmif_collection(dummy_dir) - expected = { - 'mmifCountByStatus': {'total': 0, 'successful': 0, 'withErrors': 0, 'withWarnings': 0, 'invalid': 0}, - 'workflows': [], - 'annotationCountByType': {} - } - self.assertEqual(output, expected) - finally: - os.rmdir(dummy_dir) - -class TestSummarize(unittest.TestCase): +class TestSummarize(BaseCliTestCase, IOTestMixin): """Test suite for the summarize CLI module.""" + + cli_module = summarize + expected_output_keys = ['mmif_version', 'documents', 'views'] - def setUp(self): - """Create test MMIF structures.""" - self.parser = summarize.prep_argparser() - self.basic_mmif = Mmif(BASIC_MMIF_STRING) - - def create_temp_mmif_file(self, mmif_obj): - """Helper to create a temporary MMIF file.""" - tmp = tempfile.NamedTemporaryFile(mode='w', 
suffix='.mmif', delete=False) - tmp.write(mmif_obj.serialize(pretty=False)) - tmp.close() - return tmp.name - - def test_summarize_positional_input(self): + def test_summarize_validates_content(self): + """Test that summarize produces expected content.""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: - args = self.parser.parse_args([tmp_file]) - # args.output is None by default, which means stdout in open_cli_io_arg - summarize.main(args) - output = json.loads(stdout.getvalue()) - self.assertIn('mmif_version', output) - self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") + output = self.run_cli_capture_stdout( + argparse.Namespace(MMIF_FILE=tmp_file, output=None, pretty=False) + ) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") finally: os.unlink(tmp_file) - def test_summarize_output_file(self): - tmp_input = self.create_temp_mmif_file(self.basic_mmif) - tmp_output = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) - tmp_output.close() - try: - args = self.parser.parse_args([tmp_input, "-o", tmp_output.name]) - summarize.main(args) - # args.output is a path string now; no file handle to close. 
- with open(tmp_output.name, 'r') as f: - output = json.load(f) - self.assertIn('mmif_version', output) - finally: - os.unlink(tmp_input) - os.unlink(tmp_output.name) - - def test_summarize_stdin(self): - mmif_str = self.basic_mmif.serialize() - import argparse - - with unittest.mock.patch('sys.stdin', io.StringIO(mmif_str)), \ - unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: - # MMIF_FILE defaults to None -> stdin - # output defaults to None -> stdout - args = argparse.Namespace(MMIF_FILE=None, output=None, pretty=False) - summarize.main(args) - - output = json.loads(stdout.getvalue()) - self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") - if __name__ == '__main__': unittest.main() From 9ee0bd5ac5ecf7fee0b80e5e372b7fec8708dd00 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 16 Feb 2026 05:32:33 -0500 Subject: [PATCH 42/48] added human-friendly summary for pydantic classes in `describe --help` --- mmif/utils/cli/__init__.py | 136 +++++++++++++++++++++++++++++++------ mmif/utils/cli/describe.py | 52 +++++++------- tests/test_utils_cli.py | 34 +++++++--- 3 files changed, 162 insertions(+), 60 deletions(-) diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 935ab0a7..f24248f2 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -6,16 +6,19 @@ import io import os import sys -from typing import Iterator, Optional, TextIO, cast +from typing import Iterator, Optional, TextIO, Type, Union, cast, get_args, get_origin + +from pydantic import BaseModel @contextlib.contextmanager -def open_cli_io_arg(path_or_dash: Optional[str], - mode: str = 'r', - encoding: Optional[str] = None, - errors: Optional[str] = None, - default_stdin: bool = False, - ) -> Iterator[TextIO]: +def open_cli_io_arg( + path_or_dash: Optional[str], + mode: str = "r", + encoding: Optional[str] = None, + errors: Optional[str] = None, + default_stdin: bool = False, +) -> Iterator[TextIO]: """ Context manager for opening 
files with stdin/stdout support. @@ -55,10 +58,10 @@ def open_cli_io_arg(path_or_dash: Optional[str], f.write(content) """ # Valid text modes for file operations - _READ_FLAGS = frozenset({'r', '+'}) - _WRITE_FLAGS = frozenset({'w', 'a', 'x', '+'}) + _READ_FLAGS = frozenset({"r", "+"}) + _WRITE_FLAGS = frozenset({"w", "a", "x", "+"}) - if 'b' in mode: + if "b" in mode: raise ValueError( f"Binary mode '{mode}' is not supported. " "Use text modes ('r', 'w', 'a', 'x') instead." @@ -67,9 +70,7 @@ def open_cli_io_arg(path_or_dash: Optional[str], needs_read = bool(set(mode) & _READ_FLAGS) needs_write = bool(set(mode) & _WRITE_FLAGS) - should_use_stdio = path_or_dash == '-' or ( - path_or_dash is None and default_stdin - ) + should_use_stdio = path_or_dash == "-" or (path_or_dash is None and default_stdin) file_handle: Optional[TextIO] = None should_close = False @@ -84,11 +85,7 @@ def open_cli_io_arg(path_or_dash: Optional[str], if needs_read: # Check for missing input when stdin is a terminal - if ( - path_or_dash is None - and default_stdin - and sys.stdin.isatty() - ): + if path_or_dash is None and default_stdin and sys.stdin.isatty(): raise SystemExit("error: No input provided.") file_handle = sys.stdin @@ -97,14 +94,15 @@ def open_cli_io_arg(path_or_dash: Optional[str], else: raise ValueError( - f"Mode '{mode}' not supported with stdin/stdout " - "(use 'r' or 'w')" + f"Mode '{mode}' not supported with stdin/stdout (use 'r' or 'w')" ) elif isinstance(path_or_dash, str): if needs_read and not os.path.exists(path_or_dash): raise FileNotFoundError(f"Input path does not exist: {path_or_dash}") - file_handle = cast(TextIO, io.open(path_or_dash, mode, encoding=encoding, errors=errors)) + file_handle = cast( + TextIO, io.open(path_or_dash, mode, encoding=encoding, errors=errors) + ) should_close = True elif path_or_dash is None: @@ -126,6 +124,102 @@ def open_cli_io_arg(path_or_dash: Optional[str], file_handle.close() +def generate_model_summary(model: Type[BaseModel], 
indent: int = 0) -> str: + lines = [] + prefix = " " * indent + + # model_fields is a dictionary of FieldInfo objects + for name, field in model.model_fields.items(): + # Get the alias if available, otherwise use the field name + field_name = field.alias if field.alias else name + + # Get type annotation + type_annotation = field.annotation + + def format_type(t) -> str: + origin = get_origin(t) + args = get_args(t) + + # Handle Optional (Union[T, None]) + if origin is Union and type(None) in args: + non_none_args = [arg for arg in args if arg is not type(None)] + if len(non_none_args) == 1: + return f"{format_type(non_none_args[0])}, optional" + + # Handle List + if origin is list: + if args: + return f"[{format_type(args[0])}]" + return "[]" + + # Handle Dict + if origin is dict: + return "obj" + + # Handle Pydantic Models (Custom Classes) + if isinstance(t, type) and issubclass(t, BaseModel): + return "obj" + + # Handle basic types and cleanup + t_str = str(t) + if t_str.startswith(" 1 + and isinstance(args[1], type) + and issubclass(args[1], BaseModel) + ): + nested_model = args[1] + + if nested_model: + lines.append(generate_model_summary(nested_model, indent + 4)) + + return "\n".join(lines) + + # keep imports of CLI modules for historical reasons # keep them here in the bottom to avoid circular imports from mmif.utils.cli import rewind diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index bb226a81..b8c79ced 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -3,11 +3,9 @@ import sys import textwrap from pathlib import Path -from typing import Dict, Type, Union, cast +from typing import Union, cast -from pydantic import BaseModel - -from mmif.utils.cli import open_cli_io_arg +from mmif.utils.cli import open_cli_io_arg, generate_model_summary # gen_param_hash is imported for backward compatibility from mmif.utils.workflow_helper import ( @@ -18,12 +16,6 @@ generate_workflow_identifier, ) -models_to_help = 
[SingleMmifDesc, CollectionMmifDesc] -model_modules = set(model.__module__ for model in models_to_help) -def get_all_models() -> Dict[str, Type[BaseModel]]: - return { - name: cls for name, cls in models_to_help - } def get_pipeline_specs(mmif_file: Union[str, Path]): import warnings @@ -49,7 +41,15 @@ def describe_argparser(): This command extracts workflow information from a single MMIF file or a directory of MMIF files. The output is serialized as JSON. - Use `--help-schemas` to inspect the structure of the JSON output. + Output Schemas: + + 1. Single MMIF File (mmif-file): +{generate_model_summary(SingleMmifDesc, indent=4)} + + 2. MMIF Collection (mmif-dir): +{generate_model_summary(CollectionMmifDesc, indent=4)} + + Use `--help-schema` to inspect the full JSON schema for a specific output type. """) return oneliner, additional @@ -79,13 +79,11 @@ def prep_argparser(**kwargs): help="Pretty-print JSON output" ) parser.add_argument( - "--help-schemas", - nargs="*", - choices=["all"] + [m.__name__ for m in models_to_help], + "--help-schema", + nargs=1, + choices=["mmif-file", "mmif-dir"], metavar="SCHEMA_NAME", - help=f"Print the JSON schema for the output. For human-readable documentation, " - f"visit https://clams.ai/mmif-python and see the following modules: " - f"{', '.join(model_modules)}.\nOptions: all, {', '.join([m.__name__ for m in models_to_help])}." + help="Print the JSON schema for the output. Options: mmif-file, mmif-dir." ) return parser @@ -97,19 +95,15 @@ def main(args): :func:`describe_single_mmif` (for single file input) or :func:`describe_mmif_collection` (for directory input). 
""" - if hasattr(args, 'help_schemas') and args.help_schemas is not None: - models_map = {m.__name__: m for m in models_to_help} - to_show = [] - if len(args.help_schemas) == 0 or 'all' in args.help_schemas: - to_show = [m.__name__ for m in models_to_help] - else: - to_show = args.help_schemas + if hasattr(args, 'help_schema') and args.help_schema is not None: + schema_name = args.help_schema[0] + if schema_name == 'mmif-file': + model_cls = SingleMmifDesc + elif schema_name == 'mmif-dir': + model_cls = CollectionMmifDesc - for name in to_show: - model_cls = models_map[name] - schema = model_cls.model_json_schema() - print(json.dumps(schema, indent=2)) - print() + schema = model_cls.model_json_schema() + print(json.dumps(schema, indent=2)) sys.exit(0) output = {} diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index 66c77c38..dd33fec2 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -78,7 +78,7 @@ def test_file_input_stdout_output(self): MMIF_FILE=tmp_file, output=None, pretty=False, - help_schemas=None # For describe module + help_schema=None # For describe module ) output = self.run_cli_capture_stdout(args) self.assertIsInstance(output, dict) @@ -113,7 +113,7 @@ def test_stdin_input_stdout_output(self): MMIF_FILE=None, output=None, pretty=False, - help_schemas=None # For describe module + help_schema=None # For describe module ) self.cli_module.main(args) output = json.loads(stdout.getvalue()) @@ -289,18 +289,32 @@ class TestDescribe(BaseCliTestCase, IOTestMixin): cli_module = describe expected_output_keys = ['workflowId', 'stats', 'apps'] - def test_help_schemas_all(self): - """Test --help-schemas all""" - from mmif.utils.cli.describe import models_to_help + def test_help_schema(self): + """Test --help-schema with different options""" + from mmif.utils.workflow_helper import SingleMmifDesc, CollectionMmifDesc + + # Test mmif-file + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = 
argparse.Namespace(help_schema=['mmif-file'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + # Verify SingleMmifDesc schema keys are present + self.assertIn("workflowId", output) + self.assertIn("stats", output) + self.assertIn("apps", output) + + # Test mmif-dir with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: - args = argparse.Namespace(help_schemas=['all'], MMIF_FILE=None, output=None, pretty=False) + args = argparse.Namespace(help_schema=['mmif-dir'], MMIF_FILE=None, output=None, pretty=False) with self.assertRaises(SystemExit) as cm: describe.main(args) self.assertEqual(cm.exception.code, 0) output = stdout.getvalue() - for m in models_to_help: - self.assertIn(m.__name__, output) - self.assertIn("$defs", output) + # Verify CollectionMmifDesc schema keys are present + self.assertIn("mmifCountByStatus", output) + self.assertIn("workflows", output) def test_describe_main_directory(self): """Test describe.main with a directory input""" @@ -313,7 +327,7 @@ def test_describe_main_directory(self): with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: # MMIF_FILE argument expects a string path - args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schemas=None) + args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schema=None) describe.main(args) output_json = json.loads(stdout.getvalue()) # Just verify valid JSON output was produced From 96a8b2e18406eb3d5a0bc02532015b5a0b2fff1a Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 25 Feb 2026 15:40:19 -0500 Subject: [PATCH 43/48] updated dev document --- CONTRIBUTING.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 49d47d49..7c4e00ef 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,6 +69,17 @@ python3 build-tools/docs.py The output will be 
in `docs-test`. For more options, run `python build-tools/docs.py --help`. +> [!NOTE] +> Since the documentation build process is relying on the working `mmif` package, one must "build" the package first before building the documentation. This can be done by running +> ```bash +> rm VERSION* # remove existing VERSION file if exists +> make devversion # creates a dummy VERSION file +> pip install -r requirements.dev # install dev dependencies +> python setup.py sdist # build the package (will download auto-generate subpackges like `mmif.res` and `mmif.ver`) + +> [!NOTE] +> running `build-tools/docs.py` in "local testing" mode will overwrite any existing VERSION file with a dummy version. + ### API Documentation (autodoc) As of 2026 (since the next version of 1.2.1), API documentation is **automatically generated** using `sphinx-apidoc`. When you run the documentation build: From bb8472574f8c4cbbbe2c8f85021561dd796aa824 Mon Sep 17 00:00:00 2001 From: kelleyl Date: Thu, 26 Feb 2026 14:09:30 -0500 Subject: [PATCH 44/48] sort frame numbers --- mmif/utils/video_document_helper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py index a1b9c59a..9da2a683 100644 --- a/mmif/utils/video_document_helper.py +++ b/mmif/utils/video_document_helper.py @@ -85,6 +85,8 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], :return: frames as a list of :py:class:`~numpy.ndarray` or :py:class:`~PIL.Image.Image` """ import cv2 + # sort frame numbers + framenums = sorted(framenums) if as_PIL: from PIL import Image frames = [] From 3ae9eca479c34f9d7b36b6a60d4d9fef32ef0b99 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Thu, 26 Feb 2026 18:59:21 -0500 Subject: [PATCH 45/48] adding `Z` suffix in timestamps, always defaulting to UTC (docker def) --- mmif/serialize/mmif.py | 4 ++-- mmif/serialize/model.py | 5 ++++- tests/test_serialize.py | 19 +++++++++++++++++++ 3 files changed, 25 
insertions(+), 3 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 68478c7f..245c96aa 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -14,7 +14,7 @@ import math import warnings from collections import defaultdict -from datetime import datetime +from datetime import datetime, timezone from typing import Any, List, Union, Optional, Dict, cast, Iterator import jsonschema.validators @@ -433,7 +433,7 @@ def new_view(self) -> View: """ new_view = View() new_view.id = self.new_view_id() - new_view.metadata.timestamp = datetime.now() + new_view.metadata.timestamp = datetime.now(timezone.utc) self.add_view(new_view) return new_view diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 1bec7b29..95fdc28c 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -402,7 +402,10 @@ def default(self, obj: 'MmifObject'): if hasattr(obj, '_serialize'): return obj._serialize() elif hasattr(obj, 'isoformat'): # for datetime objects - return obj.isoformat() + s = obj.isoformat() + if s.endswith('+00:00'): + s = s[:-6] + 'Z' + return s elif hasattr(obj, '__str__'): return str(obj) else: diff --git a/tests/test_serialize.py b/tests/test_serialize.py index 9e857a00..f5b0846f 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -608,6 +608,25 @@ def test_get_label(self): a = v.new_annotation(AnnotationTypes.BoundingBox) _ = a._get_label() + def test_timestamp_uses_utc_with_z_suffix(self): + """Test that timestamps are in UTC with 'Z' suffix to avoid ambiguity""" + from datetime import timezone + mmif_obj = Mmif(validate=False) + + new_view = mmif_obj.new_view() + new_view.metadata.app = "http://test.app" + + # Verify the timestamp is timezone-aware and uses UTC + self.assertIsNotNone(new_view.metadata.timestamp) + self.assertIsNotNone(new_view.metadata.timestamp.tzinfo) + self.assertEqual(new_view.metadata.timestamp.tzinfo, timezone.utc) + + # Verify serialization uses 'Z' suffix instead of 
'+00:00' + serialized = json.loads(mmif_obj.serialize()) + ts = serialized['views'][0]['metadata']['timestamp'] + self.assertTrue(ts.endswith('Z')) + self.assertNotIn('+00:00', ts) + def test_get_anchor_point(self): mmif = Mmif(validate=False) v1 = mmif.new_view() From a6b1982cab03a5fea1cd453326b794d00c2818dd Mon Sep 17 00:00:00 2001 From: kelleyl Date: Thu, 5 Mar 2026 12:50:46 -0500 Subject: [PATCH 46/48] adding deduplication to frame extraction --- mmif/utils/video_document_helper.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py index 9071d1e6..b7547263 100644 --- a/mmif/utils/video_document_helper.py +++ b/mmif/utils/video_document_helper.py @@ -85,19 +85,20 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], :return: frames as a list of :py:class:`~numpy.ndarray` or :py:class:`~PIL.Image.Image` """ import cv2 - # sort frame numbers - framenums = sorted(framenums) + # deduplicate and sort frame numbers for extraction, then map back to original order + original_framenums = list(framenums) + unique_framenums = sorted(set(original_framenums)) if as_PIL: from PIL import Image - frames = [] + unique_frames = {} video = capture(video_document) cur_f = 0 tot_fcount = video_document.get_property(FRAMECOUNT_DOCPROP_KEY) # when the target frame is more than this frames away, fast-forward instead of reading frame by frame - # this is sanity-checked with a small number of video samples + # this is sanity-checked with a small number of video samples # (frame-by-frame ndarrays are compared with fast-forwarded ndarrays) - skip_threadhold = 1000 - framenumi = iter(framenums) # make sure that it's actually an iterator, in case a list is passed + skip_threadhold = 1000 + framenumi = iter(unique_framenums) next_target_f = next(framenumi, None) from wurlitzer import pipes as cpipes ffmpeg_errs = StringIO() @@ -116,14 +117,15 @@ def 
extract_frames_as_images(video_document: Document, framenums: Iterable[int], sec = convert(cur_f, 'f', 's', video_document.get_property(FPS_DOCPROP_KEY)) warnings.warn(f'Frame #{cur_f} ({sec}s) could not be read from the video {video_document.id} @ {video_document.location} .') else: - frames.append(Image.fromarray(frame[:, :, ::-1]) if as_PIL else frame) + unique_frames[cur_f] = Image.fromarray(frame[:, :, ::-1]) if as_PIL else frame next_target_f = next(framenumi, None) cur_f += 1 ffmpeg_err_str = ffmpeg_errs.getvalue() if ffmpeg_err_str and record_ffmpeg_errors: warnings.warn(f'FFmpeg output during extracting frames: {ffmpeg_err_str}') video.release() - return frames + # return frames in original input order, duplicating where needed + return [unique_frames[f] for f in original_framenums if f in unique_frames] def get_mid_framenum(mmif: Mmif, time_frame: Annotation) -> int: From 09c7e324928a065ed0dd4716746c532262e44dd4 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 10 Mar 2026 14:26:17 -0400 Subject: [PATCH 47/48] added three basic modes for sampling from TF --- mmif/utils/video_document_helper.py | 248 +++++++++++++++++++++++++--- tests/test_utils.py | 149 +++++++++-------- 2 files changed, 300 insertions(+), 97 deletions(-) diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py index b7547263..1ff6df40 100644 --- a/mmif/utils/video_document_helper.py +++ b/mmif/utils/video_document_helper.py @@ -1,5 +1,7 @@ +import contextvars import importlib import sys +from enum import Enum import math import warnings @@ -12,13 +14,22 @@ from mmif.utils.timeunit_helper import convert from mmif.vocabulary import DocumentTypes -for cv_dep in ('cv2', 'ffmpeg', 'PIL', 'wurlitzer'): +_CV_DEPS = ('cv2', 'PIL', 'wurlitzer') +_cv_import_warning = ( + 'Optional package "{}" is not found. 
' + 'You might want to install Computer-Vision dependencies ' + 'by running `pip install mmif-python[cv]=={}`' +) + + +def _check_cv_dep(dep): + """Import a CV dependency, raising ImportError with a helpful message.""" try: - importlib.__import__(cv_dep) + return importlib.__import__(dep) except ImportError as e: - warnings.warn(f"Optional package \"{e.name}\" is not found. " - f"You might want to install Computer-Vision dependencies " - f"by running `pip install mmif-python[cv]=={mmif.__version__}`") + raise ImportError( + _cv_import_warning.format(e.name, mmif.__version__) + ) from e FPS_DOCPROP_KEY = 'fps' @@ -27,6 +38,36 @@ DURATIONUNIT_DOCPROP_KEY = 'durationTimeUnit' +class SamplingMode(Enum): + """Determines how timepoints are selected from a TimeFrame.""" + REPRESENTATIVES = "representatives" + SINGLE = "single" + ALL = "all" + + +SAMPLING_MODE_DESCRIPTIONS = { + SamplingMode.REPRESENTATIVES: ( + "uses all representative timepoints if present, " + "otherwise skips the TimeFrame." + ), + SamplingMode.SINGLE: ( + "uses the middle representative if present, otherwise " + "extracts a frame from the midpoint of the start/end " + "interval (midpoint is calculated by floor division " + "of the sum of start and end)." + ), + SamplingMode.ALL: ( + "uses all target timepoints if present, otherwise " + "extracts all frames from the time interval." + ), +} +SAMPLING_MODE_DEFAULT = SamplingMode.REPRESENTATIVES + + +_sampling_mode = contextvars.ContextVar( + 'sampling_mode', default=SamplingMode.REPRESENTATIVES) + + def capture(video_document: Document): """ Captures a video file using OpenCV and adds fps, frame count, and duration as properties to the document. 
@@ -34,7 +75,7 @@ def capture(video_document: Document): :param video_document: :py:class:`~mmif.serialize.annotation.Document` instance that holds a video document (``"@type": ".../VideoDocument/..."``) :return: `OpenCV VideoCapture `_ object """ - import cv2 # pytype: disable=import-error + cv2 = _check_cv_dep('cv2') if video_document is None or video_document.at_type != DocumentTypes.VideoDocument: raise ValueError(f'The document does not exist.') @@ -59,8 +100,8 @@ def get_framerate(video_document: Document) -> float: if video_document is None or video_document.at_type != DocumentTypes.VideoDocument: raise ValueError(f'The document does not exist.') - framerate_keys = (FPS_DOCPROP_KEY, - 'framerate', 'frameRate', 'frame_rate', 'frame-rate', + framerate_keys = (FPS_DOCPROP_KEY, + 'framerate', 'frameRate', 'frame_rate', 'frame-rate', 'framespersecond', 'framesPerSecond', 'frames_per_second', 'frames-per-second', 'framepersecond', 'framePerSecond', 'frame_per_second', 'frame-per-second') for k in framerate_keys: @@ -84,12 +125,12 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], :param record_ffmpeg_errors: if True, records and warns about FFmpeg stderr output during extraction :return: frames as a list of :py:class:`~numpy.ndarray` or :py:class:`~PIL.Image.Image` """ - import cv2 + cv2 = _check_cv_dep('cv2') # deduplicate and sort frame numbers for extraction, then map back to original order original_framenums = list(framenums) unique_framenums = sorted(set(original_framenums)) if as_PIL: - from PIL import Image + Image = _check_cv_dep('PIL').Image unique_frames = {} video = capture(video_document) cur_f = 0 @@ -100,7 +141,7 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], skip_threadhold = 1000 framenumi = iter(unique_framenums) next_target_f = next(framenumi, None) - from wurlitzer import pipes as cpipes + cpipes = _check_cv_dep('wurlitzer').pipes ffmpeg_errs = StringIO() with 
cpipes(stderr=ffmpeg_errs, stdout=sys.stdout): while True: @@ -129,7 +170,11 @@ def extract_frames_as_images(video_document: Document, framenums: Iterable[int], def get_mid_framenum(mmif: Mmif, time_frame: Annotation) -> int: - warnings.warn('This function is deprecated. Use ``get_representative_framenums()`` instead.', DeprecationWarning, stacklevel=2) + """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) return _get_mid_framenum(mmif, time_frame) @@ -149,6 +194,9 @@ def _get_mid_framenum(mmif: Mmif, time_frame: Annotation) -> int: def extract_mid_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False): """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + Extracts the middle frame of a time interval annotation as a numpy ndarray. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance @@ -156,21 +204,25 @@ def extract_mid_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False): :param as_PIL: return :py:class:`~PIL.Image.Image` instead of :py:class:`~numpy.ndarray` :return: frame as a :py:class:`numpy.ndarray` or :py:class:`PIL.Image.Image` """ - warnings.warn('This function is deprecated. Use ``extract_representative_frames()`` instead.', DeprecationWarning, stacklevel=2) + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) vd = mmif[time_frame.get_property('document')] return extract_frames_as_images(vd, [get_mid_framenum(mmif, time_frame)], as_PIL=as_PIL)[0] def get_representative_framenums(mmif: Mmif, time_frame: Annotation) -> List[int]: """ - Calculates the representative frame numbers from an annotation. To pick the representative frames, it first looks - up the ``representatives`` property of the ``TimeFrame`` annotation. If it is not found, it will calculate the + .. 
deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + + Calculates the representative frame numbers from an annotation. To pick the representative frames, it first looks + up the ``representatives`` property of the ``TimeFrame`` annotation. If it is not found, it will calculate the number of the middle frame. :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` instance that holds a time interval annotation containing a `representatives` property (``"@type": ".../TimeFrame/..."``) :return: representative frame number as an integer """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) if 'representatives' not in time_frame.properties: return [_get_mid_framenum(mmif, time_frame)] timeunit = time_frame.get_property('timeUnit') @@ -189,9 +241,13 @@ def get_representative_framenums(mmif: Mmif, time_frame: Annotation) -> List[int def get_representative_framenum(mmif: Mmif, time_frame: Annotation) -> int: """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + A thin wrapper around :py:func:`get_representative_framenums` to return a single representative frame number. Always return the first frame number found. """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) try: return get_representative_framenums(mmif, time_frame)[0] except IndexError: @@ -200,6 +256,9 @@ def get_representative_framenum(mmif: Mmif, time_frame: Annotation) -> int: def extract_representative_frame(mmif: Mmif, time_frame: Annotation, as_PIL: bool = False, first_only: bool = True): """ + .. deprecated:: + Use :py:func:`extract_frames_by_mode` instead. + Extracts the representative frame of an annotation as a numpy ndarray or PIL Image. 
:param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance @@ -208,11 +267,125 @@ def extract_representative_frame(mmif: Mmif, time_frame: Annotation, as_PIL: boo :param first_only: return the first representative frame only :return: frame as a :py:class:`numpy.ndarray` or :py:class:`PIL.Image.Image` """ + warnings.warn('This function is deprecated. Use ``extract_frames_by_mode()`` instead.', DeprecationWarning, stacklevel=2) video_document = mmif[time_frame.get_property('document')] rep_frame_num = [get_representative_framenum(mmif, time_frame)] if first_only else get_representative_framenums(mmif, time_frame) return extract_frames_as_images(video_document, rep_frame_num, as_PIL=as_PIL)[0] +def _tp_ids_to_framenums(mmif: Mmif, tp_ids: List[str]) -> List[int]: + """ + Converts a list of timepoint annotation IDs to frame numbers. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param tp_ids: list of timepoint annotation IDs + :return: list of frame numbers + """ + return [ + int(convert_timepoint(mmif, mmif[tp_id], 'f')) + for tp_id in tp_ids + ] + + +def _resolve_video_document(mmif: Mmif, time_frame: Annotation): + """ + Resolves the video document associated with a TimeFrame. + Checks the TimeFrame's own ``document`` property first, + then falls back to the ``document`` property of the first + target timepoint. 
+ + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: :py:class:`~mmif.serialize.annotation.Document` + """ + if 'document' in time_frame.properties: + return mmif[time_frame.get_property('document')] + if 'targets' in time_frame.properties: + targets = time_frame.get_property('targets') + if targets: + tp = mmif[targets[0]] + return mmif[tp.get_property('document')] + raise ValueError( + f'Cannot resolve video document for TimeFrame ' + f'{time_frame.id}.') + + +def _timeframe_to_frame_range( + mmif: Mmif, time_frame: Annotation +) -> Tuple[int, int]: + """ + Converts a TimeFrame's start/end to frame numbers. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame with ``start``, ``end``, + ``timeUnit``, and ``document`` properties + :return: tuple of (start_frame, end_frame) + """ + start, end = convert_timeframe(mmif, time_frame, 'f') + return int(start), int(end) + + +def _sample_all(mmif: Mmif, time_frame: Annotation) -> List[int]: + """ + Samples all frame numbers from a TimeFrame. Uses all + ``targets`` if present, otherwise generates every frame + in the start/end interval. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list of frame numbers + """ + if 'targets' in time_frame.properties: + return _tp_ids_to_framenums( + mmif, time_frame.get_property('targets')) + start, end = _timeframe_to_frame_range(mmif, time_frame) + return sample_frames(start, end) + + +def _sample_representatives( + mmif: Mmif, time_frame: Annotation +) -> List[int]: + """ + Samples frame numbers from a TimeFrame's representatives. + Returns an empty list if ``representatives`` is not present + (skips the TimeFrame). 
+ + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list of frame numbers (empty if no representatives) + """ + if 'representatives' in time_frame.properties: + reps = time_frame.get_property('representatives') + if reps: + return _tp_ids_to_framenums(mmif, reps) + return [] + + +def _sample_single(mmif: Mmif, time_frame: Annotation) -> List[int]: + """ + Samples a single frame number from a TimeFrame. Uses the + middle representative if ``representatives`` is present, + otherwise computes the midpoint of the start/end interval + via floor division. + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` + instance of a TimeFrame + :return: list containing a single frame number + """ + if 'representatives' in time_frame.properties: + reps = time_frame.get_property('representatives') + if reps: + mid = reps[len(reps) // 2] + return _tp_ids_to_framenums(mmif, [mid]) + start, end = _timeframe_to_frame_range(mmif, time_frame) + return [(start + end) // 2] + + def extract_target_frames(mmif: Mmif, annotation: Annotation, min_timepoints: int = 0, max_timepoints: int = sys.maxsize, fraction: float = 1.0, as_PIL: bool = False): """ Extracts frames corresponding to the timepoints listed in the ``targets`` property of an annotation. @@ -245,19 +418,46 @@ def extract_target_frames(mmif: Mmif, annotation: Annotation, min_timepoints: in indices = [int(i * (num_targets - 1) / (count - 1)) for i in range(count)] selected_target_ids = [targets[i] for i in indices] - selected_timepoints = [mmif[target_id] for target_id in selected_target_ids] - - # Assuming all targets use the same document as the parent annotation if it exists, - # otherwise we'll have to check each timepoint. convert_timepoint handles document lookup. 
- frame_nums = [int(convert_timepoint(mmif, tp, 'f')) for tp in selected_timepoints] - - # Get the document from the first selected timepoint to use with extract_frames_as_images - video_doc = mmif[selected_timepoints[0].get_property('document')] + frame_nums = _tp_ids_to_framenums(mmif, selected_target_ids) + video_doc = _resolve_video_document(mmif, annotation) images = extract_frames_as_images(video_doc, frame_nums, as_PIL=as_PIL) - return images, selected_target_ids +def extract_frames_by_mode( + mmif: Mmif, + time_frame: Annotation, + mode: Union[SamplingMode, None] = None, + as_PIL: bool = False +) -> List: + """ + Extracts frames from a TimeFrame annotation based on a + sampling mode. If ``mode`` is not specified, uses the + context-level default (set via + :py:data:`_sampling_mode` context variable). + + :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance + :param time_frame: TimeFrame annotation to sample from + :param mode: :py:class:`SamplingMode`, or None to use + the context default + :param as_PIL: return PIL Images instead of ndarrays + :return: list of frames (may be empty for + ``REPRESENTATIVES`` mode when no representatives exist) + """ + if mode is None: + mode = _sampling_mode.get() + if mode == SamplingMode.ALL: + frame_nums = _sample_all(mmif, time_frame) + elif mode == SamplingMode.REPRESENTATIVES: + frame_nums = _sample_representatives(mmif, time_frame) + else: + frame_nums = _sample_single(mmif, time_frame) + if not frame_nums: + return [] + video_doc = _resolve_video_document(mmif, time_frame) + return extract_frames_as_images(video_doc, frame_nums, as_PIL=as_PIL) + + def sample_frames(start_frame: int, end_frame: int, sample_rate: float = 1) -> List[int]: """ Helper function to sample frames from a time interval. 
diff --git a/tests/test_utils.py b/tests/test_utils.py index fb239d66..1d903b10 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -144,82 +144,85 @@ def test_extract_frames_as_images(self): self.assertEqual(4, len(frame_list)) self.assertEqual(3, len(new_target_images)) - def test_extract_target_frames(self): - # Create 10 timepoints + def test_sample_all(self): tps = [] for i in range(10): - tp = self.a_view.new_annotation(AnnotationTypes.TimePoint, timePoint=i*100, timeUnit='frame', document=self.video_doc.id) + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) tps.append(tp) - - # Create an annotation with targets - parent_ann = self.a_view.new_annotation(AnnotationTypes.TimeFrame, targets=[tp.id for tp in tps]) - - # Test fraction=0.5 (should get 5 timepoints: indices 0, 2, 4, 6, 9) - # indices = [int(i * 9 / 4) for i in range(5)] = [0, 2, 4, 6, 9] - images, ids = vdh.extract_target_frames(self.mmif_obj, parent_ann, fraction=0.5) - self.assertEqual(5, len(images)) - self.assertEqual(5, len(ids)) - self.assertEqual(tps[0].id, ids[0]) - self.assertEqual(tps[9].id, ids[-1]) - - # Test min_timepoints=8, fraction=0.1 (should get 8) - images, ids = vdh.extract_target_frames(self.mmif_obj, parent_ann, min_timepoints=8, fraction=0.1) - self.assertEqual(8, len(images)) - - # Test max_timepoints=3, fraction=1.0 (should get 3) - images, ids = vdh.extract_target_frames(self.mmif_obj, parent_ann, max_timepoints=3, fraction=1.0) - self.assertEqual(3, len(images)) - self.assertEqual(tps[0].id, ids[0]) - self.assertEqual(tps[4].id, ids[1]) # int(1 * 9 / 2) = 4 - self.assertEqual(tps[9].id, ids[2]) - - # Test all targets if min_timepoints > num_targets - images, ids = vdh.extract_target_frames(self.mmif_obj, parent_ann, min_timepoints=20) - self.assertEqual(10, len(images)) - self.assertEqual([tp.id for tp in tps], ids) - - def test_extract_target_frames_with_sample(self): - # Load from 
the sample file - swt_path = pathlib.Path(__file__).parent / 'samples' / '1.0' / 'swt.mmif' - with open(swt_path) as f: - mmif_obj = Mmif(f.read()) - - # Find a timeframe with targets - tf = None - for view in mmif_obj.views: - for ann in view.annotations: - if ann.at_type == AnnotationTypes.TimeFrame and 'targets' in ann.properties: - tf = ann - break - if tf: break - - self.assertIsNotNone(tf, "Could not find a TimeFrame with targets in swt.mmif") - - # Update document location to avoid error, although we'll mock extraction - # because we don't have the original video referenced in swt.mmif - video_doc = mmif_obj[tf.get_property('document')] - video_doc.location = f"file://{pathlib.Path(__file__).parent}/black-2997fps.mp4" - video_doc.add_property('fps', 29.97) - video_doc.add_property('frameCount', 1000) - - # Test with max_timepoints=5 - # We mock extract_frames_as_images because we don't really need to decode - # frames to test the selection logic here, and we don't want to rely on CV2/FFMPEG - # being fully functional for a dummy video in this test environment if possible - with mock.patch('mmif.utils.video_document_helper.extract_frames_as_images') as mock_extract: - mock_extract.return_value = [f"img_{i}" for i in range(5)] - images, ids = vdh.extract_target_frames(mmif_obj, tf, max_timepoints=5) - - self.assertEqual(len(images), 5) - self.assertEqual(len(ids), 5) - # Verify that IDs are from the targets list - targets = tf.get_property('targets') - for id in ids: - self.assertIn(id, targets) - - # Verify the first and last targets are selected (if count > 1) - self.assertEqual(ids[0], targets[0]) - self.assertEqual(ids[-1], targets[-1]) + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps]) + + frame_nums = vdh._sample_all(self.mmif_obj, parent_ann) + self.assertEqual(10, len(frame_nums)) + self.assertEqual([i * 100 for i in range(10)], frame_nums) + + # start/end fallback (no targets) + parent_ann2 = 
self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + start=0, end=10, timeUnit='frame', + document=self.video_doc.id) + frame_nums2 = vdh._sample_all(self.mmif_obj, parent_ann2) + self.assertEqual(list(range(10)), frame_nums2) + + def test_sample_representatives(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + reps = [tps[2].id, tps[5].id, tps[8].id] + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps], + representatives=reps) + + # should use representatives + frame_nums = vdh._sample_representatives( + self.mmif_obj, parent_ann) + self.assertEqual(3, len(frame_nums)) + self.assertEqual([200, 500, 800], frame_nums) + + # without representatives, should return empty (skip) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps]) + frame_nums2 = vdh._sample_representatives( + self.mmif_obj, parent_ann2) + self.assertEqual([], frame_nums2) + + def test_sample_single(self): + tps = [] + for i in range(10): + tp = self.a_view.new_annotation( + AnnotationTypes.TimePoint, + timePoint=i * 100, timeUnit='frame', + document=self.video_doc.id) + tps.append(tp) + reps = [tps[2].id, tps[5].id, tps[8].id] + parent_ann = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + targets=[tp.id for tp in tps], + representatives=reps) + + # should pick middle representative (index 1 of 3 = tps[5]) + frame_nums = vdh._sample_single( + self.mmif_obj, parent_ann) + self.assertEqual([500], frame_nums) + + # start/end fallback (no representatives) + parent_ann2 = self.a_view.new_annotation( + AnnotationTypes.TimeFrame, + start=100, end=500, timeUnit='frame', + document=self.video_doc.id) + frame_nums2 = vdh._sample_single( + self.mmif_obj, parent_ann2) + self.assertEqual([300], frame_nums2) class TestSequenceHelper(unittest.TestCase): From 
815b22785e1d7b3809e341f513c052fb971e250b Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 10 Mar 2026 19:22:14 -0400 Subject: [PATCH 48/48] updated build GHA to use documentation hub --- .github/workflows/publish.yml | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 6be5812e..781bec97 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,12 +1,27 @@ -name: "📦 Publish (docs, PyPI)" +name: "📦 Publish (PyPI + docs)" -on: - push: - tags: +on: + push: + tags: - '[0-9]+.[0-9]+.[0-9]+' jobs: - package-and-upload: - name: "🤙 Call SDK publish workflow" + publish-pypi: + name: "📦 Build and upload to PyPI" uses: clamsproject/.github/.github/workflows/sdk-publish.yml@main secrets: inherit + + publish-docs: + name: "📖 Build and publish docs" + needs: publish-pypi + uses: clamsproject/clamsproject.github.io/.github/workflows/sdk-docs.yml@main + with: + source_repo: clamsproject/mmif-python + source_ref: ${{ github.ref_name }} + project_name: mmif-python + version: ${{ github.ref_name }} + build_command: 'python3 build-tools/docs.py --build-ver ${{ github.ref_name }} --output-dir docs' + docs_output_dir: 'docs/${{ github.ref_name }}' + python_version: '3.11' + update_latest: true + secrets: inherit