Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/modules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ Evaluation
simulstream.metrics.scorers.quality.mwersegmenter
simulstream.metrics.scorers.latency
simulstream.metrics.scorers.latency.mwersegmenter
simulstream.metrics.scorers.latency.softsegmenter
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
simulstream.metrics.scorers.latency.softsegmenter
simulstream.metrics.scorers.latency.softsegmenter
simulstream.metrics.scorers.latency.segmenter_based_scorer

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ vad = [
eval = [
"unbabel-comet==2.2.6",
"mweralign",
"sacrebleu"
"sacrebleu",
"mosestokenizer",
]

[tool.setuptools.dynamic]
Expand Down
16 changes: 16 additions & 0 deletions simulstream/metrics/scorers/latency/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class LatencyScorer:
Args:
args (argparse.Namespace): Parsed command-line arguments.
"""

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please avoid unrelated changes

def __init__(self, args: argparse.Namespace):
self.args = args

Expand All @@ -111,5 +112,20 @@ def requires_reference(self) -> bool:
...


@dataclass
class ResegmentedLatencyScoringSample:
    """
    A sample containing realigned hypotheses and references.

    Attributes:
        audio_name (str): The identifier of the audio file.
        hypothesis (List[OutputWithDelays]): Hypothesis outputs (text with their
            emission delays) after realignment, one entry per reference segment.
        reference (List[ReferenceSentenceDefinition]): Reference sentence definitions
            aligned one-to-one with the entries of ``hypothesis``.
    """
    audio_name: str
    hypothesis: List[OutputWithDelays]
    reference: List[ReferenceSentenceDefinition]


for loader, name, is_pkg in pkgutil.walk_packages(__path__, __name__ + "."):
importlib.import_module(name)
165 changes: 165 additions & 0 deletions simulstream/metrics/scorers/latency/long_yaal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Copyright 2026 FBK

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import logging
import statistics
from typing import List, Optional

from simulstream.metrics.readers import text_items
from simulstream.metrics.scorers.latency import register_latency_scorer, LatencyScores
from simulstream.metrics.scorers.latency.softsegmenter import (
SoftSegmenterBasedLatencyScorer,
ResegmentedLatencyScoringSample
)


LOGGER = logging.getLogger('simulstream.metrics.scorers.latency.long_yaal')


@register_latency_scorer("long_yaal")
class LongYAAL(SoftSegmenterBasedLatencyScorer):
    """
    Computes Long-form Yet Another Average Lagging (LongYAAL) as proposed in
    `Better Late Than Never: Evaluation of Latency Metrics for Simultaneous Speech-to-Text
    Translation <https://arxiv.org/abs/2509.17349>`_.

    This metric uses SoftSegmenter alignment to realign system outputs to reference segments
    before computing latency, making it more robust for long-form speech translation evaluation.

    The key difference from StreamLAAL is the use of SoftSegmenter's more sophisticated
    alignment algorithm that handles long-form audio better. Additionally, LongYAAL considers
    all output tokens up until the end of the recording. StreamLAAL ignores any output tokens
    emitted after the end of the reference segments.
    """

    @staticmethod
    def _sentence_level_yaal(
            delays: List[float],
            source_length: float,
            target_length: int,
            relative_recording_duration: float = float("inf")) -> Optional[float]:
        """
        Compute Yet Another Average Lagging (YAAL) on one sentence.

        Args:
            delays (List[float]): Sequence of delays for each output token.
            source_length (float): Length of the source audio segment in milliseconds.
            target_length (int): Length of the target reference in tokens/characters.
            relative_recording_duration (float): Duration of the recording relative to the start
                of the current sentence. Delays emitted at or after this instant are discarded.

        Returns:
            Optional[float]: The YAAL score for the sentence, or None if no delay falls
                within the recording (i.e. the sentence cannot be scored).
        """
        assert source_length > 0, "Source length must be greater than zero"

        # Normalization factor: the emission rate assuming the longer of the
        # hypothesis and the reference is spread uniformly over the source.
        gamma = max(len(delays), target_length) / source_length

        # Only delays emitted strictly before the end of the recording contribute.
        valid_delays = [d for d in delays if d < relative_recording_duration]
        if len(valid_delays) == 0:
            return None

        yaal = 0.0
        for prev_time_step, current_delay in enumerate(valid_delays):
            yaal += current_delay - prev_time_step / gamma

        return yaal / len(valid_delays)

    def _do_score(self, samples: List[ResegmentedLatencyScoringSample]) -> LatencyScores:
        """
        Compute corpus-level LongYAAL over realigned samples.

        Args:
            samples (List[ResegmentedLatencyScoringSample]): Samples whose hypotheses have
                been realigned to the reference segments by the SoftSegmenter.

        Returns:
            LatencyScores: Mean ideal and computational-aware YAAL over all scorable
                sentences; NaN for a component with no scorable sentence.
        """
        sentence_level_ideal_scores = []
        sentence_level_ca_scores = []
        skipped_sentences = 0

        for sample in samples:
            # Compute the total recording length (end time of the last reference segment)
            if sample.reference:
                recording_duration = max(
                    ref.start_time + ref.duration for ref in sample.reference
                )
            else:
                LOGGER.warning(
                    f"Sample {sample.audio_name} has no reference segments; treating recording "
                    "length as infinite"
                )
                recording_duration = float("inf")

            for sentence_output, sentence_reference in zip(
                sample.hypothesis, sample.reference
            ):
                # Note: delays in sentence_output are already offset relative to
                # sentence_reference.start_time by the SoftSegmenter alignment
                # (unlike MWERSegmenter which doesn't offset)
                ideal_delays = sentence_output.ideal_delays
                ca_delays = sentence_output.computational_aware_delays

                assert len(ideal_delays) == len(ca_delays), \
                    f"Mismatch in delay counts: {len(ideal_delays)} vs {len(ca_delays)}"

                target_length = len(
                    text_items(sentence_reference.content, self.latency_unit)
                )

                # NOTE(review): sentences with no output tokens are skipped here without
                # incrementing skipped_sentences — confirm this accounting is intended.
                if len(ideal_delays) > 0:
                    # Compute recording end time relative to sentence start.
                    # This considers the entire recording, not just this segment.
                    # This allows LongYAAL to account for outputs emitted after the reference
                    # segment ends but before the recording ends (key difference from StreamLAAL)
                    relative_recording_duration = \
                        recording_duration - sentence_reference.start_time

                    ideal_score = self._sentence_level_yaal(
                        ideal_delays,
                        sentence_reference.duration,
                        target_length,
                        relative_recording_duration=relative_recording_duration,
                    )

                    ca_score = self._sentence_level_yaal(
                        ca_delays,
                        sentence_reference.duration,
                        target_length,
                        relative_recording_duration=relative_recording_duration,
                    )

                    if ideal_score is not None:
                        sentence_level_ideal_scores.append(ideal_score)
                    else:
                        skipped_sentences += 1
                    if ca_score is not None:
                        sentence_level_ca_scores.append(ca_score)
                    else:
                        skipped_sentences += 1

        if skipped_sentences > 0:
            LOGGER.warning(
                f"{skipped_sentences} sentences have been skipped in LongYAAL computation "
                "as they were empty or could not be scored."
            )

        # Average each component independently: a valid computational-aware mean is
        # reported even when no ideal score could be computed (and vice versa).
        ideal_score = float("nan")
        ca_score = float("nan")
        if len(sentence_level_ideal_scores) == 0:
            LOGGER.error("No sentences could be scored for LongYAAL")
        else:
            ideal_score = statistics.mean(sentence_level_ideal_scores)
        if len(sentence_level_ca_scores) > 0:
            ca_score = statistics.mean(sentence_level_ca_scores)
        return LatencyScores(ideal_score, ca_score)

24 changes: 5 additions & 19 deletions simulstream/metrics/scorers/latency/mwersegmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,17 @@
# limitations under the License

from abc import abstractmethod
from dataclasses import dataclass
from typing import List

from mweralign import mweralign

from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
from simulstream.metrics.readers import OutputWithDelays, text_items
from simulstream.metrics.scorers.latency import LatencyScoringSample, LatencyScores
from simulstream.metrics.scorers.latency import ResegmentedLatencyScoringSample
from simulstream.metrics.scorers.latency.segmenter_based_scorer import SegmenterBasedScorer


@dataclass
class ResegmentedLatencyScoringSample:
"""
A sample containing realigned hypotheses and references.

Attributes:
audio_name (str): The identifier of the audio file.
hypothesis (List[str]): Hypothesis lines after realignment.
reference (List[str]): Reference lines aligned to the hypothesis.
"""
audio_name: str
hypothesis: List[OutputWithDelays]
reference: List[ReferenceSentenceDefinition]


class MWERSegmenterBasedLatencyScorer(LatencyScorer):
class MWERSegmenterBasedLatencyScorer(SegmenterBasedScorer):
"""
Abstract base class for scorers that require aligned system outputs and references through
MWER Segmenter alignment.
Expand Down
35 changes: 35 additions & 0 deletions simulstream/metrics/scorers/latency/segmenter_based_scorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

from typing import List

from simulstream.metrics.readers import text_items
from simulstream.metrics.scorers.latency import LatencyScorer


class SegmenterBasedScorer(LatencyScorer):
    """
    Base class for latency scorers that realign system outputs to reference
    segments through an external segmenter before scoring.
    """

    def __init__(self, args):
        super().__init__(args)
        # Unit (token or character) used when counting items to distribute delays.
        self.latency_unit = args.latency_unit

    def _split_delays_by_segmented_text(
        self, delays: List[float], segmented_text: List[str]) -> List[List[float]]:
        """
        Assign delay values to the corresponding segmented hypotheses.

        Args:
            delays (List[float]): Delay values (per token or per char).
            segmented_text (List[str]): Segmented hypothesis strings.

        Returns:
            List[List[float]]: Delays split per segment.
        """
        # Count the items of each segment first, then carve consecutive
        # slices of the flat delay list accordingly.
        segment_lengths = [
            len(text_items(segment, self.latency_unit)) for segment in segmented_text
        ]
        segmented_delays = []
        index = 0
        for segment_len in segment_lengths:
            segmented_delays.append(delays[index:index + segment_len])
            index += segment_len
        assert len(delays) == index, \
            f"Index {index} should have reached end of delays ({len(delays)})"
        return segmented_delays
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return segmented_delays
return segmented_delays
def _resegment_samples(self, samples: List[LatencyScoringSample]) -> List[ResegmentedLatencyScoringSample]:
...
def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
resegmented_samples = self._resegment_samples(samples)
return self._do_score(resegmented_samples)

and we can add a comment to the main class that subclasses should implement _resegment_samples, like it is done for _do_score. In this way we can isolate in the subclasses the resegmentation part. Thanks.

Loading