Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/modules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ Evaluation
simulstream.metrics.scorers.quality.mwersegmenter
simulstream.metrics.scorers.latency
simulstream.metrics.scorers.latency.mwersegmenter
simulstream.metrics.scorers.latency.softsegmenter
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
simulstream.metrics.scorers.latency.softsegmenter
simulstream.metrics.scorers.latency.softsegmenter
simulstream.metrics.scorers.latency.segmenter_based_scorer

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ vad = [
eval = [
"unbabel-comet==2.2.6",
"mweralign",
"sacrebleu"
"sacrebleu",
"mosestokenizer",
]

[tool.setuptools.dynamic]
Expand Down
16 changes: 16 additions & 0 deletions simulstream/metrics/scorers/latency/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class LatencyScorer:
Args:
args (argparse.Namespace): Parsed command-line arguments.
"""

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please avoid unrelated changes

def __init__(self, args: argparse.Namespace):
self.args = args

Expand All @@ -111,5 +112,20 @@ def requires_reference(self) -> bool:
...


@dataclass
class ResegmentedLatencyScoringSample:
    """
    A sample containing realigned hypotheses and references.

    Attributes:
        audio_name (str): The identifier of the audio file.
        hypothesis (List[OutputWithDelays]): Hypothesis outputs (text with their
            emission delays) after realignment, one entry per reference segment.
        reference (List[ReferenceSentenceDefinition]): Reference sentence definitions
            aligned one-to-one with the entries of ``hypothesis``.
    """
    audio_name: str
    hypothesis: List[OutputWithDelays]
    reference: List[ReferenceSentenceDefinition]


for loader, name, is_pkg in pkgutil.walk_packages(__path__, __name__ + "."):
importlib.import_module(name)
165 changes: 165 additions & 0 deletions simulstream/metrics/scorers/latency/long_yaal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Copyright 2026 FBK

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import logging
import statistics
from typing import List, Optional

from simulstream.metrics.readers import text_items
from simulstream.metrics.scorers.latency import register_latency_scorer, LatencyScores
from simulstream.metrics.scorers.latency.softsegmenter import (
SoftSegmenterBasedLatencyScorer,
ResegmentedLatencyScoringSample
)


LOGGER = logging.getLogger('simulstream.metrics.scorers.latency.long_yaal')


@register_latency_scorer("long_yaal")
class LongYAAL(SoftSegmenterBasedLatencyScorer):
    """
    Computes Long-form Yet Another Average Lagging (LongYAAL) as proposed in
    `Better Late Than Never: Evaluation of Latency Metrics for Simultaneous Speech-to-Text
    Translation <https://arxiv.org/abs/2509.17349>`_.

    This metric uses SoftSegmenter alignment to realign system outputs to reference segments
    before computing latency, making it more robust for long-form speech translation evaluation.

    The key difference from StreamLAAL is the use of SoftSegmenter's more sophisticated
    alignment algorithm that handles long-form audio better. Additionally, LongYAAL considers
    all output tokens up until the end of the recording. StreamLAAL ignores any output tokens
    emitted after the end of the reference segments.
    """

    @staticmethod
    def _sentence_level_yaal(
            delays: List[float],
            source_length: float,
            target_length: int,
            relative_recording_duration: float = float("inf")) -> Optional[float]:
        """
        Compute Yet Another Average Lagging (YAAL) on one sentence.

        Args:
            delays (List[float]): Sequence of delays for each output token.
            source_length (float): Length of the source audio segment in milliseconds.
            target_length (int): Length of the target reference in tokens/characters.
            relative_recording_duration (float): Duration of the recording relative to the start
                of the current sentence. Delays emitted at or after this instant are discarded.

        Returns:
            Optional[float]: The YAAL score for the sentence, or None if no delay falls
                within the recording (i.e. the sentence cannot be scored).
        """
        assert source_length > 0, "Source length must be greater than zero"

        # Normalization factor: the emission rate assuming the longer of the
        # hypothesis and the reference is spread uniformly over the source.
        gamma = max(len(delays), target_length) / source_length

        # Only delays emitted strictly before the end of the recording contribute.
        valid_delays = [d for d in delays if d < relative_recording_duration]
        if len(valid_delays) == 0:
            return None

        yaal = 0.0
        for prev_time_step, current_delay in enumerate(valid_delays):
            yaal += current_delay - prev_time_step / gamma

        return yaal / len(valid_delays)

    def _do_score(self, samples: List[ResegmentedLatencyScoringSample]) -> LatencyScores:
        """
        Compute corpus-level LongYAAL over realigned samples.

        Args:
            samples (List[ResegmentedLatencyScoringSample]): Samples whose hypotheses have
                been realigned to the reference segments by the SoftSegmenter.

        Returns:
            LatencyScores: Mean ideal and computational-aware YAAL over all scorable
                sentences; NaN for a component with no scorable sentence.
        """
        sentence_level_ideal_scores = []
        sentence_level_ca_scores = []
        skipped_sentences = 0

        for sample in samples:
            # Compute the total recording length (end time of the last reference segment)
            if sample.reference:
                recording_duration = max(
                    ref.start_time + ref.duration for ref in sample.reference
                )
            else:
                LOGGER.warning(
                    f"Sample {sample.audio_name} has no reference segments; treating recording "
                    "length as infinite"
                )
                recording_duration = float("inf")

            for sentence_output, sentence_reference in zip(
                sample.hypothesis, sample.reference
            ):
                # Note: delays in sentence_output are already offset relative to
                # sentence_reference.start_time by the SoftSegmenter alignment
                # (unlike MWERSegmenter which doesn't offset)
                ideal_delays = sentence_output.ideal_delays
                ca_delays = sentence_output.computational_aware_delays

                assert len(ideal_delays) == len(ca_delays), \
                    f"Mismatch in delay counts: {len(ideal_delays)} vs {len(ca_delays)}"

                target_length = len(
                    text_items(sentence_reference.content, self.latency_unit)
                )

                # NOTE(review): sentences with no output tokens are skipped here without
                # incrementing skipped_sentences — confirm this accounting is intended.
                if len(ideal_delays) > 0:
                    # Compute recording end time relative to sentence start.
                    # This considers the entire recording, not just this segment.
                    # This allows LongYAAL to account for outputs emitted after the reference
                    # segment ends but before the recording ends (key difference from StreamLAAL)
                    relative_recording_duration = \
                        recording_duration - sentence_reference.start_time

                    ideal_score = self._sentence_level_yaal(
                        ideal_delays,
                        sentence_reference.duration,
                        target_length,
                        relative_recording_duration=relative_recording_duration,
                    )

                    ca_score = self._sentence_level_yaal(
                        ca_delays,
                        sentence_reference.duration,
                        target_length,
                        relative_recording_duration=relative_recording_duration,
                    )

                    if ideal_score is not None:
                        sentence_level_ideal_scores.append(ideal_score)
                    else:
                        skipped_sentences += 1
                    if ca_score is not None:
                        sentence_level_ca_scores.append(ca_score)
                    else:
                        skipped_sentences += 1

        if skipped_sentences > 0:
            LOGGER.warning(
                f"{skipped_sentences} sentences have been skipped in LongYAAL computation "
                "as they were empty or could not be scored."
            )

        # Average each component independently: a valid computational-aware mean is
        # reported even when no ideal score could be computed (and vice versa).
        ideal_score = float("nan")
        ca_score = float("nan")
        if len(sentence_level_ideal_scores) == 0:
            LOGGER.error("No sentences could be scored for LongYAAL")
        else:
            ideal_score = statistics.mean(sentence_level_ideal_scores)
        if len(sentence_level_ca_scores) > 0:
            ca_score = statistics.mean(sentence_level_ca_scores)
        return LatencyScores(ideal_score, ca_score)

24 changes: 5 additions & 19 deletions simulstream/metrics/scorers/latency/mwersegmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,17 @@
# limitations under the License

from abc import abstractmethod
from dataclasses import dataclass
from typing import List

from mweralign import mweralign

from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
from simulstream.metrics.readers import OutputWithDelays, text_items
from simulstream.metrics.scorers.latency import LatencyScoringSample, LatencyScores
from simulstream.metrics.scorers.latency import ResegmentedLatencyScoringSample
from simulstream.metrics.scorers.latency.segmenter_based_scorer import SegmenterBasedScorer


@dataclass
class ResegmentedLatencyScoringSample:
"""
A sample containing realigned hypotheses and references.

Attributes:
audio_name (str): The identifier of the audio file.
hypothesis (List[str]): Hypothesis lines after realignment.
reference (List[str]): Reference lines aligned to the hypothesis.
"""
audio_name: str
hypothesis: List[OutputWithDelays]
reference: List[ReferenceSentenceDefinition]


class MWERSegmenterBasedLatencyScorer(LatencyScorer):
class MWERSegmenterBasedLatencyScorer(SegmenterBasedScorer):
"""
Abstract base class for scorers that require aligned system outputs and references through
MWER Segmenter alignment.
Expand Down
35 changes: 35 additions & 0 deletions simulstream/metrics/scorers/latency/segmenter_based_scorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

from typing import List

from simulstream.metrics.readers import text_items
from simulstream.metrics.scorers.latency import LatencyScorer


class SegmenterBasedScorer(LatencyScorer):
    """
    Base class for latency scorers that realign system outputs to reference
    segments through an external segmenter before scoring.
    """

    def __init__(self, args):
        super().__init__(args)
        # Unit (token or character) used when counting items to distribute delays.
        self.latency_unit = args.latency_unit

    def _split_delays_by_segmented_text(
        self, delays: List[float], segmented_text: List[str]) -> List[List[float]]:
        """
        Assign delay values to the corresponding segmented hypotheses.

        Args:
            delays (List[float]): Delay values (per token or per char).
            segmented_text (List[str]): Segmented hypothesis strings.

        Returns:
            List[List[float]]: Delays split per segment.
        """
        # Count the items of each segment first, then carve consecutive
        # slices of the flat delay list accordingly.
        segment_lengths = [
            len(text_items(segment, self.latency_unit)) for segment in segmented_text
        ]
        segmented_delays = []
        index = 0
        for segment_len in segment_lengths:
            segmented_delays.append(delays[index:index + segment_len])
            index += segment_len
        assert len(delays) == index, \
            f"Index {index} should have reached end of delays ({len(delays)})"
        return segmented_delays
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return segmented_delays
return segmented_delays
def _resegment_samples(self, samples: List[LatencyScoringSample]) -> List[ResegmentedLatencyScoringSample]:
...
def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
resegmented_samples = self._resegment_samples(samples)
return self._do_score(resegmented_samples)

and we can add a comment to the main class that subclasses should implement _resegment_samples, like it is done for _do_score. In this way we can isolate in the subclasses the resegmentation part. Thanks.

Loading