From 3b438fd82cd22ee7521c0834acca05d116acb9f2 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 14 Oct 2024 13:03:04 +0300
Subject: [PATCH 01/97] Add templates for new methods

---
 .../polygraph_eval_triviaqa_sentsar.yaml      | 86 +++++++++++++++++++
 .../polygraph_eval_wmt14_fren_sentsar.yaml    | 85 ++++++++++++++++++
 .../polygraph_eval_wmt19_deen_sentsar.yaml    | 85 ++++++++++++++++++
 src/lm_polygraph/estimators/__init__.py       |  7 +-
 src/lm_polygraph/estimators/sentence_sar.py   | 67 +++++++++++++++
 5 files changed, 329 insertions(+), 1 deletion(-)
 create mode 100644 examples/configs/polygraph_eval_triviaqa_sentsar.yaml
 create mode 100644 examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
 create mode 100644 examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml

diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
new file mode 100644
index 000000000..eaf197a0a
--- /dev/null
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -0,0 +1,86 @@
+hydra:
+  run:
+    dir: ${cache_path}/${task}/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+task: qa
+
+dataset: [trivia_qa, rc.nocontext]
+text_column: question
+label_column: answer
+prompt: "Question: {question}\nAnswer:{answer}"
+few_shot_split: train
+train_split: train
+eval_split: validation
+max_new_tokens: 20
+load_from_disk: false
+n_shot: 5
+multiref: true
+normalize: true
+generation_params:
+  generate_until:
+    - "\n"
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_seq_ue: false
+use_tok_ue: false
+use_ens_ue: false
+generation_metrics: null
+ens_type: 
+
+additional_estimators:
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: ReweightedSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
new file mode 100644
index 000000000..4dcbf11a5
--- /dev/null
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -0,0 +1,85 @@
+hydra:
+  run:
+    dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+device: cpu
+
+task: nmt
+
+dataset: [wmt14, fr-en]
+text_column: fr
+label_column: en
+prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n"
+train_split: train
+eval_split: test
+max_new_tokens: 107
+load_from_disk: false
+generation_params:
+  generate_until:
+    - "\n"
+
+source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_ens_ue: false
+use_seq_ue: false
+use_tok_ue: false
+generation_metrics: null
+
+additional_estimators:
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs:
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs:
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+  - module: lm_polygraph.estimatorts.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: ReweightedSentenceSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs:
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
new file mode 100644
index 000000000..8cb7432ce
--- /dev/null
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -0,0 +1,85 @@
+hydra:
+  run:
+    dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+device: cpu
+
+task: nmt
+
+dataset: [wmt19, de-en]
+text_column: de
+label_column: en
+prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n"
+train_split: train
+eval_split: validation
+max_new_tokens: 107
+load_from_disk: false
+generation_params:
+  generate_until:
+    - "\n"
+
+source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_ens_ue: false
+use_seq_ue: false
+use_tok_ue: false
+
+additional_estimators:
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs:
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs:
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+  - module: lm_polygraph.estimatorts.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: ReweightedSentenceSAR
+    kwargs:
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs:
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
+    
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index c9c16afa2..5287644f5 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -62,7 +62,12 @@
     PESrmiabs,
 )
 from .token_sar import TokenSAR
-from .sentence_sar import SentenceSAR
+from .sentence_sar import (
+    SentenceSAR,
+    OtherSentenceSAR,
+    ReweightedSentenceSAR,
+    PPLSentenceSAR
+)
 from .sar import SAR
 from .renyi_neg import RenyiNeg
 from .fisher_rao import FisherRao
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 44f762afe..f911496be 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -52,3 +52,70 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
+
+
+class OtherSentenceSAR(Estimator):
+    """
+    Like SAR, but only looks at other samples for each sample in the output.
+    """
+
+    def __init__(self, verbose: bool = False):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.t = 0.001
+
+    def __str__(self):
+        return "OtherSentenceSAR"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log probabilities in 'sample_log_probs',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = np.exp(np.array(sample_log_probs))
+            R_s = (
+                sample_probs
+                * sample_sentence_similarity
+                * (1 - np.eye(sample_sentence_similarity.shape[0]))
+            )
+            sent_relevance = R_s.sum(-1) / self.t
+            E_s = -np.log(sent_relevance)
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
+
+
+class ReweightedSentenceSAR(Estimator):
+    """
+    Like SAR, but normalizes similarity-based scores at each iteration
+    alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k))
+    K - number of samples in output minus one
+    """
+
+    def __str__(self):
+        return "ReweightedSentenceSAR"
+
+
+class PPLSentenceSAR(Estimator):
+    """
+    Like SAR, but uses log probs normalized on sample length in tokens 
+    Look at perplexity.py for an example
+    Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'] i think
+    """
+
+    def __str__(self):
+        return "PPLSentenceSAR"

From e8c11cd955f29b9434d4843251a9982760343a6c Mon Sep 17 00:00:00 2001
From: SDUgitrep <mayagoloburda@gmail.com>
Date: Mon, 14 Oct 2024 17:33:42 +0400
Subject: [PATCH 02/97] Implement ReweightedSentenceSAR and PPLSentenceSAR

---
 src/lm_polygraph/estimators/sentence_sar.py | 83 ++++++++++++++++++++-
 1 file changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index f911496be..c4516ab72 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -105,17 +105,94 @@ class ReweightedSentenceSAR(Estimator):
     alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k))
     K - number of samples in output minus one
     """
+    def __init__(self, verbose: bool = False):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.t = 0.001
 
     def __str__(self):
         return "ReweightedSentenceSAR"
+    
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            # Compute probabilities from log probabilities
+            sample_probs = np.exp(np.array(sample_log_probs))
+            
+            # Initialize alpha_ij (reweighted sentence similarities)
+            alpha_ij = np.zeros_like(sample_sentence_similarity)
+
+            # Normalize similarity-based scores at each iteration 
+            for i in range(sample_sentence_similarity.shape[0]):
+                similarity_row = sample_sentence_similarity[i]
+                # Exclude self-similarity g(s_i, s_i)
+                similarity_row_without_self = similarity_row * (1 - np.eye(len(similarity_row)))[i]
+                sum_similarity = np.sum(similarity_row_without_self)
+                
+                if sum_similarity > 0:
+                    alpha_ij[i] = similarity_row_without_self / sum_similarity
+                else:
+                    alpha_ij[i] = similarity_row_without_self  # If the normalization factor is 0, leave the row unchanged
+
+            # Compute sentence relevance using normalized alpha_ij
+            R_s = sample_probs * alpha_ij
+            sent_relevance = R_s.sum(-1) / self.t
+
+            # Compute SentenceSAR (Uncertainty Estimation)
+            E_s = -np.log(sent_relevance + sample_probs)
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
+
 
 
 class PPLSentenceSAR(Estimator):
     """
-    Like SAR, but uses log probs normalized on sample length in tokens 
-    Look at perplexity.py for an example
-    Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'] i think
+    Like SAR, but uses log probs normalized by sample length in tokens to calculate PPL (Perplexity).
+    Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'].
     """
+    def __init__(self, verbose: bool = False):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.t = 0.001
 
     def __str__(self):
         return "PPLSentenceSAR"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the PPL-based sentence-level uncertainty using token-wise log-likelihoods.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): Input statistics, including:
+                * 'sample_log_likelihoods': token-wise log-likelihoods for each sample.
+        
+        Returns:
+            np.ndarray: float PPL values for each sample.
+                Lower values indicate less uncertainty (better predictions), higher values indicate more uncertainty.
+        """
+        # Extract token-wise log-likelihoods from the stats
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+
+        perplexities = []
+
+        # Loop over each sample's token-wise log-likelihoods
+        for sample_log_likelihoods in batch_sample_log_likelihoods:
+            # Calculate the number of tokens (length of the sample in tokens)
+            num_tokens = len(sample_log_likelihoods)
+
+            # Calculate average log-likelihood for the sample
+            avg_log_likelihood = np.mean(sample_log_likelihoods)
+
+            # Perplexity is exp(-avg_log_likelihood)
+            ppl = np.exp(-avg_log_likelihood)
+
+            perplexities.append(ppl)
+
+        return np.array(perplexities)

From aa1f199f31ce97a67e3db5269efe5856e92f784c Mon Sep 17 00:00:00 2001
From: SDUgitrep <mayagoloburda@gmail.com>
Date: Mon, 14 Oct 2024 17:47:21 +0400
Subject: [PATCH 03/97] Weight sentence similarities by PPL in PPLSentenceSAR

---
 src/lm_polygraph/estimators/sentence_sar.py | 25 ++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index c4516ab72..4bbbc67e9 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -179,11 +179,14 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
         # Extract token-wise log-likelihoods from the stats
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
 
-        perplexities = []
+        sentenceSAR = []
 
-        # Loop over each sample's token-wise log-likelihoods
-        for sample_log_likelihoods in batch_sample_log_likelihoods:
+        # Loop over each sample's log-likelihoods and sentence similarities
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        ):
             # Calculate the number of tokens (length of the sample in tokens)
             num_tokens = len(sample_log_likelihoods)
 
@@ -193,6 +196,18 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             # Perplexity is exp(-avg_log_likelihood)
             ppl = np.exp(-avg_log_likelihood)
 
-            perplexities.append(ppl)
+            # Initialize the sentence relevance (R_s) using PPL
+            R_s = (
+                ppl  # Use PPL instead of probabilities
+                * sample_sentence_similarity
+                * (1 - np.eye(sample_sentence_similarity.shape[0]))  # Remove self-similarity
+            )
 
-        return np.array(perplexities)
+            # Compute sentence relevance
+            sent_relevance = R_s.sum(-1) / self.t
+
+            # Compute SentenceSAR (Uncertainty Estimation) using PPL
+            E_s = -np.log(sent_relevance + ppl)
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)

From 3ba0f9132b905cfe5a164cc3946eea036516ba53 Mon Sep 17 00:00:00 2001
From: SDUgitrep <mayagoloburda@gmail.com>
Date: Mon, 14 Oct 2024 18:50:53 +0400
Subject: [PATCH 04/97] Average token log-likelihoods per sample in PPLSentenceSAR

---
 src/lm_polygraph/estimators/sentence_sar.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 4bbbc67e9..90155ae11 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -188,11 +188,14 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
             # Calculate the number of tokens (length of the sample in tokens)
-            num_tokens = len(sample_log_likelihoods)
 
-            # Calculate average log-likelihood for the sample
-            avg_log_likelihood = np.mean(sample_log_likelihoods)
+            token_log_likelihoods = [np.mean(token_ll) for token_ll in sample_log_likelihoods]
+            
+            # Calculate the number of tokens (length of the sample in tokens)
+            num_tokens = len(token_log_likelihoods)
 
+            # Calculate the mean log-likelihood across tokens
+            avg_log_likelihood = np.sum(token_log_likelihoods) / num_tokens
             # Perplexity is exp(-avg_log_likelihood)
             ppl = np.exp(-avg_log_likelihood)
 
@@ -205,9 +208,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             # Compute sentence relevance
             sent_relevance = R_s.sum(-1) / self.t
-
+            sample_probs = np.exp(np.array(sample_log_likelihoods))
             # Compute SentenceSAR (Uncertainty Estimation) using PPL
-            E_s = -np.log(sent_relevance + ppl)
+            E_s = -np.log(sent_relevance + sample_probs)
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)

From 94b8b183cd5bbf961751adddc37561828e257130 Mon Sep 17 00:00:00 2001
From: SDUgitrep <mayagoloburda@gmail.com>
Date: Mon, 14 Oct 2024 18:51:21 +0400
Subject: [PATCH 05/97] Use ppl rather than sample_probs in PPLSentenceSAR log term

---
 src/lm_polygraph/estimators/sentence_sar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 90155ae11..7a1a2252d 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -210,7 +210,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             sent_relevance = R_s.sum(-1) / self.t
             sample_probs = np.exp(np.array(sample_log_likelihoods))
             # Compute SentenceSAR (Uncertainty Estimation) using PPL
-            E_s = -np.log(sent_relevance + sample_probs)
+            E_s = -np.log(sent_relevance + ppl)
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)

From 0a437e68d735ee76e3173f182bf87e18765ec31e Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 15 Oct 2024 00:10:50 +0300
Subject: [PATCH 06/97] Fix ppl

---
 src/lm_polygraph/estimators/sentence_sar.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 7a1a2252d..97d646373 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -189,15 +189,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         ):
             # Calculate the number of tokens (length of the sample in tokens)
 
-            token_log_likelihoods = [np.mean(token_ll) for token_ll in sample_log_likelihoods]
-            
-            # Calculate the number of tokens (length of the sample in tokens)
-            num_tokens = len(token_log_likelihoods)
-
-            # Calculate the mean log-likelihood across tokens
-            avg_log_likelihood = np.sum(token_log_likelihoods) / num_tokens
-            # Perplexity is exp(-avg_log_likelihood)
-            ppl = np.exp(-avg_log_likelihood)
+            token_log_likelihoods = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
             # Initialize the sentence relevance (R_s) using PPL
             R_s = (

From a6a49ecc565ca97d2648688cd9638eb8136f578f Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 15 Oct 2024 17:41:19 +0300
Subject: [PATCH 07/97] Fix yamls

---
 .../polygraph_eval_wmt14_fren_sentsar.yaml     | 18 +++++++++---------
 .../polygraph_eval_wmt19_deen_sentsar.yaml     | 18 +++++++++---------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 4dcbf11a5..715c808e9 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -50,31 +50,31 @@ generation_metrics: null
 additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimatorts.semantic_entropy
     class_name: SemanticEntropy
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: ReweightedSentenceSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSentenceSAR
-    kwargs:
+    kwargs: {}
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 8cb7432ce..f5371f7ee 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -49,31 +49,31 @@ use_tok_ue: false
 additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimatorts.semantic_entropy
     class_name: SemanticEntropy
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: ReweightedSentenceSAR
-    kwargs:
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSentenceSAR
-    kwargs:
+    kwargs: {}
 
 ignore_exceptions: false
 

From a5d5dc72f4ef739112e312efe1703ae677b04af5 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 16 Oct 2024 14:20:46 +0300
Subject: [PATCH 08/97] Fix estimators module typo in configs, rename ppl variable

---
 examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 +-
 examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 +-
 src/lm_polygraph/estimators/sentence_sar.py             | 4 +---
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 715c808e9..a67f961fa 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -60,7 +60,7 @@ additional_estimators:
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
-  - module: lm_polygraph.estimatorts.semantic_entropy
+  - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index f5371f7ee..77c0b2f62 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -59,7 +59,7 @@ additional_estimators:
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
-  - module: lm_polygraph.estimatorts.semantic_entropy
+  - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 97d646373..ff2f00dc1 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -187,9 +187,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_likelihoods, sample_sentence_similarity in zip(
             batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
-            # Calculate the number of tokens (length of the sample in tokens)
-
-            token_log_likelihoods = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+            ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
             # Initialize the sentence relevance (R_s) using PPL
             R_s = (

From d8a7278b08ad543ce20a12fc635f1680a8ade880 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 16 Oct 2024 14:57:33 +0300
Subject: [PATCH 09/97] Remove redundant line

---
 src/lm_polygraph/estimators/sentence_sar.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index ff2f00dc1..790a8f34e 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -198,7 +198,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             # Compute sentence relevance
             sent_relevance = R_s.sum(-1) / self.t
-            sample_probs = np.exp(np.array(sample_log_likelihoods))
             # Compute SentenceSAR (Uncertainty Estimation) using PPL
             E_s = -np.log(sent_relevance + ppl)
             sentenceSAR.append(E_s.mean())

From bb25a126896c267ab4fc1e583990e3d1a69b1b67 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 25 Oct 2024 12:43:27 +0400
Subject: [PATCH 10/97] Add distilled sars

---
 .../polygraph_eval_triviaqa_sentsar.yaml      |  42 ++++++
 src/lm_polygraph/estimators/__init__.py       |   4 +-
 src/lm_polygraph/estimators/sentence_sar.py   | 142 +++++++++++++++++-
 3 files changed, 183 insertions(+), 5 deletions(-)

diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index eaf197a0a..9686c1aef 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -70,12 +70,54 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs:
+      t: 1.0
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs:
+      t: 1.0
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs:
+      t: 1.0
+      use_log: false
+      reverse: false
   - module: lm_polygraph.estimators.sentence_sar
     class_name: ReweightedSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs:
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs:
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilOneSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilOneSentenceSAR
+    kwargs:
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilOneSentenceSAR
+    kwargs:
+      use_log: false
+      reverse: false
 
 ignore_exceptions: false
 
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 5287644f5..25f57e792 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -66,7 +66,9 @@
     SentenceSAR,
     OtherSentenceSAR,
     ReweightedSentenceSAR,
-    PPLSentenceSAR
+    PPLSentenceSAR,
+    DistilSentenceSAR,
+    DistilOneSentenceSAR,
 )
 from .sar import SAR
 from .renyi_neg import RenyiNeg
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 790a8f34e..6f51f0ba2 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -1,6 +1,7 @@
 import numpy as np
 
 from typing import Dict
+from copy import deepcopy
 
 from .estimator import Estimator
 
@@ -59,13 +60,20 @@ class OtherSentenceSAR(Estimator):
     Like SAR, but only looks at other samples for each sample in the output.
     """
 
-    def __init__(self, verbose: bool = False):
+    def __init__(self, verbose: bool = False, t: float = 0.001, use_log: bool = True, reverse: bool = False):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
-        self.t = 0.001
+        self.t = t
+        self.use_log = use_log
+        self.reverse = reverse
 
     def __str__(self):
-        return "OtherSentenceSAR"
+        base = f"OtherSentenceSAR_{self.t}"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -93,7 +101,15 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 * (1 - np.eye(sample_sentence_similarity.shape[0]))
             )
             sent_relevance = R_s.sum(-1) / self.t
-            E_s = -np.log(sent_relevance)
+
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if self.reverse:
+                    E_s = sent_relevance
+                else:
+                    E_s = -sent_relevance
+
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
@@ -203,3 +219,121 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
+
+
+class DistilSentenceSAR(Estimator):
+    """
+    Sentence-level SAR variant: sums probability-weighted similarities over all samples (self included), with no temperature.
+    """
+
+    def __init__(self, verbose: bool = False, use_log: bool = True, reverse: bool = False):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = f"DistilSentenceSAR"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log probabilities in 'sample_log_probs',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = np.exp(np.array(sample_log_probs))
+            R_s = (
+                sample_probs
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if self.reverse:
+                    E_s = sent_relevance
+                else:
+                    E_s = -sent_relevance
+
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
+
+
+class DistilOneSentenceSAR(Estimator):
+    """
+    Same weighted-similarity sum as DistilSentenceSAR, but the similarity matrix diagonal is set to 1 first (on a copy of the stats).
+    """
+
+    def __init__(self, verbose: bool = False, use_log: bool = True, reverse: bool = False):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = f"DistilOneSentenceSAR"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log probabilities in 'sample_log_probs',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = deepcopy(stats["sample_sentence_similarity"])
+
+        sentenceSAR = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = np.exp(np.array(sample_log_probs))
+            np.fill_diagonal(sample_sentence_similarity, 1)
+            
+            R_s = (
+                sample_probs
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if self.reverse:
+                    E_s = sent_relevance
+                else:
+                    E_s = -sent_relevance
+
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)

From 0c7644924fead55849b8c4f6d2097d170c351cfc Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 25 Oct 2024 14:23:39 +0400
Subject: [PATCH 11/97] WiP

---
 src/lm_polygraph/utils/manager.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 263034002..5d7d1e03d 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -256,6 +256,7 @@ def __init__(
         max_new_tokens: int = 100,
         background_train_dataset_max_new_tokens: int = 100,
         cache_path=os.path.expanduser("~") + "/.cache",
+        save_stats: List[str] = []
     ):
         """
         Parameters:
@@ -400,6 +401,7 @@ def __init__(
         self.metrics: Dict[Tuple[str, str, str, str], float] = {}
         self.total_bad_estimators: Dict[Estimator, float] = {}
         self.stats: Dict[str, List] = defaultdict(list)
+        self.save_stats = list(set(['greedy_texts', 'greedy_tokens']) + set(save_stats))
 
         self.processors = processors
         self.ignore_exceptions = ignore_exceptions
@@ -474,7 +476,7 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
                 self.gen_metrics[generation_metric.level, str(generation_metric)] += m
                 batch_gen_metrics[generation_metric.level, str(generation_metric)] += m
 
-            for key in ["greedy_texts", "greedy_tokens"]:
+            for key in self.save_stats:
                 if key in batch_stats.keys():
                     self.stats[key] += batch_stats[key]
             for processor in self.processors:

From d064d905c384b95304dabb3e5911ddf7ddccca80 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 30 Oct 2024 21:37:38 +0400
Subject: [PATCH 12/97] Add new sar variants

---
 .../polygraph_eval_triviaqa_sentsar.yaml      |  76 ++++-
 src/lm_polygraph/estimators/__init__.py       |   6 +-
 src/lm_polygraph/estimators/sar.py            |   6 +-
 src/lm_polygraph/estimators/sentence_sar.py   | 286 +++++++++++++++++-
 4 files changed, 364 insertions(+), 10 deletions(-)

diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index eaf197a0a..29662b6d1 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -61,21 +61,93 @@ additional_estimators:
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
   - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: ReweightedSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: ReweightedSentenceSAR
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DisilSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: true
 
 ignore_exceptions: false
 
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 5287644f5..539450637 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -66,7 +66,11 @@
     SentenceSAR,
     OtherSentenceSAR,
     ReweightedSentenceSAR,
-    PPLSentenceSAR
+    PPLSentenceSAR,
+    DistilSentenceSAR,
+    DistilSAR,
+    DistilPPLSAR,
+    DistilMTESAR,
 )
 from .sar import SAR
 from .renyi_neg import RenyiNeg
diff --git a/src/lm_polygraph/estimators/sar.py b/src/lm_polygraph/estimators/sar.py
index 2aed3fa76..57e9c2902 100644
--- a/src/lm_polygraph/estimators/sar.py
+++ b/src/lm_polygraph/estimators/sar.py
@@ -15,7 +15,7 @@ class SAR(Estimator):
     and text relevance relative to all other generations.
     """
 
-    def __init__(self, verbose: bool = False):
+    def __init__(self, verbose: bool = False, t: float = 0.001):
         super().__init__(
             [
                 "sample_sentence_similarity",
@@ -25,10 +25,10 @@ def __init__(self, verbose: bool = False):
             "sequence",
         )
         self.verbose = verbose
-        self.t = 0.001
+        self.t = t
 
     def __str__(self):
-        return "SAR"
+        return f"SAR_t{self.t}"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 790a8f34e..082f9be79 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -59,13 +59,26 @@ class OtherSentenceSAR(Estimator):
     Like SAR, but only looks at other samples for each sample in the output.
     """
 
-    def __init__(self, verbose: bool = False):
+    def __init__(
+        self,
+        verbose: bool = False,
+        t: float = 0.001,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
-        self.t = 0.001
+        self.t = t
+        self.use_log = use_log
+        self.reverse = reverse
 
     def __str__(self):
-        return "OtherSentenceSAR"
+        base = f"OtherSentenceSAR_t{self.t}"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -93,7 +106,15 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 * (1 - np.eye(sample_sentence_similarity.shape[0]))
             )
             sent_relevance = R_s.sum(-1) / self.t
-            E_s = -np.log(sent_relevance)
+
+            if use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if reverse:
+                    E_s = -sent_relevance
+                else:
+                    E_s = sent_relevance
+
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
@@ -203,3 +224,260 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
+
+
+class DistilSentenceSAR(Estimator):
+    """
+    Sentence-level SAR variant: sums probability-weighted similarities over all samples (self included), with no temperature.
+    """
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "DistilSentenceSAR"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log probabilities in 'sample_log_probs',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = np.exp(np.array(sample_log_probs))
+            R_s = (
+                sample_probs
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if reverse:
+                    E_s = -sent_relevance
+                else:
+                    E_s = sent_relevance
+
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
+
+
+class DistilSAR(Estimator):
+    """
+    SAR variant: sums per-sample-weighted sentence similarities over all samples (self included), with no temperature.
+    """
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "DistilSentenceSAR"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log probabilities in 'sample_log_probs',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = np.exp(np.array(sample_log_probs))
+            R_s = (
+                sample_probs
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if reverse:
+                    E_s = -sent_relevance
+                else:
+                    E_s = sent_relevance
+
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
+
+
+class DistilPPLSAR(Estimator):
+    """
+    Like DistilSentenceSAR, but each sample is weighted by exp(mean token log-likelihood) instead of exp(sample_log_probs).
+    """
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "DistilPPLSAR"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log-likelihoods in 'sample_log_likelihoods',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_likelihoods, batch_sample_sentence_similarity
+        ):
+            ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+
+            R_s = (
+                ppl
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if reverse:
+                    E_s = -sent_relevance
+                else:
+                    E_s = sent_relevance
+
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
+
+
+class DistilMTESAR(Estimator):
+    """
+    Like DistilSentenceSAR, but each sample is weighted by an entropy term computed from 'sample_log_likelihoods'.
+    """
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "DistilMTESAR"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log-likelihoods in 'sample_log_likelihoods',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_likelihoods, batch_sample_sentence_similarity
+        ):
+            entropy = []
+            for lp in sample_log_likelihoods:
+                mask = ~np.isinf(lp)
+                entropy.append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask])))
+
+            R_s = (
+                entropy
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                if reverse:
+                    E_s = -sent_relevance
+                else:
+                    E_s = sent_relevance
+
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)

From ff39d6ede32bca7ac05932aec13bca620e36fa55 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 31 Oct 2024 11:34:54 +0400
Subject: [PATCH 13/97] Fix various errors

---
 .../polygraph_eval_triviaqa_sentsar.yaml      | 34 +++++----
 scripts/polygraph_eval                        |  2 +-
 src/lm_polygraph/estimators/sentence_sar.py   | 70 ++++++++++++-------
 3 files changed, 67 insertions(+), 39 deletions(-)

diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 29662b6d1..41de5578c 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -58,6 +58,12 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -97,7 +103,7 @@ additional_estimators:
       use_log: false
       reverse: true
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: DisilSentenceSAR
+    class_name: DistilSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSentenceSAR
@@ -135,19 +141,19 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
-    kwargs: 
-      use_log: false
-      reverse: true
+        #  - module: lm_polygraph.estimators.sentence_sar
+        #    class_name: DistilMTESAR
+        #    kwargs: {}
+        #  - module: lm_polygraph.estimators.sentence_sar
+        #    class_name: DistilMTESAR
+        #    kwargs: 
+        #      use_log: false
+        #      reverse: false
+        #  - module: lm_polygraph.estimators.sentence_sar
+        #    class_name: DistilMTESAR
+        #    kwargs: 
+        #      use_log: false
+        #      reverse: true
 
 ignore_exceptions: false
 
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 23f6db69e..70abd8523 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -89,7 +89,7 @@ def main(args):
             instruct=getattr(args, "instruct", None),
             split=args.eval_split,
             load_from_disk=args.load_from_disk,
-            trust_remote_code=getattr(args, "trust_remote_code", False),
+            #trust_remote_code=getattr(args, "trust_remote_code", False),
             **cache_kwargs
         )
         log.info("Done with loading eval data.")
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 082f9be79..f7101582a 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -107,10 +107,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             )
             sent_relevance = R_s.sum(-1) / self.t
 
-            if use_log:
+            if self.use_log:
                 E_s = -np.log(sent_relevance)
             else:
-                if reverse:
+                if self.reverse:
                     E_s = -sent_relevance
                 else:
                     E_s = sent_relevance
@@ -276,10 +276,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             )
             sent_relevance = R_s.sum(-1)
 
-            if use_log:
+            if self.use_log:
                 E_s = -np.log(sent_relevance)
             else:
-                if reverse:
+                if self.reverse:
                     E_s = -sent_relevance
                 else:
                     E_s = sent_relevance
@@ -306,7 +306,7 @@ def __init__(
         self.reverse = reverse
 
     def __str__(self):
-        base = "DistilSentenceSAR"
+        base = "DistilSAR"
         if not self.use_log:
             base += "_no_log"
             if self.reverse:
@@ -325,31 +325,50 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             np.ndarray: float sentenceSAR for each sample in input statistics.
                 Higher values indicate more uncertain samples.
         """
-        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_token_similarity = stats["sample_token_similarity"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
 
-        sentenceSAR = []
-        for sample_log_probs, sample_sentence_similarity in zip(
-            batch_sample_log_probs, batch_sample_sentence_similarity
+        SAR = []
+        for batch_data in zip(
+            batch_sample_log_likelihoods,
+            batch_sample_token_similarity,
+            batch_sample_sentence_similarity,
         ):
-            sample_probs = np.exp(np.array(sample_log_probs))
+            sample_log_likelihoods = batch_data[0]
+            sample_token_similarity = batch_data[1]
+            sample_sentence_similarity = batch_data[2]
+
+            tokenSAR = []
+            for log_likelihoods, token_similarity in zip(
+                sample_log_likelihoods, sample_token_similarity
+            ):
+                log_likelihoods = np.array(log_likelihoods)
+                R_t = 1 - token_similarity
+                R_t_norm = R_t / R_t.sum()
+                E_t = -log_likelihoods * R_t_norm
+                tokenSAR.append(E_t.sum())
+
+            tokenSAR = np.array(tokenSAR)
+            probs_token_sar = np.exp(-tokenSAR)
+
             R_s = (
-                sample_probs
+                probs_token_sar
                 * sample_sentence_similarity
             )
             sent_relevance = R_s.sum(-1)
 
-            if use_log:
+            if self.use_log:
                 E_s = -np.log(sent_relevance)
             else:
-                if reverse:
+                if self.reverse:
                     E_s = -sent_relevance
                 else:
                     E_s = sent_relevance
 
-            sentenceSAR.append(E_s.mean())
+            SAR.append(E_s.mean())
 
-        return np.array(sentenceSAR)
+        return np.array(SAR)
 
 
 class DistilPPLSAR(Estimator):
@@ -393,7 +412,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
         sentenceSAR = []
         for sample_log_likelihoods, sample_sentence_similarity in zip(
-            batch_sample_likelihoods, batch_sample_sentence_similarity
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
             ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
@@ -403,10 +422,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             )
             sent_relevance = R_s.sum(-1)
 
-            if use_log:
+            if self.use_log:
                 E_s = -np.log(sent_relevance)
             else:
-                if reverse:
+                if self.reverse:
                     E_s = -sent_relevance
                 else:
                     E_s = sent_relevance
@@ -457,12 +476,15 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
         sentenceSAR = []
         for sample_log_likelihoods, sample_sentence_similarity in zip(
-            batch_sample_likelihoods, batch_sample_sentence_similarity
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
             entropy = []
-            for lp in sample_log_likelihoods:
-                mask = ~np.isinf(lp)
-                entropy.append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask])))
+            for seq_lp in sample_log_likelihoods:
+                seq_entropy = []
+                for lp in seq_lp:
+                    mask = ~np.isinf(lp)
+                    seq_entropy.append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask])))
+                entropy.append(np.mean(seq_entropy))
 
             R_s = (
                 entropy
@@ -470,10 +492,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             )
             sent_relevance = R_s.sum(-1)
 
-            if use_log:
+            if self.use_log:
                 E_s = -np.log(sent_relevance)
             else:
-                if reverse:
+                if self.reverse:
                     E_s = -sent_relevance
                 else:
                     E_s = sent_relevance

From af30cb75c240c57f73c8c90759fdbb9571986e3e Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 31 Oct 2024 14:14:35 +0400
Subject: [PATCH 14/97] Add xsum config, different alignscore versions

---
 .../polygraph_eval_wmt14_fren_sentsar.yaml    |  69 +++++++-
 .../polygraph_eval_wmt19_deen_sentsar.yaml    |  70 ++++++++-
 .../configs/polygraph_eval_xsum_sentsar.yaml  | 148 ++++++++++++++++++
 scripts/polygraph_eval                        |   4 +-
 .../generation_metrics/alignscore.py          |  27 +++-
 5 files changed, 306 insertions(+), 12 deletions(-)
 create mode 100644 examples/configs/polygraph_eval_xsum_sentsar.yaml

diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index a67f961fa..b0635ce2a 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -57,24 +57,89 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
   - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: ReweightedSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: ReweightedSentenceSAR
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 77c0b2f62..0c0f0e730 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -56,24 +56,89 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
   - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: ReweightedSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: ReweightedSentenceSAR
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: true
 
 ignore_exceptions: false
 
@@ -82,4 +147,3 @@ deberta_batch_size: 1
 
 seed:
     - 1
-    
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
new file mode 100644
index 000000000..3af6b12f3
--- /dev/null
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -0,0 +1,148 @@
+hydra:
+  run:
+    dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+device: cpu
+
+task: ats
+
+dataset: xsum
+text_column: document
+label_column: summary
+prompt: "Here's the text and it's short one-sentence summary.\n\nText:\n{text}\n\nSummary (one sentence):\n"
+train_split: train
+eval_split: test
+max_new_tokens: 56
+load_from_disk: false
+trust_remote_code: true
+generation_params:
+  generate_until:
+    - "\n"
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_seq_ue: false
+use_tok_ue: false
+use_ens_ue: false
+
+additional_estimators:
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
+  - module: lm_polygraph.estimators.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: ReweightedSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: OtherSentenceSAR
+    kwargs: 
+      t: 1
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSentenceSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilPPLSAR
+    kwargs: 
+      use_log: false
+      reverse: true
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 70abd8523..239dc921f 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -395,7 +395,9 @@ def get_generation_metrics(args):
                 output_ignore_regex = getattr(args, "output_ignore_regex", None),
                 normalize = getattr(args, "normalize", False),
             ),
-            AlignScore(target_is_claims=False if args.task == "ats" else True),
+            AlignScore(),
+            AlignScore(target_is_claims=False),
+            AlignScore(ignore_target=True),
         ]
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
             if getattr(args, "use_claim_ue", False):
diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py
index a1f9a63d7..139b558e1 100644
--- a/src/lm_polygraph/generation_metrics/alignscore.py
+++ b/src/lm_polygraph/generation_metrics/alignscore.py
@@ -18,11 +18,13 @@ def __init__(
         ckpt_path="https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt",
         batch_size=16,
         target_is_claims=True,
+        ignore_target=False,
     ):
         super().__init__(["greedy_texts", "input_texts"], "sequence")
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.target_is_claims = target_is_claims
         self.batch_size = batch_size
+        self.ignore_target = ignore_target
         self.scorer = AlignScorer(
             model="roberta-large",
             batch_size=batch_size,
@@ -32,7 +34,14 @@ def __init__(
         )
 
     def __str__(self):
-        return "AlignScore"
+        base = "AlignScore"
+        if self.ignore_target:
+            base += "InputOutput"
+        elif self.target_is_claims:
+            base += "OutputTarget"
+        else:
+            base += "TargetOutput"
+        return base
 
     def __call__(
         self,
@@ -51,16 +60,22 @@ def __call__(
             np.ndarray: list of AlignScore Scores for each sample in input.
         """
         greedy_texts = stats["greedy_texts"]
+        input_texts = stats["input_texts"]
 
         filtered_targets = [x if len(x.strip()) else "(empty)" for x in target_texts]
         filtered_outputs = [x if len(x.strip()) else "(empty)" for x in greedy_texts]
+        filtered_inputs = [x if len(x.strip()) else "(empty)" for x in input_texts]
 
-        if self.target_is_claims:
-            claims = filtered_targets
-            contexts = filtered_outputs
-        else:
+        if self.ignore_target:
             claims = filtered_outputs
-            contexts = filtered_targets
+            contexts = filtered_inputs
+        else:
+            if self.target_is_claims:
+                claims = filtered_targets
+                contexts = filtered_outputs
+            else:
+                claims = filtered_outputs
+                contexts = filtered_targets
 
         scores = np.array(
             self.scorer.score(

From 3fad16bf37774ecbdcae71f544373fbb9984d190 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 8 Nov 2024 15:36:30 +0400
Subject: [PATCH 15/97] Smallfix

---
 src/lm_polygraph/estimators/sentence_sar.py | 2 +-
 src/lm_polygraph/utils/manager.py           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index 6f51f0ba2..48c4ca0b0 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -319,7 +319,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         ):
             sample_probs = np.exp(np.array(sample_log_probs))
             np.fill_diagonal(sample_sentence_similarity, 1)
-            
+
             R_s = (
                 sample_probs
                 * sample_sentence_similarity
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 5d7d1e03d..8e903c731 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -401,7 +401,7 @@ def __init__(
         self.metrics: Dict[Tuple[str, str, str, str], float] = {}
         self.total_bad_estimators: Dict[Estimator, float] = {}
         self.stats: Dict[str, List] = defaultdict(list)
-        self.save_stats = list(set(['greedy_texts', 'greedy_tokens']) + set(save_stats))
+        self.save_stats = list(set(['greedy_texts', 'greedy_tokens']).union(set(save_stats)))
 
         self.processors = processors
         self.ignore_exceptions = ignore_exceptions

From b1f0346da886ce44f709a9e300eaa55466282cdb Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Sat, 9 Nov 2024 19:24:22 +0400
Subject: [PATCH 16/97] Sample entropy

---
 src/lm_polygraph/stat_calculators/__init__.py |  1 +
 src/lm_polygraph/stat_calculators/entropy.py  | 26 +++++++++++++++++++
 .../utils/register_stat_calculators.py        |  1 +
 3 files changed, 28 insertions(+)

diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index 7bf5b7c21..f69abd428 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -9,6 +9,7 @@
     OPENAI_FACT_CHECK_PROMPTS,
 )
 from .entropy import EntropyCalculator
+from .entropy import SampleEntropyCalculator
 from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator
 from .greedy_alternatives_nli import (
     GreedyAlternativesNLICalculator,
diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py
index 1696007fd..02939569d 100644
--- a/src/lm_polygraph/stat_calculators/entropy.py
+++ b/src/lm_polygraph/stat_calculators/entropy.py
@@ -42,3 +42,29 @@ def __call__(
                 mask = ~np.isinf(lp)
                 entropies[-1].append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask])))
         return {"entropy": entropies}
+
+class SampleEntropyCalculator(StatCalculator):
+    def __init__(self):
+        super().__init__(["sample_entropy"], ["sample_log_likelihoods"])
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str] = None,
+        model: WhiteboxModel = None,
+        max_new_tokens: int = 100,
+        **kwargs,
+    ) -> Dict[str, np.ndarray]:
+        logprobs = dependencies["sample_log_likelihoods"]
+        entropies = []
+
+        for sample_log_probs in logprobs:
+            for token_log_probs in sample_log_probs:
+                token_log_probs = np.array(token_log_probs)
+                probabilities = np.exp(token_log_probs)
+
+                mask = ~np.isinf(token_log_probs)
+                sample_entropy = -np.sum(probabilities[mask] * token_log_probs[mask])
+            
+                entropies.append(sample_entropy)
+        return {"sample_entropy": entropies}
\ No newline at end of file
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index 7588ed1c6..2a46b5740 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -63,6 +63,7 @@ def _register(calculator_class: StatCalculator):
     else:
         _register(GreedyProbsCalculator(n_alternatives=n_ccp_alternatives))
         _register(EntropyCalculator())
+        _register(SampleEntropyCalculator())
         _register(GreedyLMProbsCalculator())
         _register(SamplingGenerationCalculator())
         _register(BartScoreCalculator())

From 73e6a6ffbbc18192125fa24b19be64d72aa77375 Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Sat, 9 Nov 2024 19:37:42 +0400
Subject: [PATCH 17/97] Add entropy-based sentence sar

---
 src/lm_polygraph/estimators/__init__.py     |  1 +
 src/lm_polygraph/estimators/sentence_sar.py | 53 +++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 539450637..17d70def6 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -71,6 +71,7 @@
     DistilSAR,
     DistilPPLSAR,
     DistilMTESAR,
+    EntropySentenceSAR,
 )
 from .sar import SAR
 from .renyi_neg import RenyiNeg
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index f7101582a..5d187b0db 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -503,3 +503,56 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
+
+
+class EntropySentenceSAR(Estimator):
+    """
+    Like SAR, but uses sample entropy calculated from token-wise log probs for each sample.
+    Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'].
+    """
+    def __init__(self, verbose: bool = False):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        self.verbose = verbose
+        self.t = 0.001
+
+    def __str__(self):
+        return "EntropySentenceSAR"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the Entropy-based sentence-level uncertainty using token-wise log-likelihoods.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): Input statistics, including:
+                * 'sample_entropy' and 'sample_sentence_similarity': entropy values and the sentence-similarity matrix of the samples.
+        
+        Returns:
+            np.ndarray: one float entropy-based SentenceSAR score per entry of the input statistics.
+                Lower values indicate less uncertainty (better predictions), higher values indicate more uncertainty.
+        """
+        # Extract the sample entropies and sentence similarities from the stats
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+
+        # Loop over each entry's sample entropies and sentence similarities
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):
+            entropy = sample_entropy
+            # Initialize the sentence relevance (R_s) using entropy
+            R_s = (
+                entropy  # Use entropy instead of probabilities
+                * sample_sentence_similarity
+                * (1 - np.eye(sample_sentence_similarity.shape[0]))  # Remove self-similarity
+            )
+
+            # Compute sentence relevance
+            sent_relevance = R_s.sum(-1) / self.t
+            # Compute SentenceSAR (Uncertainty Estimation) using entropy
+            E_s = -np.log(sent_relevance + entropy)
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
+

From d3d6723f8329ff1c3366d241ceb60164bfdee653 Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Tue, 12 Nov 2024 19:08:44 +0400
Subject: [PATCH 18/97] New sentences

---
 requirements.txt                             |  2 +-
 src/lm_polygraph/estimators/__init__.py      |  1 +
 src/lm_polygraph/estimators/sentence_sar.py  | 70 +++++++++++++++++++-
 src/lm_polygraph/stat_calculators/entropy.py | 29 +++++---
 src/lm_polygraph/stat_calculators/sample.py  | 11 ++-
 5 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 949524b00..39fc2a345 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -33,5 +33,5 @@ bert-score
 unbabel-comet==2.2.1
 nltk>=3.7,<4
 evaluate
-spacy>=3.4.0,<4
+spacy>=3.4.0,<3.8
 fastchat
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 5d66c3523..2eb69d448 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -67,6 +67,7 @@
     OtherSentenceSAR,
     ReweightedSentenceSAR,
     PPLSentenceSAR,
+    MTESentenceSAR,
     DistilSentenceSAR,
     DistilOneSentenceSAR,
     DistilSAR,
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index f30aad7ec..fede78ded 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -501,7 +501,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(sentenceSAR)
 
 
-class DistilMTESAR(Estimator):
+class MTESentenceSAR(Estimator):
     """
     Like SAR, but uses sample entropy calculated from token-wise log probs for each sample.
     Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'].
@@ -512,7 +512,7 @@ def __init__(self, verbose: bool = False):
         self.t = 0.001
 
     def __str__(self):
-        return "EntropySentenceSAR"
+        return "MTESentenceSAR"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -551,3 +551,69 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
+
+
+
+
+class DistilMTESAR(Estimator):
+    """
+    Like SAR, but uses Mean Token Entropy (MTE) calculated from token-wise log probs for each sample.
+    The per-sample entropy values are read from stats['sample_entropy'].
+    """
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "DistilMTESAR"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the sentenceSAR for each sample using Mean Token Entropy (MTE).
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * 'sample_entropy': Mean Token Entropy for each sample,
+                * 'sample_sentence_similarity': matrix with cross-encoder similarities.
+        
+        Returns:
+            np.ndarray: float sentenceSAR for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        sentenceSAR = []
+
+        # Loop over each sample's Mean Token Entropy and sentence similarities
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):
+            # Use MTE for sentence relevance calculation
+            R_s = sample_entropy * sample_sentence_similarity
+            
+            # Compute sentence relevance by summing along the last axis
+            sent_relevance = R_s.sum(-1)
+
+            # Calculate E_s with options for log transformation and reversal
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                E_s = -sent_relevance if self.reverse else sent_relevance
+
+            sentenceSAR.append(E_s.mean())
+
+        return np.array(sentenceSAR)
diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py
index 02939569d..dbc49a4c8 100644
--- a/src/lm_polygraph/stat_calculators/entropy.py
+++ b/src/lm_polygraph/stat_calculators/entropy.py
@@ -4,7 +4,8 @@
 
 from .stat_calculator import StatCalculator
 from lm_polygraph.utils.model import WhiteboxModel
-
+import torch
+from torch.nn import functional as F
 
 class EntropyCalculator(StatCalculator):
     """
@@ -45,7 +46,7 @@ def __call__(
 
 class SampleEntropyCalculator(StatCalculator):
     def __init__(self):
-        super().__init__(["sample_entropy"], ["sample_log_likelihoods"])
+        super().__init__(["sample_entropy"], ["token_distributions"])
 
     def __call__(
         self,
@@ -55,16 +56,22 @@ def __call__(
         max_new_tokens: int = 100,
         **kwargs,
     ) -> Dict[str, np.ndarray]:
-        logprobs = dependencies["sample_log_likelihoods"]
+        token_distributions = dependencies["token_distributions"]
         entropies = []
 
-        for sample_log_probs in logprobs:
-            for token_log_probs in sample_log_probs:
-                token_log_probs = np.array(token_log_probs)
-                probabilities = np.exp(token_log_probs)
+        for sample_distributions in token_distributions:
+            sample_entropies = []
+            for token_dist in sample_distributions:
+                # Convert token_dist to a numpy array first, then to a torch tensor
+                token_dist_tensor = torch.tensor(np.array(token_dist))
+
+                # Calculate entropy using torch's Categorical distribution
+                # Apply mean() in case the entropy returns a multi-element tensor
+                entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy().mean()
+                sample_entropies.append(entropy.item())  # Convert to a scalar value if needed
 
-                mask = ~np.isinf(token_log_probs)
-                sample_entropy = -np.sum(probabilities[mask] * token_log_probs[mask])
-            
-                entropies.append(sample_entropy)
+            # Calculate mean entropy for the sample
+            mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0
+            entropies.append(mean_entropy.item())
+        
         return {"sample_entropy": entropies}
\ No newline at end of file
diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index 96a447a57..6f1fb2236 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -98,6 +98,7 @@ def __init__(self, samples_n: int = 10):
                 "sample_tokens",
                 "sample_texts",
                 "sample_log_likelihoods",
+                "token_distributions",
             ],
             [],
         )
@@ -123,6 +124,7 @@ def __call__(
                 - 'sample_tokens' (List[List[List[float]]]): tokenized 'sample_texts',
                 - 'sample_log_probs' (List[List[float]]): sum of the log probabilities at each token of the sampling generation.
                 - 'sample_log_likelihoods' (List[List[List[float]]]): log probabilities at each token of the sampling generation.
+                - 'token_distributions' (List[List[List[np.ndarray]]]): softmax probability distribution over the vocabulary for each generated token of each sample.
         """
         batch: Dict[str, torch.Tensor] = model.tokenize(texts)
         batch = {k: v.to(model.device()) for k, v in batch.items()}
@@ -152,10 +154,14 @@ def __call__(
         tokens = [[] for _ in range(len(texts))]
         texts = [[] for _ in range(len(texts))]
         log_likelihoods = [[] for _ in range(len(texts))]
+        token_distributions = [[] for _ in range(len(texts))]
+
+
         if model.model_type == "Seq2SeqLM":
             sequences = [seq[1:] for seq in sequences]
+
         for i in range(len(logits)):
-            log_prob, ll, toks = 0, [], []
+            log_prob, ll, toks, distributions = 0, [], [], []
             inp_size = (
                 len(batch["input_ids"][int(i / self.samples_n)])
                 if model.model_type == "CausalLM"
@@ -168,15 +174,18 @@ def __call__(
                     break
                 ll.append(logits[i][j][cur_token].item())
                 toks.append(cur_token)
+                distributions.append(logits[i][j].softmax(dim=-1).cpu().numpy())
 
             log_likelihoods[int(i / self.samples_n)].append(ll)
             log_probs[int(i / self.samples_n)].append(log_prob)
             tokens[int(i / self.samples_n)].append(toks)
             texts[int(i / self.samples_n)].append(model.tokenizer.decode(toks))
+            token_distributions[int(i / self.samples_n)].append(distributions)
 
         return {
             "sample_log_likelihoods": log_likelihoods,
             "sample_log_probs": log_probs,
             "sample_tokens": tokens,
             "sample_texts": texts,
+            "token_distributions": token_distributions,
         }

From 3504d82cfce7f5d3be94c4f2bb7f4dfe14742528 Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Mon, 18 Nov 2024 14:38:21 +0400
Subject: [PATCH 19/97] Small fix to sample entropy

---
 src/lm_polygraph/stat_calculators/entropy.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py
index dbc49a4c8..eaa07dee1 100644
--- a/src/lm_polygraph/stat_calculators/entropy.py
+++ b/src/lm_polygraph/stat_calculators/entropy.py
@@ -66,9 +66,8 @@ def __call__(
                 token_dist_tensor = torch.tensor(np.array(token_dist))
 
                 # Calculate entropy using torch's Categorical distribution
-                # Apply mean() in case the entropy returns a multi-element tensor
-                entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy().mean()
-                sample_entropies.append(entropy.item())  # Convert to a scalar value if needed
+                entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy()
+                sample_entropies.append(entropy.item()) 
 
             # Calculate mean entropy for the sample
             mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0

From 877a3dddf7d9ad8f81673c63a0d85fcf784ad535 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 19 Nov 2024 16:26:19 +0400
Subject: [PATCH 20/97] Save some additional stats for samples

---
 examples/configs/polygraph_eval_xsum_sentsar.yaml | 5 +++++
 scripts/polygraph_eval                            | 1 +
 2 files changed, 6 insertions(+)

diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index 3af6b12f3..c78c143ba 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -25,6 +25,11 @@ trust_remote_code: true
 generation_params:
   generate_until:
     - "\n"
+save_stats:
+  - sample_tokens
+  - sample_texts
+  - sample_log_probs
+  - sample_sentence_similarity
 
 train_dataset: null
 train_test_split: false
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 239dc921f..9fac71216 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -201,6 +201,7 @@ def main(args):
             ensemble_model=ensemble_model,
             cache_path=args.cache_path,
             language=getattr(args, 'language', 'en'),
+            save_stats=getattr(args, 'save_stats', []),
         )
 
         man()

From 9906efc559adc0d007acb7fdaf12f93229533d99 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 19 Nov 2024 16:35:15 +0400
Subject: [PATCH 21/97] Use MTE sar

---
 .../configs/polygraph_eval_xsum_sentsar.yaml     | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index c78c143ba..1203c1408 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -85,6 +85,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
@@ -143,6 +146,19 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: true
 
 ignore_exceptions: false
 

From 7c89d1141b20de727efd138ecb7447d25dd50570 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 21 Nov 2024 08:35:36 +0400
Subject: [PATCH 22/97] Add batch iteration in MTE sar

---
 src/lm_polygraph/stat_calculators/entropy.py | 23 ++++++++++----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py
index eaa07dee1..44e592316 100644
--- a/src/lm_polygraph/stat_calculators/entropy.py
+++ b/src/lm_polygraph/stat_calculators/entropy.py
@@ -56,21 +56,22 @@ def __call__(
         max_new_tokens: int = 100,
         **kwargs,
     ) -> Dict[str, np.ndarray]:
-        token_distributions = dependencies["token_distributions"]
+        batch_distributions = dependencies["token_distributions"]
         entropies = []
+        
+        for input_distributions in batch_distributions:
+            for sample_distributions in input_distributions:
+                sample_entropies = []
+                for token_dist in sample_distributions:
+                    # Convert token_dist to a numpy array first, then to a torch tensor
+                    token_dist_tensor = torch.tensor(np.array(token_dist))
 
-        for sample_distributions in token_distributions:
-            sample_entropies = []
-            for token_dist in sample_distributions:
-                # Convert token_dist to a numpy array first, then to a torch tensor
-                token_dist_tensor = torch.tensor(np.array(token_dist))
-
-                # Calculate entropy using torch's Categorical distribution
-                entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy()
-                sample_entropies.append(entropy.item()) 
+                    # Calculate entropy using torch's Categorical distribution
+                    entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy()
+                    sample_entropies.append(entropy.item()) 
 
             # Calculate mean entropy for the sample
             mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0
             entropies.append(mean_entropy.item())
         
-        return {"sample_entropy": entropies}
\ No newline at end of file
+        return {"sample_entropy": entropies}

From 4fcd0301ebabaa709bce46e82463a1c2a836db24 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 21 Nov 2024 17:52:12 +0400
Subject: [PATCH 23/97] Make xsum work

---
 scripts/polygraph_eval            | 2 +-
 src/lm_polygraph/utils/manager.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 9fac71216..7a61f524e 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -89,7 +89,7 @@ def main(args):
             instruct=getattr(args, "instruct", None),
             split=args.eval_split,
             load_from_disk=args.load_from_disk,
-            #trust_remote_code=getattr(args, "trust_remote_code", False),
+            trust_remote_code=getattr(args, "trust_remote_code", False),
             **cache_kwargs
         )
         log.info("Done with loading eval data.")
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 8e903c731..88d8494cc 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -478,7 +478,11 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
 
             for key in self.save_stats:
                 if key in batch_stats.keys():
-                    self.stats[key] += batch_stats[key]
+                    try:
+                        self.stats[key] += list(batch_stats[key])
+                    except:
+                        breakpoint()
+                        pass
             for processor in self.processors:
                 processor.on_batch(batch_stats, batch_gen_metrics, batch_estimations)
 

From 65c91f0c034f21d40c92cfe43fe9e89eb378043c Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 22 Nov 2024 19:58:03 +0400
Subject: [PATCH 24/97] Prevent generation of newlines only

---
 src/lm_polygraph/utils/model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index 4ab587f27..de4994b0c 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -355,6 +355,10 @@ def __init__(
         def __call__(self, input_ids, scores, **kwargs) -> bool:
             # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
             lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
+            
+            # Do not stop generation if stop sequence is the first thing generated
+            if lookback_ids_batch.shape[1] < 2:
+                return False
 
             lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
 

From 2007f63f44fd8149cded1a76c91abd4c36461958 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 22 Nov 2024 20:01:38 +0400
Subject: [PATCH 25/97] Consistent method set

---
 .../polygraph_eval_triviaqa_sentsar.yaml      | 51 ++++++++-----------
 .../polygraph_eval_wmt14_fren_sentsar.yaml    | 19 +++++++
 .../polygraph_eval_wmt19_deen_sentsar.yaml    | 19 +++++++
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  3 ++
 4 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 1a9179268..8a9fbf367 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -58,6 +58,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}
@@ -83,6 +86,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
@@ -107,27 +113,14 @@ additional_estimators:
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSentenceSAR
-    kwargs:
-      use_log: false
-      reverse: true
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
-    kwargs:
+    kwargs: 
       use_log: false
       reverse: false
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilOneSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilOneSentenceSAR
-    kwargs:
+    class_name: DistilSentenceSAR
+    kwargs: 
       use_log: false
       reverse: true
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilOneSentenceSAR
-    kwargs:
-      use_log: false
-      reverse: false
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSAR
     kwargs: {}
@@ -154,19 +147,19 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
-        #  - module: lm_polygraph.estimators.sentence_sar
-        #    class_name: DistilMTESAR
-        #    kwargs: {}
-        #  - module: lm_polygraph.estimators.sentence_sar
-        #    class_name: DistilMTESAR
-        #    kwargs: 
-        #      use_log: false
-        #      reverse: false
-        #  - module: lm_polygraph.estimators.sentence_sar
-        #    class_name: DistilMTESAR
-        #    kwargs: 
-        #      use_log: false
-        #      reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index b0635ce2a..c8684afc4 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -57,6 +57,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}
@@ -82,6 +85,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
@@ -140,6 +146,19 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 0c0f0e730..0210310b4 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -56,6 +56,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}
@@ -81,6 +84,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSentenceSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: OtherSentenceSAR
     kwargs: {}
@@ -139,6 +145,19 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: DistilMTESAR
+    kwargs: 
+      use_log: false
+      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index 1203c1408..ed8d86730 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -60,6 +60,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}

From 303d5792ea8f696d4ef1e1ecbf53062ea4f53ac1 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sat, 23 Nov 2024 12:15:17 +0400
Subject: [PATCH 26/97] Fix lookback procedure

---
 src/lm_polygraph/utils/model.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index de4994b0c..c38fb118b 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -356,17 +356,14 @@ def __call__(self, input_ids, scores, **kwargs) -> bool:
             # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
             lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
             
-            # Do not stop generation if stop sequence is the first thing generated
-            if lookback_ids_batch.shape[1] < 2:
-                return False
-
             lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
 
             lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
 
             for i, done in enumerate(self.done_tracker):
                 if not done:
-                    self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
+                    # Stop generation if the stop sequence is in the lookback tokens but doesn't start with stop sequence
+                    self.done_tracker[i] = (self.sequence in lookback_tokens_batch[i] and not lookback_tokens_batch[i][: len(self.sequence)] == self.sequence)
             return False not in self.done_tracker
 
     def get_stopping_criteria(self, input_ids: torch.Tensor):

From 0978d2a883db95a4b59a1e6a064dee7093db1325 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sat, 23 Nov 2024 12:40:22 +0400
Subject: [PATCH 27/97] One more stopping criterion fix

---
 src/lm_polygraph/utils/model.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index c38fb118b..f22901919 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -362,8 +362,11 @@ def __call__(self, input_ids, scores, **kwargs) -> bool:
 
             for i, done in enumerate(self.done_tracker):
                 if not done:
-                    # Stop generation if the stop sequence is in the lookback tokens but doesn't start with stop sequence
-                    self.done_tracker[i] = (self.sequence in lookback_tokens_batch[i] and not lookback_tokens_batch[i][: len(self.sequence)] == self.sequence)
+                    lookback_tokens_batch_i = lookback_tokens_batch[i]
+                    # Remove stop sequence from the begginning of the lookback tokens if it is there
+                    if len(lookback_tokens_batch_i) >= len(self.sequence) and lookback_tokens_batch_i[: len(self.sequence)] == self.sequence:
+                        lookback_tokens_batch_i = lookback_tokens_batch_i[len(self.sequence) :]
+                    self.done_tracker[i] = self.sequence in lookback_tokens_batch_i
             return False not in self.done_tracker
 
     def get_stopping_criteria(self, input_ids: torch.Tensor):

From c8d06dd224cb3456e6aa06cc93e648f7aa3685bd Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 25 Nov 2024 13:26:49 +0400
Subject: [PATCH 28/97] Use stop string criteria from transformers to stop
 generation early

---
 src/lm_polygraph/utils/model.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index f22901919..7469b19b0 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -17,6 +17,7 @@
     BartForConditionalGeneration,
     StoppingCriteria,
     StoppingCriteriaList,
+    StopStringCriteria,
     PreTrainedTokenizer,
 )
 
@@ -375,10 +376,11 @@ def get_stopping_criteria(self, input_ids: torch.Tensor):
         return StoppingCriteriaList(
             [
                 *[
-                    self._MultiTokenEOSCriteria(
-                        sequence, self.tokenizer, input_ids.shape[1], input_ids.shape[0]
-                    )
-                    for sequence in stop_sequences
+                    #self._MultiTokenEOSCriteria(
+                    #    sequence, self.tokenizer, input_ids.shape[1], input_ids.shape[0]
+                    #)
+                    #for sequence in stop_sequences
+                    StopStringCriteria(self.tokenizer, stop_sequences)
                 ],
             ]
         )

From 49fbbc2582df1ddfc94283521b2bd6b64bf4d5ba Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 25 Nov 2024 13:50:24 +0400
Subject: [PATCH 29/97] Rollback

---
 src/lm_polygraph/utils/model.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index 7469b19b0..f22901919 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -17,7 +17,6 @@
     BartForConditionalGeneration,
     StoppingCriteria,
     StoppingCriteriaList,
-    StopStringCriteria,
     PreTrainedTokenizer,
 )
 
@@ -376,11 +375,10 @@ def get_stopping_criteria(self, input_ids: torch.Tensor):
         return StoppingCriteriaList(
             [
                 *[
-                    #self._MultiTokenEOSCriteria(
-                    #    sequence, self.tokenizer, input_ids.shape[1], input_ids.shape[0]
-                    #)
-                    #for sequence in stop_sequences
-                    StopStringCriteria(self.tokenizer, stop_sequences)
+                    self._MultiTokenEOSCriteria(
+                        sequence, self.tokenizer, input_ids.shape[1], input_ids.shape[0]
+                    )
+                    for sequence in stop_sequences
                 ],
             ]
         )

From 6f41f7efd004f69517979aa5d62dfd5785430ded Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 25 Nov 2024 15:30:46 +0400
Subject: [PATCH 30/97] Top K entropy

---
 .../polygraph_eval_triviaqa_sentsar.yaml      | 82 ++++++++-----------
 .../polygraph_eval_wmt14_fren_sentsar.yaml    | 82 ++++++++-----------
 .../polygraph_eval_wmt19_deen_sentsar.yaml    | 82 ++++++++-----------
 .../configs/polygraph_eval_xsum_sentsar.yaml  | 77 +++++++----------
 scripts/polygraph_eval                        |  1 +
 src/lm_polygraph/stat_calculators/entropy.py  | 34 ++++++--
 src/lm_polygraph/stat_calculators/sample.py   |  2 +-
 src/lm_polygraph/utils/manager.py             |  4 +-
 .../utils/register_stat_calculators.py        |  5 +-
 9 files changed, 169 insertions(+), 200 deletions(-)

diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 8a9fbf367..dcc0ffb30 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -26,6 +26,12 @@ normalize: true
 generation_params:
   generate_until:
     - "\n"
+save_stats:
+  - sample_tokens
+  - sample_texts
+  - sample_log_probs
+  - sample_sentence_similarity
+entropy_top_k: 50
 
 train_dataset: null
 train_test_split: false
@@ -49,65 +55,22 @@ generation_metrics: null
 ens_type: 
 
 additional_estimators:
-  - module: lm_polygraph.estimators.max_probability
-    class_name: MaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.perplexity
-    class_name: Perplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_sar
-    class_name: TokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: MeanTokenEntropy
-    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
     class_name: MonteCarloNormalizedSequenceEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
   - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: SentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: ReweightedSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
+    class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: true
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSentenceSAR
     kwargs: {}
@@ -121,6 +84,17 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSAR
     kwargs: {}
@@ -134,6 +108,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilPPLSAR
     kwargs: {}
@@ -147,6 +128,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilMTESAR
     kwargs: {}
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index c8684afc4..1c34e85e2 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -24,6 +24,12 @@ load_from_disk: false
 generation_params:
   generate_until:
     - "\n"
+save_stats:
+  - sample_tokens
+  - sample_texts
+  - sample_log_probs
+  - sample_sentence_similarity
+entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
 
@@ -48,65 +54,22 @@ use_tok_ue: false
 generation_metrics: null
 
 additional_estimators:
-  - module: lm_polygraph.estimators.max_probability
-    class_name: MaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.perplexity
-    class_name: Perplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_sar
-    class_name: TokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: MeanTokenEntropy
-    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
     class_name: MonteCarloNormalizedSequenceEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
   - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: SentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: ReweightedSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
+    class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: true
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSentenceSAR
     kwargs: {}
@@ -120,6 +83,17 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSAR
     kwargs: {}
@@ -133,6 +107,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilPPLSAR
     kwargs: {}
@@ -146,6 +127,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilMTESAR
     kwargs: {}
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 0210310b4..f6c4b1ada 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -24,6 +24,12 @@ load_from_disk: false
 generation_params:
   generate_until:
     - "\n"
+save_stats:
+  - sample_tokens
+  - sample_texts
+  - sample_log_probs
+  - sample_sentence_similarity
+entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
 
@@ -47,65 +53,22 @@ use_seq_ue: false
 use_tok_ue: false
 
 additional_estimators:
-  - module: lm_polygraph.estimators.max_probability
-    class_name: MaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.perplexity
-    class_name: Perplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_sar
-    class_name: TokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: MeanTokenEntropy
-    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
     class_name: MonteCarloNormalizedSequenceEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
   - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: SentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: ReweightedSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
+    class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: true
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSentenceSAR
     kwargs: {}
@@ -119,6 +82,17 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSAR
     kwargs: {}
@@ -132,6 +106,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilPPLSAR
     kwargs: {}
@@ -145,6 +126,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilMTESAR
     kwargs: {}
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index ed8d86730..9dcac6213 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -30,6 +30,7 @@ save_stats:
   - sample_texts
   - sample_log_probs
   - sample_sentence_similarity
+entropy_top_k: 50
 
 train_dataset: null
 train_test_split: false
@@ -51,65 +52,22 @@ use_tok_ue: false
 use_ens_ue: false
 
 additional_estimators:
-  - module: lm_polygraph.estimators.max_probability
-    class_name: MaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.perplexity
-    class_name: Perplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_sar
-    class_name: TokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: MeanTokenEntropy
-    kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
     class_name: MonteCarloSequenceEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
     class_name: MonteCarloNormalizedSequenceEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
   - module: lm_polygraph.estimators.semantic_entropy
     class_name: SemanticEntropy
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: SentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: ReweightedSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
+    class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: OtherSentenceSAR
-    kwargs: 
-      t: 1
-      use_log: false
-      reverse: true
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSentenceSAR
     kwargs: {}
@@ -123,6 +81,17 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs:
+      t: 1
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilSAR
     kwargs: {}
@@ -136,6 +105,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilPPLSAR
     kwargs: {}
@@ -149,6 +125,13 @@ additional_estimators:
     kwargs: 
       use_log: false
       reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESentenceSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: DistilMTESAR
     kwargs: {}
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 7a61f524e..ccf079f9e 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -202,6 +202,7 @@ def main(args):
             cache_path=args.cache_path,
             language=getattr(args, 'language', 'en'),
             save_stats=getattr(args, 'save_stats', []),
+            entropy_top_k=getattr(args, 'entropy_top_k', None),
         )
 
         man()
diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py
index 44e592316..0a4409797 100644
--- a/src/lm_polygraph/stat_calculators/entropy.py
+++ b/src/lm_polygraph/stat_calculators/entropy.py
@@ -12,7 +12,11 @@ class EntropyCalculator(StatCalculator):
     Calculates entropy of probabilities at each token position in the generation of a Whitebox model.
     """
 
-    def __init__(self):
+    def __init__(
+        self,
+        top_k: int = None,
+    ):
+        self.top_k = top_k
         super().__init__(["entropy"], ["greedy_log_probs"])
 
     def __call__(
@@ -40,12 +44,23 @@ def __call__(
         for s_lp in logprobs:
             entropies.append([])
             for lp in s_lp:
-                mask = ~np.isinf(lp)
-                entropies[-1].append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask])))
+                lp = torch.tensor(lp)
+                if self.top_k is not None:
+                    lp = torch.topk(lp, self.top_k).values
+                #mask = ~np.isinf(lp)
+                #lp = lp[mask]
+                #if self.top_k is not None:
+                #    lp = np.sort(lp)[-self.top_k:]
+                #entropies[-1].append(-np.sum(np.array(lp) * np.exp(lp)))
+                entropies[-1].append(torch.distributions.Categorical(logits=lp).entropy().item())
         return {"entropy": entropies}
 
 class SampleEntropyCalculator(StatCalculator):
-    def __init__(self):
+    def __init__(
+        self,
+        top_k: int = None,
+    ):
+        self.top_k = top_k
         super().__init__(["sample_entropy"], ["token_distributions"])
 
     def __call__(
@@ -58,20 +73,23 @@ def __call__(
     ) -> Dict[str, np.ndarray]:
         batch_distributions = dependencies["token_distributions"]
         entropies = []
-        
+
         for input_distributions in batch_distributions:
             for sample_distributions in input_distributions:
                 sample_entropies = []
                 for token_dist in sample_distributions:
                     # Convert token_dist to a numpy array first, then to a torch tensor
-                    token_dist_tensor = torch.tensor(np.array(token_dist))
+                    token_dist_tensor = torch.tensor(token_dist)
+
+                    if self.top_k is not None:
+                        token_dist_tensor = torch.topk(token_dist_tensor, self.top_k).values
 
                     # Calculate entropy using torch's Categorical distribution
-                    entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy()
+                    entropy = torch.distributions.Categorical(logits=token_dist_tensor).entropy()
                     sample_entropies.append(entropy.item()) 
 
             # Calculate mean entropy for the sample
             mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0
             entropies.append(mean_entropy.item())
-        
+
         return {"sample_entropy": entropies}
diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index 6f1fb2236..a5f9dc3d2 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -174,7 +174,7 @@ def __call__(
                     break
                 ll.append(logits[i][j][cur_token].item())
                 toks.append(cur_token)
-                distributions.append(logits[i][j].softmax(dim=-1).cpu().numpy())
+                distributions.append(logits[i][j].cpu().numpy())
 
             log_likelihoods[int(i / self.samples_n)].append(ll)
             log_probs[int(i / self.samples_n)].append(log_prob)
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 88d8494cc..f62da1cba 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -256,7 +256,8 @@ def __init__(
         max_new_tokens: int = 100,
         background_train_dataset_max_new_tokens: int = 100,
         cache_path=os.path.expanduser("~") + "/.cache",
-        save_stats: List[str] = []
+        save_stats: List[str] = [],
+        entropy_top_k: Optional[int] = None,
     ):
         """
         Parameters:
@@ -286,6 +287,7 @@ def __init__(
             language=language,
             cache_path=cache_path,
             model=model,
+            entropy_top_k=entropy_top_k,
         )
 
         self.stat_calculators_dict = stat_calculators_dict
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index 2a46b5740..b827b7702 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -18,6 +18,7 @@ def register_stat_calculators(
     n_ccp_alternatives: int = 10,
     cache_path=os.path.expanduser("~") + "/.cache",
     model: Model = None,
+    entropy_top_k: Optional[int] = None,
 ) -> Tuple[Dict[str, "StatCalculator"], Dict[str, List[str]]]:
     """
     Registers all available statistic calculators to be seen by UEManager for properly organizing the calculations
@@ -62,8 +63,8 @@ def _register(calculator_class: StatCalculator):
         _register(BlackboxSamplingGenerationCalculator())
     else:
         _register(GreedyProbsCalculator(n_alternatives=n_ccp_alternatives))
-        _register(EntropyCalculator())
-        _register(SampleEntropyCalculator())
+        _register(EntropyCalculator(top_k=entropy_top_k))
+        _register(SampleEntropyCalculator(top_k=entropy_top_k))
         _register(GreedyLMProbsCalculator())
         _register(SamplingGenerationCalculator())
         _register(BartScoreCalculator())

From bc4558387aa7fd4cd682061835cc0ab6246b8e8c Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 25 Nov 2024 16:40:31 +0400
Subject: [PATCH 31/97] Rename stuff

---
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  56 +-
 src/lm_polygraph/estimators/__init__.py       |  15 +-
 src/lm_polygraph/estimators/gsu.py            | 259 +++++++
 src/lm_polygraph/estimators/sentence_sar.py   | 653 +++++-------------
 4 files changed, 480 insertions(+), 503 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/gsu.py

diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index 9dcac6213..b157ac671 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -68,16 +68,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -88,20 +88,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -110,18 +106,18 @@ additional_estimators:
     class_name: Perplexity
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
+    class_name: PPLSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -130,18 +126,18 @@ additional_estimators:
     class_name: MeanTokenEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+    class_name: MTESAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: true
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 2eb69d448..1da22c401 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -64,17 +64,14 @@
 from .token_sar import TokenSAR
 from .sentence_sar import (
     SentenceSAR,
-    OtherSentenceSAR,
-    ReweightedSentenceSAR,
-    PPLSentenceSAR,
-    MTESentenceSAR,
-    DistilSentenceSAR,
-    DistilOneSentenceSAR,
-    DistilSAR,
-    DistilPPLSAR,
-    DistilMTESAR,
+#    OtherSentenceSAR,
+#    ReweightedSentenceSAR,
+    PPLSAR,
+    MTESAR,
+    #DistilOneSentenceSAR,
 )
 from .sar import SAR
+from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU
 from .renyi_neg import RenyiNeg
 from .fisher_rao import FisherRao
 from .verbalized_1s import Verbalized1S
diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py
new file mode 100644
index 000000000..8b192c699
--- /dev/null
+++ b/src/lm_polygraph/estimators/gsu.py
@@ -0,0 +1,259 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+
+
+class MaxprobGSU(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "MaxprobGSU"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Scores each input by the similarity-weighted probabilities of its sampled generations.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * corresponding log probabilities in 'sample_log_probs' (exponentiated here),
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: per input, the mean of -log(r) if use_log, else of r (of -r if reverse),
+                where r is the sum over the last axis of exp(sample log prob) * similarity.
+        """
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        GSU = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = np.exp(np.array(sample_log_probs))
+            R_s = (
+                sample_probs
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                E_s = -sent_relevance if self.reverse else sent_relevance
+
+            GSU.append(E_s.mean())
+
+        return np.array(GSU)
+
+
+class PPLGSU(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "PPLGSU"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Scores each input by the similarity-weighted length-normalized likelihoods of its samples.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * token-wise log-likelihoods of every sample in 'sample_log_likelihoods',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: per input, the mean of -log(r) if use_log, else of r (of -r if reverse),
+                where r sums exp(mean token log-likelihood) * similarity over the last axis.
+        """
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        GSU = []
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        ):
+            ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+
+            R_s = (
+                ppl
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                E_s = -sent_relevance if self.reverse else sent_relevance
+
+            GSU.append(E_s.mean())
+
+        return np.array(GSU)
+
+
+class TokenSARGSU(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(
+            [
+                "sample_sentence_similarity",
+                "sample_log_likelihoods",
+                "sample_token_similarity",
+            ],
+            "sequence",
+        )
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "TokenSARGSU"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Scores each input by the similarity-weighted exp(-tokenSAR) of its sampled generations.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * log p(y_i | y_<i, x) for each sample in 'sample_log_likelihoods'
+                * similarity for each sample of the generated text and generated text without one token for each token in 'sample_token_similarity',
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: per input, the mean of -log(r) if use_log, else of r (of -r if reverse),
+                where r sums exp(-tokenSAR) * similarity over the last axis.
+        """
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_token_similarity = stats["sample_token_similarity"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        GSU = []
+        for batch_data in zip(
+            batch_sample_log_likelihoods,
+            batch_sample_token_similarity,
+            batch_sample_sentence_similarity,
+        ):
+            sample_log_likelihoods = batch_data[0]
+            sample_token_similarity = batch_data[1]
+            sample_sentence_similarity = batch_data[2]
+
+            tokenSAR = []
+            for log_likelihoods, token_similarity in zip(
+                sample_log_likelihoods, sample_token_similarity
+            ):
+                log_likelihoods = np.array(log_likelihoods)
+                R_t = 1 - token_similarity
+                R_t_norm = R_t / R_t.sum()
+                E_t = -log_likelihoods * R_t_norm
+                tokenSAR.append(E_t.sum())
+
+            tokenSAR = np.array(tokenSAR)
+            probs_token_sar = np.exp(-tokenSAR)
+            R_s = (
+                probs_token_sar
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                E_s = -sent_relevance if self.reverse else sent_relevance
+
+            GSU.append(E_s.mean())
+
+        return np.array(GSU)
+
+
+class MTEGSU(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "MTEGSU"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Scores each input by the similarity-weighted Mean Token Entropy (MTE) of its samples.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * 'sample_entropy': Mean Token Entropy for each sample,
+                * 'sample_sentence_similarity': matrix with cross-encoder similarities.
+
+        Returns:
+            np.ndarray: per input, the mean of -log(r) if use_log, else of r (of -r if reverse),
+                where r is the sum over the last axis of sample_entropy * similarity.
+        """
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        GSU = []
+        # Loop over each sample's Mean Token Entropy and sentence similarities
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):
+            # NOTE(review): the entropy is used directly as the weight, without any exp() - confirm intended
+            R_s = sample_entropy * sample_sentence_similarity
+            
+            # Compute sentence relevance by summing along the last axis
+            sent_relevance = R_s.sum(-1)
+
+            # Calculate E_s with options for log transformation and reversal
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                E_s = -sent_relevance if self.reverse else sent_relevance
+
+            GSU.append(E_s.mean())
+
+        return np.array(GSU)
diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index fede78ded..c2ff21395 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -55,126 +55,127 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(sentenceSAR)
 
 
-class OtherSentenceSAR(Estimator):
-    """
-    Like SAR, but only looks at other samples for each sample in the output.
-    """
-
-    def __init__(
-        self,
-        verbose: bool = False,
-        t: float = 0.001,
-        use_log: bool = True,
-        reverse: bool = False
-    ):
-        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
-        self.verbose = verbose
-        self.t = t
-        self.use_log = use_log
-        self.reverse = reverse
-
-    def __str__(self):
-        base = f"OtherSentenceSAR_t{self.t}"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the sentenceSAR for each sample in the input statistics.
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * corresponding log probabilities in 'sample_log_probs',
-                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
-        Returns:
-            np.ndarray: float sentenceSAR for each sample in input statistics.
-                Higher values indicate more uncertain samples.
-        """
-        batch_sample_log_probs = stats["sample_log_probs"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-
-        sentenceSAR = []
-        for sample_log_probs, sample_sentence_similarity in zip(
-            batch_sample_log_probs, batch_sample_sentence_similarity
-        ):
-            sample_probs = np.exp(np.array(sample_log_probs))
-            R_s = (
-                sample_probs
-                * sample_sentence_similarity
-                * (1 - np.eye(sample_sentence_similarity.shape[0]))
-            )
-            sent_relevance = R_s.sum(-1) / self.t
-
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                if self.reverse:
-                    E_s = sent_relevance
-                else:
-                    E_s = -sent_relevance
-
-            sentenceSAR.append(E_s.mean())
-
-        return np.array(sentenceSAR)
-
-
-class ReweightedSentenceSAR(Estimator):
-    """
-    Like SAR, but normalizes similarity-based scores at each iteration
-    alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k))
-    K - number of samples in output minus one
-    """
-    def __init__(self, verbose: bool = False):
-        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
-        self.verbose = verbose
-        self.t = 0.001
-
-    def __str__(self):
-        return "ReweightedSentenceSAR"
-    
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_sample_log_probs = stats["sample_log_probs"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-
-        sentenceSAR = []
-
-        for sample_log_probs, sample_sentence_similarity in zip(
-            batch_sample_log_probs, batch_sample_sentence_similarity
-        ):
-            # Compute probabilities from log probabilities
-            sample_probs = np.exp(np.array(sample_log_probs))
-            
-            # Initialize alpha_ij (reweighted sentence similarities)
-            alpha_ij = np.zeros_like(sample_sentence_similarity)
-
-            # Normalize similarity-based scores at each iteration 
-            for i in range(sample_sentence_similarity.shape[0]):
-                similarity_row = sample_sentence_similarity[i]
-                # Exclude self-similarity g(s_i, s_i)
-                similarity_row_without_self = similarity_row * (1 - np.eye(len(similarity_row)))[i]
-                sum_similarity = np.sum(similarity_row_without_self)
-                
-                if sum_similarity > 0:
-                    alpha_ij[i] = similarity_row_without_self / sum_similarity
-                else:
-                    alpha_ij[i] = similarity_row_without_self  # If the normalization factor is 0, leave the row unchanged
-
-            # Compute sentence relevance using normalized alpha_ij
-            R_s = sample_probs * alpha_ij
-            sent_relevance = R_s.sum(-1) / self.t
-
-            # Compute SentenceSAR (Uncertainty Estimation)
-            E_s = -np.log(sent_relevance + sample_probs)
-            sentenceSAR.append(E_s.mean())
-
-        return np.array(sentenceSAR)
-
-
-
-class PPLSentenceSAR(Estimator):
+#class OtherSentenceSAR(Estimator):
+#    """
+#    Like SAR, but only looks at other samples for each sample in the output.
+#    """
+#
+#    def __init__(
+#        self,
+#        verbose: bool = False,
+#        t: float = 0.001,
+#        use_log: bool = True,
+#        reverse: bool = False
+#    ):
+#        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+#        self.verbose = verbose
+#        self.t = t
+#        self.use_log = use_log
+#        self.reverse = reverse
+#
+#    def __str__(self):
+#        base = f"OtherSentenceSAR_t{self.t}"
+#        if not self.use_log:
+#            base += "_no_log"
+#            if self.reverse:
+#                base += "_reverse"
+#        return base
+#
+#    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+#        """
+#        Estimates the sentenceSAR for each sample in the input statistics.
+#
+#        Parameters:
+#            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+#                * corresponding log probabilities in 'sample_log_probs',
+
+#                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+#        Returns:
+#            np.ndarray: float sentenceSAR for each sample in input statistics.
+#                Higher values indicate more uncertain samples.
+#        """
+#        batch_sample_log_probs = stats["sample_log_probs"]
+#        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+#
+#        sentenceSAR = []
+#        for sample_log_probs, sample_sentence_similarity in zip(
+#            batch_sample_log_probs, batch_sample_sentence_similarity
+#        ):
+#            sample_probs = np.exp(np.array(sample_log_probs))
+#            R_s = (
+#                sample_probs
+#                * sample_sentence_similarity
+#                * (1 - np.eye(sample_sentence_similarity.shape[0]))
+#            )
+#            sent_relevance = R_s.sum(-1) / self.t
+#
+#            if self.use_log:
+#                E_s = -np.log(sent_relevance)
+#            else:
+#                if self.reverse:
+#                    E_s = sent_relevance
+#                else:
+#                    E_s = -sent_relevance
+#
+#            sentenceSAR.append(E_s.mean())
+#
+#        return np.array(sentenceSAR)
+#
+#
+#class ReweightedSentenceSAR(Estimator):
+#    """
+#    Like SAR, but normalizes similarity-based scores at each iteration
+#    alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k))
+#    K - number of samples in output minus one
+#    """
+#    def __init__(self, verbose: bool = False):
+#        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+#        self.verbose = verbose
+#        self.t = 0.001
+#
+#    def __str__(self):
+#        return "ReweightedSentenceSAR"
+#    
+#    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+#        batch_sample_log_probs = stats["sample_log_probs"]
+#        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+#
+#        sentenceSAR = []
+#
+#        for sample_log_probs, sample_sentence_similarity in zip(
+#            batch_sample_log_probs, batch_sample_sentence_similarity
+#        ):
+#            # Compute probabilities from log probabilities
+#            sample_probs = np.exp(np.array(sample_log_probs))
+#            
+#            # Initialize alpha_ij (reweighted sentence similarities)
+#            alpha_ij = np.zeros_like(sample_sentence_similarity)
+#
+#            # Normalize similarity-based scores at each iteration 
+#            for i in range(sample_sentence_similarity.shape[0]):
+#                similarity_row = sample_sentence_similarity[i]
+#                # Exclude self-similarity g(s_i, s_i)
+#                similarity_row_without_self = similarity_row * (1 - np.eye(len(similarity_row)))[i]
+#                sum_similarity = np.sum(similarity_row_without_self)
+#                
+#                if sum_similarity > 0:
+#                    alpha_ij[i] = similarity_row_without_self / sum_similarity
+#                else:
+#                    alpha_ij[i] = similarity_row_without_self  # If the normalization factor is 0, leave the row unchanged
+#
+#            # Compute sentence relevance using normalized alpha_ij
+#            R_s = sample_probs * alpha_ij
+#            sent_relevance = R_s.sum(-1) / self.t
+#
+#            # Compute SentenceSAR (Uncertainty Estimation)
+#            E_s = -np.log(sent_relevance + sample_probs)
+#            sentenceSAR.append(E_s.mean())
+#
+#        return np.array(sentenceSAR)
+
+
+
+class PPLSAR(Estimator):
     """
     Like SAR, but uses log probs normalized by sample length in tokens to calculate PPL (Perplexity).
     Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'].
@@ -185,7 +186,7 @@ def __init__(self, verbose: bool = False):
         self.t = 0.001
 
     def __str__(self):
-        return "PPLSentenceSAR"
+        return "PPLSAR"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -226,282 +227,72 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
         return np.array(sentenceSAR)
 
-
-class DistilSentenceSAR(Estimator):
-    """
-    Like SAR, but only looks at other samples for each sample in the output.
-    """
-
-    def __init__(
-        self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
-    ):
-        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
-        self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
-
-    def __str__(self):
-        base = "DistilSentenceSAR"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the sentenceSAR for each sample in the input statistics.
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * corresponding log probabilities in 'sample_log_probs',
-                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
-        Returns:
-            np.ndarray: float sentenceSAR for each sample in input statistics.
-                Higher values indicate more uncertain samples.
-        """
-        batch_sample_log_probs = stats["sample_log_probs"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-
-        sentenceSAR = []
-        for sample_log_probs, sample_sentence_similarity in zip(
-            batch_sample_log_probs, batch_sample_sentence_similarity
-        ):
-            sample_probs = np.exp(np.array(sample_log_probs))
-            R_s = (
-                sample_probs
-                * sample_sentence_similarity
-            )
-            sent_relevance = R_s.sum(-1)
-
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                if self.reverse:
-                    E_s = sent_relevance
-                else:
-                    E_s = -sent_relevance
-
-            sentenceSAR.append(E_s.mean())
-
-        return np.array(sentenceSAR)
-
-
-class DistilOneSentenceSAR(Estimator):
-    """
-    Like SAR, but only looks at other samples for each sample in the output.
-    """
-
-    def __init__(
-        self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
-    ):
-        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
-        self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
-
-    def __str__(self):
-        base = f"DistilOneSentenceSAR"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the sentenceSAR for each sample in the input statistics.
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * corresponding log probabilities in 'sample_log_probs',
-                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
-        Returns:
-            np.ndarray: float sentenceSAR for each sample in input statistics.
-                Higher values indicate more uncertain samples.
-        """
-        batch_sample_log_probs = stats["sample_log_probs"]
-        batch_sample_sentence_similarity = deepcopy(stats["sample_sentence_similarity"])
-
-        sentenceSAR = []
-        for sample_log_probs, sample_sentence_similarity in zip(
-            batch_sample_log_probs, batch_sample_sentence_similarity
-        ):
-            sample_probs = np.exp(np.array(sample_log_probs))
-            np.fill_diagonal(sample_sentence_similarity, 1)
-
-            R_s = (
-                sample_probs
-                * sample_sentence_similarity
-            )
-            sent_relevance = R_s.sum(-1)
-
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                if self.reverse:
-                    E_s = sent_relevance
-                else:
-                    E_s = -sent_relevance
-
-            SAR.append(E_s.mean())
-
-        return np.array(SAR)
-
-
-class DistilSAR(Estimator):
-    """
-    Like SAR, but only looks at other samples for each sample in the output.
-    """
-
-    def __init__(
-        self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
-    ):
-        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
-        self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
-
-    def __str__(self):
-        base = "DistilSAR"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the sentenceSAR for each sample in the input statistics.
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * corresponding log probabilities in 'sample_log_probs',
-                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
-        Returns:
-            np.ndarray: float sentenceSAR for each sample in input statistics.
-                Higher values indicate more uncertain samples.
-        """
-        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
-        batch_sample_token_similarity = stats["sample_token_similarity"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-
-        SAR = []
-        for batch_data in zip(
-            batch_sample_log_likelihoods,
-            batch_sample_token_similarity,
-            batch_sample_sentence_similarity,
-        ):
-            sample_log_likelihoods = batch_data[0]
-            sample_token_similarity = batch_data[1]
-            sample_sentence_similarity = batch_data[2]
-
-            tokenSAR = []
-            for log_likelihoods, token_similarity in zip(
-                sample_log_likelihoods, sample_token_similarity
-            ):
-                log_likelihoods = np.array(log_likelihoods)
-                R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
-                E_t = -log_likelihoods * R_t_norm
-                tokenSAR.append(E_t.sum())
-
-            tokenSAR = np.array(tokenSAR)
-            probs_token_sar = np.exp(-tokenSAR)
-
-            R_s = (
-                probs_token_sar
-                * sample_sentence_similarity
-            )
-            sent_relevance = R_s.sum(-1)
-
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                if self.reverse:
-                    E_s = sent_relevance
-                else:
-                    E_s = -sent_relevance
-
-            SAR.append(E_s.mean())
-
-        return np.array(SAR)
-
-
-class DistilPPLSAR(Estimator):
-    """
-    Like SAR, but only looks at other samples for each sample in the output.
-    """
-
-    def __init__(
-        self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
-    ):
-        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
-        self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
-
-    def __str__(self):
-        base = "DistilPPLSAR"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the sentenceSAR for each sample in the input statistics.
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * corresponding log probabilities in 'sample_log_probs',
-                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
-        Returns:
-            np.ndarray: float sentenceSAR for each sample in input statistics.
-                Higher values indicate more uncertain samples.
-        """
-        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-
-        sentenceSAR = []
-        for sample_log_likelihoods, sample_sentence_similarity in zip(
-            batch_sample_log_likelihoods, batch_sample_sentence_similarity
-        ):
-            ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
-
-            R_s = (
-                ppl
-                * sample_sentence_similarity
-            )
-            sent_relevance = R_s.sum(-1)
-
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                if self.reverse:
-                    E_s = -sent_relevance
-                else:
-                    E_s = sent_relevance
-
-            sentenceSAR.append(E_s.mean())
-
-        return np.array(sentenceSAR)
-
-
-class MTESentenceSAR(Estimator):
+#class DistilOneSentenceSAR(Estimator):
+#    """
+#    Like SAR, but only looks at other samples for each sample in the output.
+#    """
+#
+#    def __init__(
+#        self,
+#        verbose: bool = False,
+#        use_log: bool = True,
+#        reverse: bool = False
+#    ):
+#        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+#        self.verbose = verbose
+#        self.use_log = use_log
+#        self.reverse = reverse
+#
+#    def __str__(self):
+#        base = f"DistilOneSentenceSAR"
+#        if not self.use_log:
+#            base += "_no_log"
+#            if self.reverse:
+#                base += "_reverse"
+#        return base
+#
+#    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+#        """
+#        Estimates the sentenceSAR for each sample in the input statistics.
+#
+#        Parameters:
+#            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+#                * corresponding log probabilities in 'sample_log_probs',
+#                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+#        Returns:
+#            np.ndarray: float sentenceSAR for each sample in input statistics.
+#                Higher values indicate more uncertain samples.
+#        """
+#        batch_sample_log_probs = stats["sample_log_probs"]
+#        batch_sample_sentence_similarity = deepcopy(stats["sample_sentence_similarity"])
+#
+#        sentenceSAR = []
+#        for sample_log_probs, sample_sentence_similarity in zip(
+#            batch_sample_log_probs, batch_sample_sentence_similarity
+#        ):
+#            sample_probs = np.exp(np.array(sample_log_probs))
+#            np.fill_diagonal(sample_sentence_similarity, 1)
+#
+#            R_s = (
+#                sample_probs
+#                * sample_sentence_similarity
+#            )
+#            sent_relevance = R_s.sum(-1)
+#
+#            if self.use_log:
+#                E_s = -np.log(sent_relevance)
+#            else:
+#                if self.reverse:
+#                    E_s = sent_relevance
+#                else:
+#                    E_s = -sent_relevance
+#
+#            sentenceSAR.append(E_s.mean())
+#
+#        return np.array(sentenceSAR)
+
+
+class MTESAR(Estimator):
     """
     Like SAR, but uses sample entropy calculated from token-wise log probs for each sample.
     Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'].
@@ -512,7 +303,7 @@ def __init__(self, verbose: bool = False):
         self.t = 0.001
 
     def __str__(self):
-        return "MTESentenceSAR"
+        return "MTESAR"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -551,69 +342,3 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)
-
-
-
-
-class DistilMTESAR(Estimator):
-    """
-    Like SAR, but uses Mean Token Entropy (MTE) calculated from token-wise log probs for each sample.
-    Token-wise log-likelihoods are available in stats['sample_entropy'].
-    """
-
-    def __init__(
-        self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
-    ):
-        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
-        self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
-
-    def __str__(self):
-        base = "DistilMTESAR"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the sentenceSAR for each sample using Mean Token Entropy (MTE).
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * 'sample_entropy': Mean Token Entropy for each sample,
-                * 'sample_sentence_similarity': matrix with cross-encoder similarities.
-        
-        Returns:
-            np.ndarray: float sentenceSAR for each sample in input statistics.
-                Higher values indicate more uncertain samples.
-        """
-        batch_sample_entropy = stats["sample_entropy"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-
-        sentenceSAR = []
-
-        # Loop over each sample's Mean Token Entropy and sentence similarities
-        for sample_entropy, sample_sentence_similarity in zip(
-            batch_sample_entropy, batch_sample_sentence_similarity
-        ):
-            # Use MTE for sentence relevance calculation
-            R_s = sample_entropy * sample_sentence_similarity
-            
-            # Compute sentence relevance by summing along the last axis
-            sent_relevance = R_s.sum(-1)
-
-            # Calculate E_s with options for log transformation and reversal
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                E_s = -sent_relevance if self.reverse else sent_relevance
-
-            sentenceSAR.append(E_s.mean())
-
-        return np.array(sentenceSAR)

From cfeff8232cd39c376acf99bd580ffffa77e20a1f Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 25 Nov 2024 16:42:42 +0400
Subject: [PATCH 32/97] Use renamed methods everywhere

---
 .../polygraph_eval_triviaqa_sentsar.yaml      | 56 +++++++++----------
 .../polygraph_eval_wmt14_fren_sentsar.yaml    | 56 +++++++++----------
 .../polygraph_eval_wmt19_deen_sentsar.yaml    | 56 +++++++++----------
 3 files changed, 78 insertions(+), 90 deletions(-)

diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index dcc0ffb30..74a06f08d 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -71,16 +71,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -91,20 +91,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -113,18 +109,18 @@ additional_estimators:
     class_name: Perplexity
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
+    class_name: PPLSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -133,18 +129,18 @@ additional_estimators:
     class_name: MeanTokenEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+    class_name: MTESAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: true
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 1c34e85e2..7c2213716 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -70,16 +70,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -90,20 +90,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -112,18 +108,18 @@ additional_estimators:
     class_name: Perplexity
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
+    class_name: PPLSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -132,18 +128,18 @@ additional_estimators:
     class_name: MeanTokenEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+    class_name: MTESAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: true
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index f6c4b1ada..f5d70927a 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -69,16 +69,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSentenceSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -89,20 +89,16 @@ additional_estimators:
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs:
-      t: 1
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -111,18 +107,18 @@ additional_estimators:
     class_name: Perplexity
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSentenceSAR
+    class_name: PPLSAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilPPLSAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
     kwargs: 
       use_log: false
       reverse: true
@@ -131,18 +127,18 @@ additional_estimators:
     class_name: MeanTokenEntropy
     kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESentenceSAR
+    class_name: MTESAR
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: false
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: DistilMTESAR
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
     kwargs: 
       use_log: false
       reverse: true

From 14dfb287de80c508f4dc704d3101f0b6c85030b8 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 27 Nov 2024 19:39:24 +0400
Subject: [PATCH 33/97] Fix CCP, add consistency between sampling and greedy,
 and add CCPGSU estimator

---
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  17 +++
 src/lm_polygraph/estimators/__init__.py       |   2 +-
 .../claim_conditioned_probability.py          |  25 +++-
 src/lm_polygraph/estimators/gsu.py            |  82 ++++++++++++++
 src/lm_polygraph/stat_calculators/__init__.py |   1 +
 src/lm_polygraph/stat_calculators/entropy.py  |   4 +-
 .../stat_calculators/greedy_probs.py          |   9 +-
 src/lm_polygraph/stat_calculators/sample.py   |  33 ++++--
 .../sample_alternatives_nli.py                | 107 ++++++++++++++++++
 src/lm_polygraph/utils/dataset.py             |   6 +
 src/lm_polygraph/utils/manager.py             |   6 +-
 .../utils/register_stat_calculators.py        |   3 +-
 12 files changed, 268 insertions(+), 27 deletions(-)
 create mode 100644 src/lm_polygraph/stat_calculators/sample_alternatives_nli.py

diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index b157ac671..084015edb 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -142,6 +142,23 @@ additional_estimators:
       use_log: false
       reverse: true
 
+  - module: lm_polygraph.estimators.claim_conditioned_probability
+    class_name: ClaimConditionedProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: CCPGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: CCPGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: CCPGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 1da22c401..ee51c6b77 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -71,7 +71,7 @@
     #DistilOneSentenceSAR,
 )
 from .sar import SAR
-from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU
+from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU, CCPGSU
 from .renyi_neg import RenyiNeg
 from .fisher_rao import FisherRao
 from .verbalized_1s import Verbalized1S
diff --git a/src/lm_polygraph/estimators/claim_conditioned_probability.py b/src/lm_polygraph/estimators/claim_conditioned_probability.py
index 5c7b63add..7e2a86d1e 100644
--- a/src/lm_polygraph/estimators/claim_conditioned_probability.py
+++ b/src/lm_polygraph/estimators/claim_conditioned_probability.py
@@ -20,7 +20,22 @@ def __str__(self):
         return "CCP"
 
     def _reduce(self, logprobs: list[float]):
-        return np.exp(np.sum(logprobs))
+        return np.sum(logprobs)
+
+    def _combine_nli(self, forward: str, backward: str):
+        """
+        Combines two NLI predictions NLI(x, y) and NLI(y, x) into a single prediction.
+
+        Identical predictions are kept, an "entail"/"contra" conflict becomes "neutral", and otherwise "entail" or "contra" is preferred over any other label (else "neutral").
+        """
+        if forward == backward:
+            return forward
+        if all(x in [forward, backward] for x in ["entail", "contra"]):
+            return "neutral"
+        for x in ["entail", "contra"]:
+            if x in [forward, backward]:
+                return x
+        return "neutral"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         words = stats["greedy_tokens"]
@@ -42,10 +57,14 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 contra_logprobs, contra_words = [], []
                 for i in range(len(word_alternatives)):
                     word_alt, logprob = word_alternatives[i]
-                    if i == 0 or word_alternatives_nli[0][i] == "entail":
+                    nli_outcome = self._combine_nli(
+                        word_alternatives_nli[0][i],
+                        word_alternatives_nli[i][0],
+                    )
+                    if i == 0 or nli_outcome == "entail":
                         entail_logprobs.append(logprob)
                         entail_words.append(word_alt)
-                    elif word_alternatives_nli[0][i] == "contra":
+                    elif nli_outcome == "contra":
                         contra_logprobs.append(logprob)
                         contra_words.append(word_alt)
                 entail_logprob = np.logaddexp.reduce(entail_logprobs)
diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py
index 8b192c699..b7ab5b6d9 100644
--- a/src/lm_polygraph/estimators/gsu.py
+++ b/src/lm_polygraph/estimators/gsu.py
@@ -4,6 +4,7 @@
 from copy import deepcopy
 
 from .estimator import Estimator
+from lm_polygraph.estimators.claim_conditioned_probability import ClaimConditionedProbability
 
 
 class MaxprobGSU(Estimator):
@@ -257,3 +258,84 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             GSU.append(E_s.mean())
 
         return np.array(GSU)
+
+
+class CCPGSU(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        use_log: bool = True,
+        reverse: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity",
+                          "sample_tokens",
+                          "sample_tokens_alternatives",
+                          "sample_tokens_alternatives_nli"], "sequence")
+        self.verbose = verbose
+        self.use_log = use_log
+        self.reverse = reverse
+
+    def __str__(self):
+        base = "CCPGSU"
+        if not self.use_log:
+            base += "_no_log"
+            if self.reverse:
+                base += "_reverse"
+        return base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates CCPGSU: per-sample ClaimConditionedProbability scores weighted by sentence similarity.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * sampled tokens, their alternatives and NLI classes in the 'sample_tokens*' entries,
+                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
+        Returns:
+            np.ndarray: float CCPGSU score for each input in the statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        batch_sample_tokens = stats["sample_tokens"]
+        batch_sample_tokens_alternatives = stats["sample_tokens_alternatives"]
+        batch_sample_tokens_alternatives_nli = stats["sample_tokens_alternatives_nli"]
+
+        GSU = []
+        for sample_sentence_similarity, \
+            samples_tokens, \
+            samples_tokens_alternatives, \
+            samples_tokens_alternatives_nli in zip(
+                batch_sample_sentence_similarity,
+                batch_sample_tokens,
+                batch_sample_tokens_alternatives,
+                batch_sample_tokens_alternatives_nli
+            ):
+            ccps = []
+            for sample_tokens, \
+                sample_tokens_alternatives, \
+                sample_tokens_alternatives_nli in zip(
+                    samples_tokens,
+                    samples_tokens_alternatives,
+                    samples_tokens_alternatives_nli
+                ):
+                ccp_stats = {
+                    "greedy_tokens": [sample_tokens],
+                    "greedy_tokens_alternatives": [sample_tokens_alternatives],
+                    "greedy_tokens_alternatives_nli": [sample_tokens_alternatives_nli]
+                }
+                ccps.append(ClaimConditionedProbability()(stats=ccp_stats)[0])
+
+            R_s = (
+                ccps
+                * sample_sentence_similarity
+            )
+            sent_relevance = R_s.sum(-1)
+
+            if self.use_log:
+                E_s = -np.log(sent_relevance)
+            else:
+                E_s = -sent_relevance if self.reverse else sent_relevance
+
+            GSU.append(E_s.mean())
+
+        return np.array(GSU)
diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index f69abd428..1a2ea5721 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -11,6 +11,7 @@
 from .entropy import EntropyCalculator
 from .entropy import SampleEntropyCalculator
 from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator
+from .sample_alternatives_nli import SampleAlternativesNLICalculator
 from .greedy_alternatives_nli import (
     GreedyAlternativesNLICalculator,
     GreedyAlternativesFactPrefNLICalculator,
diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py
index 0a4409797..f6ba90c4e 100644
--- a/src/lm_polygraph/stat_calculators/entropy.py
+++ b/src/lm_polygraph/stat_calculators/entropy.py
@@ -61,7 +61,7 @@ def __init__(
         top_k: int = None,
     ):
         self.top_k = top_k
-        super().__init__(["sample_entropy"], ["token_distributions"])
+        super().__init__(["sample_entropy"], ["sample_tokens_distributions"])
 
     def __call__(
         self,
@@ -71,7 +71,7 @@ def __call__(
         max_new_tokens: int = 100,
         **kwargs,
     ) -> Dict[str, np.ndarray]:
-        batch_distributions = dependencies["token_distributions"]
+        batch_distributions = dependencies["sample_tokens_distributions"]
         entropies = []
 
         for input_distributions in batch_distributions:
diff --git a/src/lm_polygraph/stat_calculators/greedy_probs.py b/src/lm_polygraph/stat_calculators/greedy_probs.py
index c94468fb5..5c746c1ab 100644
--- a/src/lm_polygraph/stat_calculators/greedy_probs.py
+++ b/src/lm_polygraph/stat_calculators/greedy_probs.py
@@ -134,14 +134,9 @@ def __call__(
                 seq = sequences[i, idx:].cpu()
             else:
                 seq = sequences[i, 1:].cpu()
-            length, text_length = len(seq), len(seq)
-            for j in range(len(seq)):
-                if seq[j] == model.tokenizer.eos_token_id:
-                    length = j + 1
-                    text_length = j
-                    break
+            length = len(seq)
             cut_sequences.append(seq[:length].tolist())
-            cut_texts.append(model.tokenizer.decode(seq[:text_length]))
+            cut_texts.append(model.tokenizer.decode(seq[:length], skip_special_tokens=True))
             cut_logits.append(logits[i, :length, :].cpu().numpy())
             cut_alternatives.append([[] for _ in range(length)])
             for j in range(length):
diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index a5f9dc3d2..2c74f7a1d 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -86,19 +86,21 @@ class SamplingGenerationCalculator(StatCalculator):
     * probabilities of the sampled tokens generation
     """
 
-    def __init__(self, samples_n: int = 10):
+    def __init__(self, samples_n: int = 10, n_alternatives: int = 10):
         """
         Parameters:
             samples_n (int): number of samples to generate per input text. Default: 10
         """
         self.samples_n = samples_n
+        self.n_alternatives = n_alternatives
         super().__init__(
             [
                 "sample_log_probs",
                 "sample_tokens",
                 "sample_texts",
                 "sample_log_likelihoods",
-                "token_distributions",
+                "sample_tokens_distributions",
+                "sample_tokens_alternatives",
             ],
             [],
         )
@@ -155,6 +157,7 @@ def __call__(
         texts = [[] for _ in range(len(texts))]
         log_likelihoods = [[] for _ in range(len(texts))]
         token_distributions = [[] for _ in range(len(texts))]
+        alternatives = [[] for _ in range(len(texts))]
 
 
         if model.model_type == "Seq2SeqLM":
@@ -167,25 +170,39 @@ def __call__(
                 if model.model_type == "CausalLM"
                 else 0
             )
-            for j in range(len(sequences[i]) - inp_size):
+            gen_size = len(sequences[i]) - inp_size
+            sample_alternatives = [[] for _ in range(gen_size)]
+            for j in range(gen_size):
                 cur_token = sequences[i][j + inp_size].item()
                 log_prob += logits[i][j][cur_token].item()
-                if cur_token == model.tokenizer.eos_token_id:
-                    break
                 ll.append(logits[i][j][cur_token].item())
                 toks.append(cur_token)
-                distributions.append(logits[i][j].cpu().numpy())
+
+                lt = logits[i][j].cpu().numpy()
+                distributions.append(lt)
+
+                best_tokens = np.argpartition(lt, -self.n_alternatives)
+                ln = len(best_tokens)
+                best_tokens = best_tokens[ln - self.n_alternatives : ln]
+                for t in best_tokens:
+                    sample_alternatives[j].append((t.item(), lt[t].item()))
+                sample_alternatives[j].sort(
+                    key=lambda x: x[0] == cur_token,
+                    reverse=True,
+                )
 
             log_likelihoods[int(i / self.samples_n)].append(ll)
             log_probs[int(i / self.samples_n)].append(log_prob)
             tokens[int(i / self.samples_n)].append(toks)
-            texts[int(i / self.samples_n)].append(model.tokenizer.decode(toks))
+            texts[int(i / self.samples_n)].append(model.tokenizer.decode(toks, skip_special_tokens=True))
             token_distributions[int(i / self.samples_n)].append(distributions)
+            alternatives[int(i / self.samples_n)].append(sample_alternatives)
 
         return {
             "sample_log_likelihoods": log_likelihoods,
             "sample_log_probs": log_probs,
             "sample_tokens": tokens,
             "sample_texts": texts,
-            "token_distributions": token_distributions,
+            "sample_tokens_distributions": token_distributions,
+            "sample_tokens_alternatives": alternatives,
         }
diff --git a/src/lm_polygraph/stat_calculators/sample_alternatives_nli.py b/src/lm_polygraph/stat_calculators/sample_alternatives_nli.py
new file mode 100644
index 000000000..1832278af
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/sample_alternatives_nli.py
@@ -0,0 +1,107 @@
+import numpy as np
+
+from typing import Dict, List, Tuple
+
+from .stat_calculator import StatCalculator
+from lm_polygraph.utils.model import WhiteboxModel
+from lm_polygraph.utils.deberta import Deberta
+from collections import defaultdict
+import torch.nn as nn
+import string
+
+
+def _eval_nli_model(nli_queue: List[Tuple[str, str]], deberta: Deberta) -> List[str]:
+    nli_set = list(set(nli_queue))
+
+    softmax = nn.Softmax(dim=1)
+    w_probs = defaultdict(lambda: defaultdict(lambda: None))
+    for k in range(0, len(nli_set), deberta.batch_size):
+        batch = nli_set[k : k + deberta.batch_size]
+        encoded = deberta.deberta_tokenizer.batch_encode_plus(
+            batch, padding=True, return_tensors="pt"
+        ).to(deberta.device)
+        logits = deberta.deberta(**encoded).logits
+        logits = logits.detach().to(deberta.device)
+        for (wi, wj), prob in zip(batch, softmax(logits).cpu().detach()):
+            w_probs[wi][wj] = prob
+
+    classes = []
+    for w1, w2 in nli_queue:
+        pr = w_probs[w1][w2]
+        id = pr.argmax()
+        ent_id = deberta.deberta.config.label2id["ENTAILMENT"]
+        contra_id = deberta.deberta.config.label2id["CONTRADICTION"]
+        if id == ent_id:
+            str_class = "entail"
+        elif id == contra_id:
+            str_class = "contra"
+        else:
+            str_class = "neutral"
+        classes.append(str_class)
+    return classes
+
+
+class SampleAlternativesNLICalculator(StatCalculator):
+    def __init__(self, nli_model):
+        super().__init__(
+            [
+                "sample_tokens_alternatives_nli",
+            ],
+            ["sample_tokens_alternatives"],
+        )
+
+        self.nli_model = nli_model
+
+    def _strip(self, w: str):
+        return w.strip(string.punctuation + " \n")
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+        **kwargs,
+    ) -> Dict[str, np.ndarray]:
+        batch_alternatives = dependencies["sample_tokens_alternatives"]
+        batch_alternatives_nli = []
+        for samples_alternatives in batch_alternatives:
+            sample_alternatives_nli = []
+            for sample_alternatives in samples_alternatives:
+                nli_matrixes = []
+                for w_number, word_alternatives in enumerate(sample_alternatives):
+                    nli_queue = []
+                    nli_matrix = [
+                        ["" for _ in range(len(word_alternatives))]
+                        for _ in range(len(word_alternatives))
+                    ]
+                    if len(word_alternatives) > 0 and not isinstance(
+                        word_alternatives[0][0],
+                        str,
+                    ):
+                        word_alternatives = [
+                            (model.tokenizer.decode([alt]), prob)
+                            for alt, prob in word_alternatives
+                        ]
+                    words = [self._strip(alt[0]) for alt in word_alternatives]
+                    for wi in words:
+                        nli_queue.append((words[0], wi))
+                        nli_queue.append((wi, words[0]))
+
+                    nli_classes = _eval_nli_model(nli_queue, self.nli_model)
+                    nli_class = defaultdict(lambda: None)
+                    for nli_cl, (w1, w2) in zip(nli_classes, nli_queue):
+                        nli_class[w1, w2] = nli_cl
+
+                    for i, wi in enumerate(words):
+                        for j, wj in enumerate(words):
+                            # Fill only row 0 and column 0: NLI is computed solely against the sampled token
+                            if i > 0 and j > 0:
+                                continue
+                            nli_matrix[i][j] = nli_class[wi, wj]
+
+                    nli_matrixes.append(nli_matrix)
+                sample_alternatives_nli.append(nli_matrixes)
+            batch_alternatives_nli.append(sample_alternatives_nli)
+
+        return {"sample_tokens_alternatives_nli": batch_alternatives_nli}
diff --git a/src/lm_polygraph/utils/dataset.py b/src/lm_polygraph/utils/dataset.py
index 05c79ea1c..49a1c29a4 100644
--- a/src/lm_polygraph/utils/dataset.py
+++ b/src/lm_polygraph/utils/dataset.py
@@ -184,6 +184,8 @@ def from_datasets(
         """
         dataset_name, dataset = Dataset.load_hf_dataset(dataset_path, split, **kwargs)
         few_shot_dataset = None
+        #no_few_shot_x = None
+
         if n_shot > 0:
             _, few_shot_dataset = Dataset.load_hf_dataset(
                 dataset_path, few_shot_split, **kwargs
@@ -417,7 +419,11 @@ def doc_to_text(doc, prompt, i=0):
         else:
             x = dataset[x_column]
             y = dataset[y_column]
+    
+        #if no_few_shot_x is None:
+        #    no_few_shot_x = x
 
+        #return Dataset(x, y, batch_size, no_few_shot_x)
         return Dataset(x, y, batch_size)
 
     @staticmethod
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index f62da1cba..36e882308 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -480,11 +480,7 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
 
             for key in self.save_stats:
                 if key in batch_stats.keys():
-                    try:
-                        self.stats[key] += list(batch_stats[key])
-                    except:
-                        breakpoint()
-                        pass
+                    self.stats[key] += list(batch_stats[key])
             for processor in self.processors:
                 processor.on_batch(batch_stats, batch_gen_metrics, batch_estimations)
 
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index b827b7702..2c7ccd8f8 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -66,13 +66,14 @@ def _register(calculator_class: StatCalculator):
         _register(EntropyCalculator(top_k=entropy_top_k))
         _register(SampleEntropyCalculator(top_k=entropy_top_k))
         _register(GreedyLMProbsCalculator())
-        _register(SamplingGenerationCalculator())
+        _register(SamplingGenerationCalculator(n_alternatives=n_ccp_alternatives))
         _register(BartScoreCalculator())
         _register(ModelScoreCalculator())
         _register(EmbeddingsCalculator())
         _register(EnsembleTokenLevelDataCalculator())
         _register(CrossEncoderSimilarityMatrixCalculator(nli_model=nli_model))
         _register(GreedyAlternativesNLICalculator(nli_model=nli_model))
+        _register(SampleAlternativesNLICalculator(nli_model=nli_model))
         _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model))
         _register(ClaimsExtractor(openai_chat=openai_chat, language=language))
         _register(

From b37936f41f4fd52afc4cd4c03232a9160528ccd7 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 28 Nov 2024 10:35:34 +0400
Subject: [PATCH 34/97] Fix sample entropy

---
 .../configs/polygraph_eval_xsum_sentsar.yaml    | 17 -----------------
 src/lm_polygraph/stat_calculators/entropy.py    | 16 +++++++++-------
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index 084015edb..b157ac671 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -142,23 +142,6 @@ additional_estimators:
       use_log: false
       reverse: true
 
-  - module: lm_polygraph.estimators.claim_conditioned_probability
-    class_name: ClaimConditionedProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: CCPGSU
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: CCPGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: CCPGSU
-    kwargs: 
-      use_log: false
-      reverse: true
-
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py
index f6ba90c4e..c5956992d 100644
--- a/src/lm_polygraph/stat_calculators/entropy.py
+++ b/src/lm_polygraph/stat_calculators/entropy.py
@@ -72,11 +72,12 @@ def __call__(
         **kwargs,
     ) -> Dict[str, np.ndarray]:
         batch_distributions = dependencies["sample_tokens_distributions"]
-        entropies = []
 
+        input_entropies = []
         for input_distributions in batch_distributions:
+            sample_entropies = []
             for sample_distributions in input_distributions:
-                sample_entropies = []
+                token_entropies = []
                 for token_dist in sample_distributions:
                     # Convert token_dist to a numpy array first, then to a torch tensor
                     token_dist_tensor = torch.tensor(token_dist)
@@ -86,10 +87,11 @@ def __call__(
 
                     # Calculate entropy using torch's Categorical distribution
                     entropy = torch.distributions.Categorical(logits=token_dist_tensor).entropy()
-                    sample_entropies.append(entropy.item()) 
+                    token_entropies.append(entropy.item()) 
 
-            # Calculate mean entropy for the sample
-            mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0
-            entropies.append(mean_entropy.item())
+                # Calculate mean entropy for the sample
+                sample_entropy = torch.mean(torch.tensor(token_entropies)) if token_entropies else 0
+                sample_entropies.append(sample_entropy.item())
+            input_entropies.append(sample_entropies)
 
-        return {"sample_entropy": entropies}
+        return {"sample_entropy": input_entropies}

From 22a3a70db17f5c214585ffeef2bf24ab510f9858 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 28 Nov 2024 11:20:22 +0400
Subject: [PATCH 35/97] Expand configs

---
 .../configs/polygraph_eval_coqa_sentsar.yaml  | 158 +++++++++++++++++
 .../polygraph_eval_gsm8k_sentsar_cot.yaml     | 162 ++++++++++++++++++
 .../configs/polygraph_eval_mmlu_sentsar.yaml  | 160 +++++++++++++++++
 .../polygraph_eval_triviaqa_sentsar.yaml      |   9 +-
 .../polygraph_eval_wmt14_enfr_sentsar.yaml    | 160 +++++++++++++++++
 .../polygraph_eval_wmt14_fren_sentsar.yaml    |   9 +-
 .../polygraph_eval_wmt19_deen_sentsar.yaml    |   9 +-
 .../polygraph_eval_wmt19_ende_sentsar.yaml    | 159 +++++++++++++++++
 .../configs/polygraph_eval_xsum_sentsar.yaml  |   9 +-
 9 files changed, 831 insertions(+), 4 deletions(-)
 create mode 100644 examples/configs/polygraph_eval_coqa_sentsar.yaml
 create mode 100644 examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
 create mode 100644 examples/configs/polygraph_eval_mmlu_sentsar.yaml
 create mode 100644 examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
 create mode 100644 examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
new file mode 100644
index 000000000..5c4c83673
--- /dev/null
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -0,0 +1,158 @@
+hydra:
+  run:
+    dir: ${cache_path}/coqa/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+task: qa
+
+dataset: coqa
+text_column: questions
+label_column: answers
+description: "The following are stories and questions about them. Each story is followed by a question and answer to a given question.\n\nStory: {story}"
+prompt: "Question: {question}\nAnswer:{answer}"
+train_split: train
+eval_split: validation
+max_new_tokens: 20
+load_from_disk: false
+normalize: true
+generation_params:
+  generate_until:
+    - "\n"
+save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
+  - sample_tokens
+  - sample_tokens_alternatives
+  - sample_texts
+  - sample_log_probs
+  - sample_log_likelihoods
+  - sample_sentence_similarity
+  - sample_entropy
+entropy_top_k: 50
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_seq_ue: false
+use_tok_ue: false
+use_ens_ue: false
+generation_metrics: null
+
+additional_estimators:
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs: {}
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
new file mode 100644
index 000000000..dfd7d072c
--- /dev/null
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -0,0 +1,162 @@
+hydra:
+  run:
+    dir: ${cache_path}/gsm8k_cot/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+task: qa
+
+dataset: [gsm8k, main]
+text_column: question
+label_column: answer
+prompt: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\nQ: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\nQ: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39.\n\nQ: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8.\n\nQ: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9.\n\nQ: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29.\n\nQ: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\nQ: Olivia has $23. She bought five bagels for $3 each. 
How much money does she have left?\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.\n\nQ: {question}\nA:"
+train_split: train
+few_shot_split: train
+eval_split: test
+max_new_tokens: 256
+load_from_disk: false
+n_shot: 0
+normalize: true
+generation_params:
+  generate_until:
+    - "\n"
+save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
+  - sample_tokens
+  - sample_tokens_alternatives
+  - sample_texts
+  - sample_log_probs
+  - sample_log_likelihoods
+  - sample_sentence_similarity
+  - sample_entropy
+entropy_top_k: 50
+
+target_ignore_regex: "(?s).*#### "
+output_ignore_regex: "(?s).*The answer is "
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_seq_ue: false
+use_tok_ue: false
+use_ens_ue: false
+generation_metrics: null
+
+additional_estimators:
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs: {}
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
new file mode 100644
index 000000000..6e81f65da
--- /dev/null
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -0,0 +1,160 @@
+hydra:
+  run:
+    dir: ${cache_path}/mmlu/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+task: qa
+
+dataset: [cais/mmlu, all]
+text_column: question
+label_column: answer
+description: "The following are multiple choice questions (with answers) about {subject}."
+prompt: "Q:{question}\nA. {choices[0]}\nB. {choices[1]}\nC. {choices[2]}\nD. {choices[3]}\nAnswer:{answer}"
+few_shot_split: dev
+train_split: validation
+eval_split: test
+max_new_tokens: 3
+load_from_disk: false
+n_shot: 5
+max_subject_size: 100
+generation_params:
+  generate_until:
+    - "\n"
+save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
+  - sample_tokens
+  - sample_tokens_alternatives
+  - sample_texts
+  - sample_log_probs
+  - sample_log_likelihoods
+  - sample_sentence_similarity
+  - sample_entropy
+entropy_top_k: 50
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_seq_ue: false
+use_tok_ue: false
+use_ens_ue: false
+generation_metrics: null
+
+additional_estimators:
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs: {}
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 74a06f08d..e4af4a50e 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -1,6 +1,6 @@
 hydra:
   run:
-    dir: ${cache_path}/${task}/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    dir: ${cache_path}/triviaqa/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 defaults:
   - model: bloomz-560m
@@ -27,10 +27,17 @@ generation_params:
   generate_until:
     - "\n"
 save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
   - sample_tokens
+  - sample_tokens_alternatives
   - sample_texts
   - sample_log_probs
+  - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_entropy
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
new file mode 100644
index 000000000..8efef7f56
--- /dev/null
+++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
@@ -0,0 +1,160 @@
+hydra:
+  run:
+    dir: ${cache_path}/wmt14_enfr/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+device: cpu
+
+task: nmt
+
+dataset: [wmt14, fr-en]
+text_column: en
+label_column: fr
+prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n"
+train_split: train
+eval_split: test
+max_new_tokens: 182
+load_from_disk: false
+generation_params:
+  generate_until:
+    - "\n"
+save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
+  - sample_tokens
+  - sample_tokens_alternatives
+  - sample_texts
+  - sample_log_probs
+  - sample_log_likelihoods
+  - sample_sentence_similarity
+  - sample_entropy
+entropy_top_k: 50
+
+source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_ens_ue: false
+use_seq_ue: false
+use_tok_ue: false
+generation_metrics: null
+
+additional_estimators:
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs: {}
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 7c2213716..0819ab00b 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -1,6 +1,6 @@
 hydra:
   run:
-    dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    dir: ${cache_path}/wmt14_fren/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 defaults:
   - model: bloomz-560m
@@ -25,10 +25,17 @@ generation_params:
   generate_until:
     - "\n"
 save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
   - sample_tokens
+  - sample_tokens_alternatives
   - sample_texts
   - sample_log_probs
+  - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_entropy
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index f5d70927a..86afb4328 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -1,6 +1,6 @@
 hydra:
   run:
-    dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    dir: ${cache_path}/wmt19_deen/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 defaults:
   - model: bloomz-560m
@@ -25,10 +25,17 @@ generation_params:
   generate_until:
     - "\n"
 save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
   - sample_tokens
+  - sample_tokens_alternatives
   - sample_texts
   - sample_log_probs
+  - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_entropy
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
new file mode 100644
index 000000000..7c23dd127
--- /dev/null
+++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
@@ -0,0 +1,159 @@
+hydra:
+  run:
+    dir: ${cache_path}/wmt19_ende/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+
+device: cpu
+
+task: nmt
+
+dataset: [wmt19, de-en]
+text_column: en
+label_column: de
+prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n"
+train_split: train
+eval_split: validation
+max_new_tokens: 200
+load_from_disk: false
+generation_params:
+  generate_until:
+    - "\n"
+save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
+  - sample_tokens
+  - sample_tokens_alternatives
+  - sample_texts
+  - sample_log_probs
+  - sample_log_likelihoods
+  - sample_sentence_similarity
+  - sample_entropy
+entropy_top_k: 50
+
+source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
+
+train_dataset: null
+train_test_split: false
+test_split_size: 1
+
+background_train_dataset: allenai/c4
+background_train_dataset_text_column: text
+background_train_dataset_label_column: url
+background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
+background_load_from_disk: false
+
+subsample_background_train_dataset: 1000
+subsample_train_dataset: 1000
+subsample_eval_dataset: -1
+
+use_density_based_ue: false
+use_ens_ue: false
+use_seq_ue: false
+use_tok_ue: false
+
+additional_estimators:
+  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
+    class_name: MonteCarloSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
+    class_name: MonteCarloNormalizedSequenceEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.semantic_entropy
+    class_name: SemanticEntropy
+    kwargs: {}
+
+  - module: lm_polygraph.estimators.max_probability
+    class_name: MaximumSequenceProbability
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: SentenceSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MaxprobGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_sar
+    class_name: TokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.sar
+    class_name: SAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: TokenSARGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.perplexity
+    class_name: Perplexity
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: PPLSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: PPLGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: MeanTokenEntropy
+    kwargs: {}
+  - module: lm_polygraph.estimators.sentence_sar
+    class_name: MTESAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: {}
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: false
+  - module: lm_polygraph.estimators.gsu
+    class_name: MTEGSU
+    kwargs: 
+      use_log: false
+      reverse: true
+
+ignore_exceptions: false
+
+batch_size: 1
+deberta_batch_size: 1
+
+seed:
+    - 1
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index b157ac671..ac69dc14c 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -1,6 +1,6 @@
 hydra:
   run:
-    dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    dir: ${cache_path}/xsum/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 defaults:
   - model: bloomz-560m
@@ -26,10 +26,17 @@ generation_params:
   generate_until:
     - "\n"
 save_stats:
+  - greedy_tokens
+  - greedy_log_likelihoods
+  - greedy_tokens_alternatives
+  - entropy
   - sample_tokens
+  - sample_tokens_alternatives
   - sample_texts
   - sample_log_probs
+  - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_entropy
 entropy_top_k: 50
 
 train_dataset: null

From 229502874518510a789c51b790d6b750e61954be Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 28 Nov 2024 12:43:00 +0400
Subject: [PATCH 36/97] Use only PRR

---
 scripts/polygraph_eval | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index ccf079f9e..5cb50e829 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -212,10 +212,10 @@ def main(args):
 
 def get_ue_metrics(args):
     ue_metrics = [
-        ReversedPairsProportion(),
+        #ReversedPairsProportion(),
         PredictionRejectionArea(),
         PredictionRejectionArea(max_rejection=0.5),
-        RiskCoverageCurveAUC(),
+        #RiskCoverageCurveAUC(),
     ]
     if getattr(args, "use_claim_ue", False):
         ue_metrics += [

From ef8258b97db73b6a3cfbad1a16899d349eed058c Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 4 Dec 2024 13:32:55 +0400
Subject: [PATCH 37/97] Simplify and unify GSU

---
 src/lm_polygraph/estimators/gsu.py | 104 +++++------------------------
 src/lm_polygraph/utils/manager.py  |  12 ++--
 2 files changed, 24 insertions(+), 92 deletions(-)

diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py
index b7ab5b6d9..8aae841d1 100644
--- a/src/lm_polygraph/estimators/gsu.py
+++ b/src/lm_polygraph/estimators/gsu.py
@@ -11,21 +11,12 @@ class MaxprobGSU(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
 
     def __str__(self):
-        base = "MaxprobGSU"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
+        return "MaxprobGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -46,17 +37,12 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_probs, sample_sentence_similarity in zip(
             batch_sample_log_probs, batch_sample_sentence_similarity
         ):
-            sample_probs = np.exp(np.array(sample_log_probs))
+            sample_probs = -np.exp(np.array(sample_log_probs))
             R_s = (
                 sample_probs
                 * sample_sentence_similarity
             )
-            sent_relevance = R_s.sum(-1)
-
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                E_s = -sent_relevance if self.reverse else sent_relevance
+            E_s = R_s.sum(-1)
 
             GSU.append(E_s.mean())
 
@@ -66,22 +52,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 class PPLGSU(Estimator):
     def __init__(
         self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
+        verbose: bool = False
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
 
     def __str__(self):
-        base = "PPLGSU"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
+        return "PPLGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -102,18 +79,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_likelihoods, sample_sentence_similarity in zip(
             batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
-            ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+            ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
             R_s = (
                 ppl
                 * sample_sentence_similarity
             )
-            sent_relevance = R_s.sum(-1)
-
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                E_s = -sent_relevance if self.reverse else sent_relevance
+            E_s = R_s.sum(-1)
 
             GSU.append(E_s.mean())
 
@@ -123,10 +95,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 class TokenSARGSU(Estimator):
     def __init__(
         self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
-    ):
+        verbose: bool = False):
         super().__init__(
             [
                 "sample_sentence_similarity",
@@ -136,16 +105,9 @@ def __init__(
             "sequence",
         )
         self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
 
     def __str__(self):
-        base = "TokenSARGSU"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
+        return "TokenSARGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -185,17 +147,12 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 tokenSAR.append(E_t.sum())
 
             tokenSAR = np.array(tokenSAR)
-            probs_token_sar = np.exp(-tokenSAR)
+            probs_token_sar = -np.exp(-tokenSAR)
             R_s = (
                 probs_token_sar
                 * sample_sentence_similarity
             )
-            sent_relevance = R_s.sum(-1)
-            E_s = -np.log(sent_relevance)
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                E_s = -sent_relevance if self.reverse else sent_relevance
+            E_s = R_s.sum(-1)
 
             GSU.append(E_s.mean())
 
@@ -205,22 +162,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 class MTEGSU(Estimator):
     def __init__(
         self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
+        verbose: bool = False
     ):
         super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
         self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
 
     def __str__(self):
-        base = "MTEGSU"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
+        return "MTEGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -247,13 +195,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             R_s = sample_entropy * sample_sentence_similarity
             
             # Compute sentence relevance by summing along the last axis
-            sent_relevance = R_s.sum(-1)
-
-            # Calculate E_s with options for log transformation and reversal
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                E_s = -sent_relevance if self.reverse else sent_relevance
+            E_s = R_s.sum(-1)
 
             GSU.append(E_s.mean())
 
@@ -263,25 +205,16 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 class CCPGSU(Estimator):
     def __init__(
         self,
-        verbose: bool = False,
-        use_log: bool = True,
-        reverse: bool = False
+        verbose: bool = False
     ):
         super().__init__(["sample_sentence_similarity",
                           "sample_tokens",
                           "sample_tokens_alternatives",
                           "sample_tokens_alternatives_nli"], "sequence")
         self.verbose = verbose
-        self.use_log = use_log
-        self.reverse = reverse
 
     def __str__(self):
-        base = "CCPGSU"
-        if not self.use_log:
-            base += "_no_log"
-            if self.reverse:
-                base += "_reverse"
-        return base
+        return "CCPGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -331,11 +264,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             )
             sent_relevance = R_s.sum(-1)
 
-            if self.use_log:
-                E_s = -np.log(sent_relevance)
-            else:
-                E_s = -sent_relevance if self.reverse else sent_relevance
-
             GSU.append(E_s.mean())
 
         return np.array(GSU)
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 36e882308..b21d7d9aa 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -514,7 +514,15 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
 
             torch.cuda.empty_cache()
             gc.collect()
+            
+        self.eval_ue()
 
+        for processor in self.processors:
+            processor.on_eval(self.metrics, self.total_bad_estimators)
+
+        return self.metrics
+
+    def eval_ue(self):
         for (e_level, e_name), estimator_values in self.estimations.items():
             for (gen_level, gen_name), generation_metric in self.gen_metrics.items():
                 for ue_metric in self.ue_metrics:
@@ -541,10 +549,6 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
                             e_level, e_name, gen_name, str(ue_metric) + "_normalized"
                         ] = normalize_metric(ue_metric_val, oracle_score, random_score)
 
-        for processor in self.processors:
-            processor.on_eval(self.metrics, self.total_bad_estimators)
-
-        return self.metrics
 
     def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> dict:
         """

From a79620e415f2fea413317e5a1e64e8a787580b94 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 4 Dec 2024 13:39:48 +0400
Subject: [PATCH 38/97] Add missing stats to yamls

---
 examples/configs/polygraph_eval_coqa_sentsar.yaml       | 2 ++
 examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml  | 2 ++
 examples/configs/polygraph_eval_mmlu_sentsar.yaml       | 2 ++
 examples/configs/polygraph_eval_triviaqa_sentsar.yaml   | 2 ++
 examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_xsum_sentsar.yaml       | 2 ++
 9 files changed, 18 insertions(+)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 5c4c83673..4c2e28160 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -28,6 +28,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -35,6 +36,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index dfd7d072c..3db79b51b 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -29,6 +29,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -36,6 +37,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 6e81f65da..713f536cb 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -30,6 +30,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -37,6 +38,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index e4af4a50e..94be9806b 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -30,6 +30,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -37,6 +38,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
index 8efef7f56..d86071a11 100644
--- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
@@ -28,6 +28,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -35,6 +36,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 0819ab00b..47dbdc3f3 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -28,6 +28,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -35,6 +36,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 86afb4328..cc19da68f 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -28,6 +28,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -35,6 +36,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
index 7c23dd127..4b0099411 100644
--- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
@@ -28,6 +28,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -35,6 +36,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index ac69dc14c..f308fbefb 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -29,6 +29,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - token_similarity
   - entropy
   - sample_tokens
   - sample_tokens_alternatives
@@ -36,6 +37,7 @@ save_stats:
   - sample_log_probs
   - sample_log_likelihoods
   - sample_sentence_similarity
+  - sample_token_similarity
   - sample_entropy
 entropy_top_k: 50
 

From 8ecc1d56891e81bbba3353c5666d1b9d4c6fd0f4 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 4 Dec 2024 15:10:08 +0400
Subject: [PATCH 39/97] Add tqdm to ce similarity

---
 .../stat_calculators/cross_encoder_similarity.py           | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py b/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py
index d6faa20bb..ada5c9136 100644
--- a/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py
+++ b/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py
@@ -2,6 +2,7 @@
 
 import itertools
 from typing import Dict, List
+from tqdm import tqdm
 
 from .stat_calculator import StatCalculator
 from sentence_transformers import CrossEncoder
@@ -67,7 +68,7 @@ def __call__(
             batch_counts.append(len(unique_texts))
 
         batch_token_scores = []
-        for input_texts, tokens in zip(batch_input_texts, batch_greedy_tokens):
+        for input_texts, tokens in tqdm(zip(batch_input_texts, batch_greedy_tokens)):
             if len(tokens) > 1:
                 is_special_tokens = np.isin(tokens, special_tokens)
                 cropped_tokens = list(itertools.combinations(tokens, len(tokens) - 1))[
@@ -96,7 +97,7 @@ def __call__(
             batch_token_scores.append(token_scores)
 
         sim_matrices = []
-        for i, pairs in enumerate(batch_pairs):
+        for i, pairs in tqdm(enumerate(batch_pairs)):
             sim_scores = self.crossencoder.predict(pairs, batch_size=deberta_batch_size)
             unique_mat_shape = (batch_counts[i], batch_counts[i])
 
@@ -109,7 +110,7 @@ def __call__(
         sim_matrices = np.stack(sim_matrices)
 
         batch_samples_token_scores = []
-        for sample_tokens, input_texts in zip(batch_sample_tokens, batch_input_texts):
+        for sample_tokens, input_texts in tqdm(zip(batch_sample_tokens, batch_input_texts)):
             samples_token_scores = []
             for tokens in sample_tokens:
                 if len(tokens) > 1:

From 6f32205044511de147fc49952c2330b7beff2f68 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 4 Dec 2024 16:06:12 +0400
Subject: [PATCH 40/97] Add model config for llama

---
 examples/configs/model/llama.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 examples/configs/model/llama.yaml

diff --git a/examples/configs/model/llama.yaml b/examples/configs/model/llama.yaml
new file mode 100644
index 000000000..0d1870443
--- /dev/null
+++ b/examples/configs/model/llama.yaml
@@ -0,0 +1,11 @@
+defaults:
+  - default
+
+path: meta-llama/Meta-Llama-3.1-8B
+type: CausalLM
+path_to_load_script: model/default_causal.py
+
+load_model_args:
+  device_map: balanced_low_0
+  dtype: bfloat16
+load_tokenizer_args: {}

From 457f94bcd39c3e532750a99e3e79c9fb1c417fb4 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 5 Dec 2024 13:06:39 +0400
Subject: [PATCH 41/97] Add dtype to load args

---
 examples/configs/model/default_causal.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/configs/model/default_causal.py b/examples/configs/model/default_causal.py
index 956a71dde..c77a317fc 100644
--- a/examples/configs/model/default_causal.py
+++ b/examples/configs/model/default_causal.py
@@ -1,9 +1,10 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 
-
-def load_model(model_path: str, device_map: str):
+def load_model(model_path: str, device_map: str, dtype: str = "float32"):
+    dtype = getattr(torch, dtype)
     model = AutoModelForCausalLM.from_pretrained(
-        model_path, trust_remote_code=True, device_map=device_map
+        model_path, trust_remote_code=True, device_map=device_map, torch_dtype=dtype
     )
     model.eval()
 

From 1e1a3c86d28961c5f9746a3264052a8cb51d259a Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 5 Dec 2024 20:55:57 +0400
Subject: [PATCH 42/97] Remove redundant methods

---
 .../configs/polygraph_eval_coqa_sentsar.yaml  | 39 ------------------
 .../polygraph_eval_gsm8k_sentsar_cot.yaml     | 40 -------------------
 .../configs/polygraph_eval_mmlu_sentsar.yaml  | 40 -------------------
 .../polygraph_eval_triviaqa_sentsar.yaml      | 40 -------------------
 .../polygraph_eval_wmt14_enfr_sentsar.yaml    | 39 ------------------
 .../polygraph_eval_wmt14_fren_sentsar.yaml    | 40 -------------------
 .../polygraph_eval_wmt19_deen_sentsar.yaml    | 40 -------------------
 .../polygraph_eval_wmt19_ende_sentsar.yaml    | 40 -------------------
 .../configs/polygraph_eval_xsum_sentsar.yaml  | 40 -------------------
 9 files changed, 358 deletions(-)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 4c2e28160..9e75ed4af 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -80,16 +80,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -101,15 +91,5 @@ additional_estimators:
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -120,16 +100,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -140,16 +110,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index 3db79b51b..b4eea7dcd 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -84,16 +84,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -104,16 +94,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -124,16 +104,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -144,16 +114,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 713f536cb..755da5c74 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -82,16 +82,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -102,16 +92,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -122,16 +102,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -142,16 +112,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 94be9806b..1b40fb1b9 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -83,16 +83,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -103,16 +93,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -123,16 +103,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -143,16 +113,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
index d86071a11..b3505e7cc 100644
--- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
@@ -82,16 +82,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -103,15 +93,5 @@ additional_estimators:
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -122,16 +102,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -142,16 +112,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 47dbdc3f3..b471f29f8 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -82,16 +82,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -102,16 +92,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -122,16 +102,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -142,16 +112,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index cc19da68f..c9658242b 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -81,16 +81,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -101,16 +91,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -121,16 +101,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -141,16 +111,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
index 4b0099411..afd8f28f3 100644
--- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
@@ -81,16 +81,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -101,16 +91,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -121,16 +101,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -141,16 +111,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index f308fbefb..af828f29c 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -80,16 +80,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MaxprobGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
@@ -100,16 +90,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
@@ -120,16 +100,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: PPLGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
@@ -140,16 +110,6 @@ additional_estimators:
   - module: lm_polygraph.estimators.gsu
     class_name: MTEGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: false
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: 
-      use_log: false
-      reverse: true
 
 ignore_exceptions: false
 

From 41a0849bb031791ee68cca87f83f2e15991b688f Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 10 Dec 2024 12:56:50 +0400
Subject: [PATCH 43/97] Add possibility of continuing estimation from saved
 manager, sampled versions of single-seq methods

---
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  28 +++
 scripts/polygraph_eval                        | 117 ++++++++---
 src/lm_polygraph/estimators/__init__.py       |   1 +
 src/lm_polygraph/estimators/average_ue.py     | 129 ++++++++++++
 .../estimators/max_probability.py             |  28 +++
 src/lm_polygraph/estimators/perplexity.py     |  12 ++
 src/lm_polygraph/estimators/token_entropy.py  |  28 +++
 src/lm_polygraph/estimators/token_sar.py      |  53 +++++
 .../cross_encoder_similarity.py               |   1 +
 src/lm_polygraph/utils/manager.py             | 185 ++++++++----------
 src/lm_polygraph/utils/processor.py           |   2 +-
 11 files changed, 457 insertions(+), 127 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/average_ue.py

diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index af828f29c..a04c9c672 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -13,6 +13,9 @@ device: cpu
 
 task: ats
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: xsum
 text_column: document
 label_column: summary
@@ -74,6 +77,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -84,6 +90,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -94,6 +103,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -104,6 +116,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -111,6 +126,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 5cb50e829..2944cf21e 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -183,27 +183,50 @@ def main(args):
         generation_metrics = get_generation_metrics(args)
 
         ue_metrics = get_ue_metrics(args)
-
-        man = UEManager(
-            dataset,
-            model,
-            estimators,
-            generation_metrics,
-            ue_metrics,
-            [
-                Logger(),
-            ],
-            deberta_batch_size=getattr(args, 'deberta_batch_size', 10),
-            train_data=train_dataset,
-            ignore_exceptions=args.ignore_exceptions,
-            background_train_data=background_train_dataset,
-            max_new_tokens=args.max_new_tokens,
-            ensemble_model=ensemble_model,
-            cache_path=args.cache_path,
-            language=getattr(args, 'language', 'en'),
-            save_stats=getattr(args, 'save_stats', []),
-            entropy_top_k=getattr(args, 'entropy_top_k', None),
-        )
+        
+        if getattr(args, "base_manager", None) is None:
+            man = UEManager(
+                dataset,
+                model,
+                estimators,
+                generation_metrics,
+                ue_metrics,
+                [
+                    Logger(),
+                ],
+                batch_size=args.batch_size,
+                deberta_batch_size=getattr(args, 'deberta_batch_size', 10),
+                train_data=train_dataset,
+                ignore_exceptions=args.ignore_exceptions,
+                background_train_data=background_train_dataset,
+                max_new_tokens=args.max_new_tokens,
+                ensemble_model=ensemble_model,
+                cache_path=args.cache_path,
+                language=getattr(args, 'language', 'en'),
+                save_stats=getattr(args, 'save_stats', []),
+                entropy_top_k=getattr(args, 'entropy_top_k', None),
+            )
+        else:
+            man = UEManager.load(
+                args.base_manager,
+                data = dataset,
+                model = model,
+                estimators = estimators,
+                generation_metrics = generation_metrics,
+                ue_metrics = ue_metrics,
+                processors = [Logger()],
+                batch_size=args.batch_size,
+                deberta_batch_size=getattr(args, 'deberta_batch_size', 10),
+                train_data=train_dataset,
+                ignore_exceptions=args.ignore_exceptions,
+                background_train_data=background_train_dataset,
+                max_new_tokens=args.max_new_tokens,
+                ensemble_model=ensemble_model,
+                cache_path=args.cache_path,
+                language=getattr(args, 'language', 'en'),
+                save_stats=getattr(args, 'save_stats', []),
+                entropy_top_k=getattr(args, 'entropy_top_k', None),
+            )
 
         man()
 
@@ -267,10 +290,18 @@ def get_density_based_ue_methods(args, model_type):
 
 
 def get_ue_methods(args, model):
+    if getattr(args, "base_manager", None) is not None:
+        base_manager = UEManager.load(args.base_manager)
+        existing_estimators = list(base_manager.estimations.keys())
+    else:
+        existing_estimators = []
+
+    overwrite = getattr(args, "overwrite_base_estimations", False)
+
     estimators = []
     if getattr(args.model, "type", "Whitebox") == "Blackbox":
         if getattr(args, "use_seq_ue", False):
-            estimators += [
+            bb_estimators = [
                 LexicalSimilarity(metric="rouge1"),
                 LexicalSimilarity(metric="rouge2"),
                 LexicalSimilarity(metric="rougeL"),
@@ -287,6 +318,10 @@ def get_ue_methods(args, model):
                 Eccentricity(similarity_score="Jaccard_score"),
             ]
 
+            for estimator in bb_estimators:
+                if overwrite or ('sequence', str(estimator)) not in existing_estimators:
+                    estimators.append(estimator)
+
         if getattr(args, "use_ens_ue", False):
             raise NotImplementedError('Ensemble UE methods not applicable for blackbox models')
 
@@ -297,7 +332,7 @@ def get_ue_methods(args, model):
             raise NotImplementedError('Claim UE methods not applicable for blackbox models')
     else:
         if getattr(args, "use_seq_ue", False):
-            estimators += [
+            wb_estimators = [
                 MaximumSequenceProbability(),
                 Perplexity(),
                 MeanTokenEntropy(),
@@ -330,6 +365,10 @@ def get_ue_methods(args, model):
                 FisherRao(),
             ]
 
+            for estimator in wb_estimators:
+                if overwrite or ('sequence', str(estimator)) not in existing_estimators:
+                    estimators.append(estimator)
+
         if getattr(args, "use_ens_ue", False):
             # Ensemble-based UE methods have been disabled due to dependency on old
             # transformers code, which prevents bumping transformers version in 
@@ -350,7 +389,7 @@ def get_ue_methods(args, model):
             #estimators += (token_measures + sequence_measures)
 
         if getattr(args, "use_tok_ue", False):
-            estimators += [
+            tok_estimators = [
                 MaximumTokenProbability(),
                 TokenEntropy(),
                 PointwiseMutualInformation(),
@@ -358,8 +397,12 @@ def get_ue_methods(args, model):
                 SemanticEntropyToken(model.model_path, args.cache_path),
             ]
 
+            for estimator in tok_estimators:
+                if overwrite or ('token', str(estimator)) not in existing_estimators:
+                    estimators.append(estimator)
+
         if getattr(args, "use_claim_ue", False):
-            estimators += [
+            claim_estimators = [
                 MaximumClaimProbability(),
                 PerplexityClaim(),
                 MaxTokenEntropyClaim(),
@@ -369,12 +412,19 @@ def get_ue_methods(args, model):
                 ClaimConditionedProbabilityClaim(nli_context="fact_pref"),
             ]
 
+            for estimator in claim_estimators:
+                if overwrite or ('claim', str(estimator)) not in existing_estimators:
+                    estimators.append(estimator)
+
     additional_estimators = getattr(args, "additional_estimators", {})
 
     for estimator_args in additional_estimators:
         module = importlib.import_module(estimator_args.module)
         estimator_class = getattr(module, estimator_args.class_name)
-        estimators.append(estimator_class(**estimator_args.kwargs))
+        estimator = estimator_class(**estimator_args.kwargs)
+        # Additional estimator filtering only works correctly for sequence-level estimators
+        if overwrite or ('sequence', str(estimator)) not in existing_estimators:
+            estimators.append(estimator_class(**estimator_args.kwargs))
 
     return estimators
 
@@ -383,6 +433,12 @@ def get_generation_metrics(args):
     log.info("="*100)
     log.info("Initializing generation metrics...")
 
+    if getattr(args, "base_manager", None) is not None:
+        base_manager = UEManager.load(args.base_manager)
+        existing_metrics = list(base_manager.gen_metrics.keys())
+    else:
+        existing_metrics = []
+
     generation_metrics = getattr(args, "generation_metrics", None)
     if not generation_metrics:
         result = [
@@ -415,6 +471,15 @@ def get_generation_metrics(args):
             metric_args = metric.get("args", [])
             result.append(metric_class(*metric_args))
 
+    # Filter out metrics that are already present in the base manager
+    filtered_result = []
+    for metric in result:
+        if (metric.level, str(metric)) in existing_metrics:
+            log.warning(f"Skipping metric {metric} as it is already present in the base manager.")
+        else:
+            filtered_result.append(metric)
+    result = filtered_result
+
     process_output_fn = getattr(args, "process_output_fn", None)
     process_target_fn = getattr(args, "process_target_fn", None)
     if process_target_fn or process_output_fn:
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index ee51c6b77..5f75d7c74 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -79,3 +79,4 @@
 from .linguistic_1s import Linguistic1S
 from .label_prob import LabelProb
 from .p_true_empirical import PTrueEmpirical
+from .average_ue import AveMaxprob
diff --git a/src/lm_polygraph/estimators/average_ue.py b/src/lm_polygraph/estimators/average_ue.py
new file mode 100644
index 000000000..b03147a1f
--- /dev/null
+++ b/src/lm_polygraph/estimators/average_ue.py
@@ -0,0 +1,129 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+
+
+class AveMaxprob(Estimator):  # Sequence-level estimator: per input, the mean of -exp(sample_log_probs) over its entries.
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")  # presumably (stats dependencies, level) — confirm against Estimator
+        self.verbose = verbose  # stored only; not read anywhere in this class
+
+    def __str__(self):
+        return "AveMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:  # one output value per zipped (log_probs, similarity) pair
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):  # NOTE(review): sample_sentence_similarity is unpacked but never used in the loop body
+            sample_probs = -np.exp(np.array(sample_log_probs))  # negated exp of every entry
+
+            ave.append(sample_probs.mean())
+
+        return np.array(ave)
+
+class AvePPL(Estimator):  # Sequence-level estimator: per input, the mean over samples of -exp(mean token log-likelihood).
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")  # presumably (stats dependencies, level) — confirm against Estimator
+        self.verbose = verbose  # stored only; not read anywhere in this class
+
+    def __str__(self):
+        return "AvePPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:  # one output value per zipped (log_likelihoods, similarity) pair
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        ):  # NOTE(review): sample_sentence_similarity is unpacked but never used in the loop body
+            ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])  # per sample: negated exp of the mean over its entries
+
+            ave.append(ppl.mean())
+
+        return np.array(ave)
+
+class AveTokenSAR(Estimator):  # Sequence-level estimator: per input, the mean over samples of -exp(-tokenSAR).
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(
+            [
+                "sample_sentence_similarity",
+                "sample_log_likelihoods",
+                "sample_token_similarity",
+            ],
+            "sequence",
+        )
+        self.verbose = verbose  # stored only; not read anywhere in this class
+
+    def __str__(self):
+        return "AveTokenSAR"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:  # one output value per zipped input
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_token_similarity = stats["sample_token_similarity"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for batch_data in zip(
+            batch_sample_log_likelihoods,
+            batch_sample_token_similarity,
+            batch_sample_sentence_similarity,
+        ):
+            sample_log_likelihoods = batch_data[0]
+            sample_token_similarity = batch_data[1]
+            sample_sentence_similarity = batch_data[2]  # NOTE(review): assigned but not used below
+
+            tokenSAR = []
+            for log_likelihoods, token_similarity in zip(
+                sample_log_likelihoods, sample_token_similarity
+            ):
+                log_likelihoods = np.array(log_likelihoods)
+                R_t = 1 - token_similarity  # token weight: one minus its similarity score (assumes an array-like supporting .sum())
+                R_t_norm = R_t / R_t.sum()  # weights rescaled to sum to 1; NOTE(review): no guard for R_t.sum() == 0
+                E_t = -log_likelihoods * R_t_norm  # weighted negative log-likelihood per token
+                tokenSAR.append(E_t.sum())
+
+            tokenSAR = np.array(tokenSAR)
+            probs_token_sar = -np.exp(-tokenSAR)  # negated exp(-tokenSAR) for each sample
+            ave.append(probs_token_sar.mean())
+
+        return np.array(ave)
+
+class AveMTE(Estimator):  # Sequence-level estimator: per input, the mean of its 'sample_entropy' values.
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")  # presumably (stats dependencies, level) — confirm against Estimator
+        self.verbose = verbose  # stored only; not read anywhere in this class
+
+    def __str__(self):
+        return "AveMTE"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:  # one output value per zipped (entropy, similarity) pair
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):  # NOTE(review): sample_sentence_similarity is unpacked but never used in the loop body
+            ave.append(np.mean(sample_entropy))
+
+        return np.array(ave)
diff --git a/src/lm_polygraph/estimators/max_probability.py b/src/lm_polygraph/estimators/max_probability.py
index 1d93b5e3c..b8fe2afde 100644
--- a/src/lm_polygraph/estimators/max_probability.py
+++ b/src/lm_polygraph/estimators/max_probability.py
@@ -33,6 +33,34 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         log_likelihoods = stats["greedy_log_likelihoods"]
         return np.array([-np.sum(log_likelihood) for log_likelihood in log_likelihoods])
 
+class SampledMaximumSequenceProbability(Estimator):
+    """
+    Estimates the sequence-level uncertainty of a language model by taking the
+    log-probability of the first sampled generation with minus sign.
+    The value is read from the first entry of 'sample_log_probs' for each input.
+    Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel).
+    """
+
+    def __init__(self):
+        super().__init__(["sample_log_probs"], "sequence")
+
+    def __str__(self):
+        return "SampledMaximumSequenceProbability"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the minus log-probability of the first sample for each input in the statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * per-input sequences of sampled-generation log-probabilities in 'sample_log_probs'
+        Returns:
+            np.ndarray: minus the first entry of 'sample_log_probs' for each input.
+                Higher values indicate more uncertain samples.
+        """
+        mp = [lp[0] for lp in stats["sample_log_probs"]]  # keep only the first sampled entry per input
+
+        return -np.array(mp)
 
 class MaximumTokenProbability(Estimator):
     """
diff --git a/src/lm_polygraph/estimators/perplexity.py b/src/lm_polygraph/estimators/perplexity.py
index 42ea2beef..d8c8e22b7 100644
--- a/src/lm_polygraph/estimators/perplexity.py
+++ b/src/lm_polygraph/estimators/perplexity.py
@@ -15,3 +15,15 @@ def __str__(self):
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         log_likelihoods = stats["greedy_log_likelihoods"]
         return np.array([-np.mean(ll) for ll in log_likelihoods])
+
+class SampledPerplexity(Estimator):  # Sequence-level estimator: negated mean of the first sample's log-likelihood entries, per input.
+    def __init__(self):
+        super().__init__(["sample_log_likelihoods"], "sequence")  # presumably (stats dependencies, level) — confirm against Estimator
+
+    def __str__(self):
+        return "SampledPerplexity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:  # one output value per entry of stats["sample_log_likelihoods"]
+        log_likelihoods = stats["sample_log_likelihoods"]
+        ppl = [np.mean(sample_log_likelihoods[0]) for sample_log_likelihoods in log_likelihoods]  # index 0: only the first sample of each input is used
+        return -np.array(ppl)  # sign flipped before returning
diff --git a/src/lm_polygraph/estimators/token_entropy.py b/src/lm_polygraph/estimators/token_entropy.py
index fc87cc77c..9e1d080dd 100644
--- a/src/lm_polygraph/estimators/token_entropy.py
+++ b/src/lm_polygraph/estimators/token_entropy.py
@@ -33,6 +33,34 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array([np.mean(e) for e in entropy])
 
 
+class SampledMeanTokenEntropy(Estimator):
+    """
+    Estimates the sequence-level uncertainty of a language model by returning the
+    first entry of 'sample_entropy' for each input (presumably the first sample's mean token entropy).
+    Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel).
+    """
+
+    def __init__(self):
+        super().__init__(["sample_entropy"], "sequence")
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Returns the first sampled entropy value for each input in the statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * per-input sequences of entropy values in 'sample_entropy'
+        Returns:
+            np.ndarray: the first entry of 'sample_entropy' for each input.
+                Higher values indicate more uncertain samples.
+        """
+        entropy = stats["sample_entropy"]
+        return np.array([e[0] for e in entropy])  # index 0: only the first sample of each input is used
+
+
 class TokenEntropy(Estimator):
     """
     Estimates the token-level uncertainty of a language model by calculating the
diff --git a/src/lm_polygraph/estimators/token_sar.py b/src/lm_polygraph/estimators/token_sar.py
index abb982e75..1a3e715c6 100644
--- a/src/lm_polygraph/estimators/token_sar.py
+++ b/src/lm_polygraph/estimators/token_sar.py
@@ -47,3 +47,56 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             tokenSAR.append(E_t.sum())
 
         return np.array(tokenSAR)
+
+
+class SampledTokenSAR(Estimator):
+    """
+    Estimates the sequence-level uncertainty of a language model following the method of
+    "Token SAR" as provided in the paper https://arxiv.org/abs/2307.01379.
+    Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel).
+
+    This method calculates the weighted sum of negative log_likelihoods, with weights computed using token relevance, and keeps the first sample's value.
+    """
+
+    def __init__(self, verbose: bool = False):
+        super().__init__(["sample_token_similarity", "sample_log_likelihoods"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SampledTokenSAR"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the tokenSAR of the first sampled generation for each input in the statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * per-sample token log-likelihoods log p(y_i | y_<i, x) in 'sample_log_likelihoods'
+                * per-sample token similarity scores in 'sample_token_similarity',
+        Returns:
+            np.ndarray: float tokenSAR of the first sample for each input in the statistics.
+                Higher values indicate more uncertain samples.
+        """
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_token_similarity = stats["sample_token_similarity"]
+
+        result = []
+        for batch_data in zip(
+            batch_sample_log_likelihoods,
+            batch_sample_token_similarity,
+        ):
+            sample_log_likelihoods = batch_data[0]
+            sample_token_similarity = batch_data[1]
+
+            tokenSAR = []
+            for log_likelihoods, token_similarity in zip(
+                sample_log_likelihoods, sample_token_similarity
+            ):
+                log_likelihoods = np.array(log_likelihoods)
+                R_t = 1 - token_similarity  # token weight: one minus its similarity score
+                R_t_norm = R_t / R_t.sum()  # weights rescaled to sum to 1; NOTE(review): no guard for R_t.sum() == 0
+                E_t = -log_likelihoods * R_t_norm  # weighted negative log-likelihood per token
+                tokenSAR.append(E_t.sum())
+            result.append(tokenSAR[0])  # every sample is scored above, but only the first score is kept
+
+        return np.array(result)
diff --git a/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py b/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py
index ada5c9136..76a2241e2 100644
--- a/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py
+++ b/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py
@@ -94,6 +94,7 @@ def __call__(
                 token_scores[is_special_tokens] = 1
             else:
                 token_scores = np.array([0.5] * len(tokens))
+
             batch_token_scores.append(token_scores)
 
         sim_matrices = []
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index b21d7d9aa..545ed1421 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -245,6 +245,7 @@ def __init__(
         generation_metrics: List[GenerationMetric],
         ue_metrics: List[UEMetric],
         processors: List[Processor],
+        batch_size: int = 1,
         train_data: Dataset = None,
         background_train_data: Dataset = None,
         ignore_exceptions: bool = True,
@@ -281,22 +282,12 @@ def __init__(
             max_new_tokens (int): Maximum new tokens to use in generation. Default: 100.
         """
 
-        stat_calculators_dict, stat_dependencies_dict = register_stat_calculators(
-            deberta_batch_size=deberta_batch_size,
-            deberta_device=deberta_device,
-            language=language,
-            cache_path=cache_path,
-            model=model,
-            entropy_top_k=entropy_top_k,
-        )
-
-        self.stat_calculators_dict = stat_calculators_dict
-
         self.model: Model = model
         self.train_data: Dataset = train_data
         self.background_train_data: Dataset = background_train_data
         self.ensemble_model = ensemble_model
         self.data: Dataset = data
+        self.batch_size: int = batch_size
         self.estimators: List[Estimator] = estimators
         self.generation_metrics: List[GenerationMetric] = generation_metrics
         self.ue_metrics: List[UEMetric] = ue_metrics
@@ -304,15 +295,51 @@ def __init__(
         _check_unique_names(estimators)
         _check_unique_names(ue_metrics)
 
+        self.gen_metrics: Dict[Tuple[str, str], List[float]] = defaultdict(list)
+        self.estimations: Dict[Tuple[str, str], List[float]] = defaultdict(list)
+        self.metrics: Dict[Tuple[str, str, str, str], float] = {}
+        self.total_bad_estimators: Dict[Estimator, float] = {}
+        self.stats: Dict[str, List] = defaultdict(list)
+        self.save_stats = list(set(['greedy_texts', 'greedy_tokens']).union(set(save_stats)))
+
+        self.processors = processors
+        self.ignore_exceptions = ignore_exceptions
+        self.verbose = verbose
+        self.max_new_tokens = max_new_tokens
+        self.background_train_dataset_max_new_tokens = (
+            background_train_dataset_max_new_tokens
+        )
+        self.cache_path = cache_path
+        self.entropy_top_k = entropy_top_k
+        self.deberta_batch_size = deberta_batch_size
+        self.deberta_device = deberta_device
+        self.language = language
+
+
+    def prepare_calculators(self):
+        stat_calculators_dict, stat_dependencies_dict = register_stat_calculators(
+            deberta_batch_size=self.deberta_batch_size,
+            deberta_device=self.deberta_device,
+            language=self.language,
+            cache_path=self.cache_path,
+            model=self.model,
+            entropy_top_k=self.entropy_top_k,
+        )
+
+        self.stat_calculators_dict = stat_calculators_dict
+
         greedy = ["greedy_texts"]
         if not isinstance(self.model, BlackboxModel):
             greedy += ["greedy_tokens"]
 
         stats = (
             [s for e in self.estimators for s in e.stats_dependencies]
-            + [s for m in generation_metrics for s in m.stats_dependencies]
+            + [s for m in self.generation_metrics for s in m.stats_dependencies]
             + greedy
         )
+        
+        # Only calculate stats that are not already calculated
+        stats = list(set(stats) - set(self.stats))
 
         stats, have_stats = _order_calculators(
             stats,
@@ -324,31 +351,17 @@ def __init__(
         stats = [
             s
             for s in stats
-            if not (str(s).startswith("ensemble_"))
-            and not (
-                (
+            if not (
                     str(s).startswith("blackbox_")
                     and s[len("blackbox_") :] in have_stats
-                )  # remove blackbox_X from stats only if X is already in stats to remove duplicated run of stat calculator
-            )
+                   )  # remove blackbox_X from stats only if X is already in stats to remove duplicated run of stat calculator
         ]  # below in calculate() we copy X in blackbox_X
         self.stat_calculators: List[StatCalculator] = [
             stat_calculators_dict[c] for c in stats
         ]
-        if verbose:
+        if self.verbose:
             print("Stat calculators:", self.stat_calculators)
 
-        self.ensemble_estimators = []
-        single_estimators = []
-        for e in estimators:
-            for s in e.stats_dependencies:
-                if s.startswith("ensemble"):
-                    self.ensemble_estimators.append(e)
-                    break
-            if e not in self.ensemble_estimators:
-                single_estimators.append(e)
-        self.estimators = single_estimators
-
         train_stats = [
             s
             for e in self.estimators
@@ -360,6 +373,9 @@ def __init__(
             if "train_greedy_log_likelihoods" in train_stats
             else []
         )
+
+        train_stats = list(set(train_stats) - set(self.stats))
+
         train_stats, _ = _order_calculators(
             train_stats,
             stat_calculators_dict,
@@ -368,12 +384,16 @@ def __init__(
         self.train_stat_calculators: List[StatCalculator] = [
             stat_calculators_dict[c] for c in train_stats
         ]
+
         background_train_stats = [
             s
             for e in self.estimators
             for s in e.stats_dependencies
             if s.startswith("background_train")
         ]
+
+        background_train_stats = list(set(background_train_stats) - set(self.stats))
+
         background_train_stats, _ = _order_calculators(
             background_train_stats,
             stat_calculators_dict,
@@ -383,35 +403,26 @@ def __init__(
             stat_calculators_dict[c] for c in background_train_stats
         ]
 
-        ensemble_stats = [
-            s
-            for e in self.ensemble_estimators
-            for s in e.stats_dependencies
-            if s.startswith("ensemble")
-        ]
-        ensemble_stats, _ = _order_calculators(
-            ensemble_stats,
-            stat_calculators_dict,
-            stat_dependencies_dict,
-        )
-        self.ensemble_stat_calculators: List[StatCalculator] = [
-            stat_calculators_dict[c] for c in ensemble_stats
-        ]
+    def initiate_batch_stats(self, batch_i, inp_texts, target_texts):  # Build the initial stats dict for batch number batch_i.
+        batch_stats: Dict[str, np.ndarray] = {}
+        
+        for key, val in self.stats.items():  # reuse every stat already stored on the manager
+            # Get corresponding batch from existing stats
+            val_batch = val[batch_i * self.batch_size : (batch_i + 1) * self.batch_size]  # assumes stored stats follow data order with a fixed batch size — TODO confirm
+            batch_stats[key] = val_batch
+
+        for key, val in [
+            ("input_texts", inp_texts),
+            ("target_texts", target_texts),
+        ]:  # texts are recorded only when self.stats did not already supply them
+            if key not in batch_stats:
+                self.stats[key] += val
+                batch_stats[key] = val
 
-        self.gen_metrics: Dict[Tuple[str, str], List[float]] = defaultdict(list)
-        self.estimations: Dict[Tuple[str, str], List[float]] = defaultdict(list)
-        self.metrics: Dict[Tuple[str, str, str, str], float] = {}
-        self.total_bad_estimators: Dict[Estimator, float] = {}
-        self.stats: Dict[str, List] = defaultdict(list)
-        self.save_stats = list(set(['greedy_texts', 'greedy_tokens']).union(set(save_stats)))
+        batch_stats["model"] = self.model  # the model object travels in the stats dict under the "model" key
+
+        return batch_stats
 
-        self.processors = processors
-        self.ignore_exceptions = ignore_exceptions
-        self.verbose = verbose
-        self.max_new_tokens = max_new_tokens
-        self.background_train_dataset_max_new_tokens = (
-            background_train_dataset_max_new_tokens
-        )
 
     def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
         """
@@ -431,22 +442,13 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
                 - generation metrics name,
                 - `ue_metrics` name which was used to calculate quality.
         """
-
+        self.prepare_calculators()
         train_stats = self._extract_train_embeddings()
         background_train_stats = self._extract_train_embeddings(background=True)
 
         iterable_data = tqdm(self.data) if self.verbose else self.data
         for batch_i, (inp_texts, target_texts) in enumerate(iterable_data):
-            batch_stats: Dict[str, np.ndarray] = {}
-            for key, val in [
-                ("input_texts", inp_texts),
-                ("target_texts", target_texts),
-            ]:
-                self.stats[key] += val
-                batch_stats[key] = val
-            batch_stats["model"] = self.model
-
-            batch_stats["model"] = self.model
+            batch_stats = self.initiate_batch_stats(batch_i, inp_texts, target_texts)
 
             train_stats_keys = list(train_stats.keys())
             for stat in train_stats_keys:
@@ -455,8 +457,10 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
             background_train_stats_keys = list(background_train_stats.keys())
             for stat in background_train_stats_keys:
                 batch_stats[stat] = background_train_stats.pop(stat)
-
+            
+            old_stats = set(batch_stats.keys())
             batch_stats = self.calculate(batch_stats, self.stat_calculators, inp_texts)
+            new_stats = set(batch_stats.keys()) - old_stats
 
             batch_estimations, bad_estimators = self.estimate(
                 batch_stats, self.estimators
@@ -479,42 +483,14 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
                 batch_gen_metrics[generation_metric.level, str(generation_metric)] += m
 
             for key in self.save_stats:
-                if key in batch_stats.keys():
+                if key in new_stats:
                     self.stats[key] += list(batch_stats[key])
             for processor in self.processors:
                 processor.on_batch(batch_stats, batch_gen_metrics, batch_estimations)
 
-        if self.ensemble_model is not None:
-            iterable_data = tqdm(self.data) if self.verbose else self.data
-            for batch_i, (inp_texts, target_texts) in enumerate(iterable_data):
-                batch_stats: Dict[str, np.ndarray] = {}
-                for key, val in [
-                    ("input_texts", inp_texts),
-                    ("target_texts", target_texts),
-                    ("model", self.model),
-                ]:
-                    batch_stats[key] = val
-
-                batch_stats["ensemble_generation_params"] = {}
-                batch_stats["ensemble_model"] = self.ensemble_model
-
-                batch_stats = self.calculate(
-                    batch_stats, self.ensemble_stat_calculators, inp_texts
-                )
-
-                batch_estimations, bad_estimators = self.estimate(
-                    batch_stats, self.ensemble_estimators
-                )
-
-                for bad_estimator in bad_estimators:
-                    key = (bad_estimator.level, str(bad_estimator))
-                    self.estimations.pop(key, None)
-                    self.ensemble_estimators.remove(bad_estimator)
-                    self.total_bad_estimators[bad_estimator] = batch_i
-
             torch.cuda.empty_cache()
             gc.collect()
-            
+
         self.eval_ue()
 
         for processor in self.processors:
@@ -703,7 +679,7 @@ def save(self, save_path: str):
         )
 
     @staticmethod
-    def load(load_path: str) -> "UEManager":
+    def load(load_path: str, **kwargs) -> "UEManager":
         """
         Loads UEManager from the specified path. To save the calculated manager results, see UEManager.save().
 
@@ -711,7 +687,16 @@ def load(load_path: str) -> "UEManager":
             load_path (str): Path to file with saved benchmark results to load.
         """
         res_dict = torch.load(load_path)
-        man = UEManager(None, None, [], [], [], [])
+        default_kwargs = {
+            "data": None,
+            "model": None,
+            "estimators": [],
+            "generation_metrics": [],
+            "ue_metrics": [],
+            "processors": [],
+        }
+        default_kwargs.update(kwargs)
+        man = UEManager(**default_kwargs)
         man.metrics = res_dict.get("metrics", None)
         man.gen_metrics = res_dict.get("gen_metrics", None)
         man.estimations = res_dict.get("estimations", None)
diff --git a/src/lm_polygraph/utils/processor.py b/src/lm_polygraph/utils/processor.py
index 424df271c..49908dea9 100644
--- a/src/lm_polygraph/utils/processor.py
+++ b/src/lm_polygraph/utils/processor.py
@@ -61,7 +61,7 @@ def on_batch(
         for key, val in batch_stats.items():
             str_repr = str(val)
             # to skip large outputs
-            if len(str_repr) < 10000 and str_repr.count("\n") < 10:
+            if len(str_repr) < 10000 and str_repr.count("\n") < 20:
                 print(f"{key}: {val}")
                 print()
         print("-" * 100)

From f570400f780d2ebcd8c7f0dec661da0dedea9fdd Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 11 Dec 2024 14:58:27 +0400
Subject: [PATCH 44/97] Add quality metrics based off first sample generated

---
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  2 +-
 scripts/polygraph_eval                        | 18 ++++++++++---
 .../generation_metrics/accuracy.py            | 17 +++++++++---
 .../generation_metrics/alignscore.py          | 19 +++++++++++---
 src/lm_polygraph/generation_metrics/bleu.py   | 17 +++++++++---
 src/lm_polygraph/generation_metrics/comet.py  | 18 ++++++++++---
 src/lm_polygraph/generation_metrics/rouge.py  | 17 +++++++++---
 src/lm_polygraph/stat_calculators/__init__.py |  2 +-
 src/lm_polygraph/stat_calculators/sample.py   | 26 +++++++++++++++++++
 .../utils/register_stat_calculators.py        |  1 +
 10 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index a04c9c672..4c4ea0daf 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -13,7 +13,7 @@ device: cpu
 
 task: ats
 
-base_manager: null
+base_manager: /Users/romanvashurin/workspace/sar_enhancements/gsu/mistral7b_xsum.man
 overwrite_base_estimations: false
 
 dataset: xsum
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 2944cf21e..ba60b5577 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -446,8 +446,6 @@ def get_generation_metrics(args):
             RougeMetric("rouge2"),
             RougeMetric("rougeL"),
             BLEUMetric(),
-            BertScoreMetric('rh'),
-            SbertMetric(),
             AccuracyMetric(
                 target_ignore_regex = getattr(args, "target_ignore_regex", None),
                 output_ignore_regex = getattr(args, "output_ignore_regex", None),
@@ -456,13 +454,27 @@ def get_generation_metrics(args):
             AlignScore(),
             AlignScore(target_is_claims=False),
             AlignScore(ignore_target=True),
+            RougeMetric("rouge1", sample=True),
+            RougeMetric("rouge2", sample=True),
+            RougeMetric("rougeL", sample=True),
+            BLEUMetric(sample=True),
+            AccuracyMetric(
+                target_ignore_regex = getattr(args, "target_ignore_regex", None),
+                output_ignore_regex = getattr(args, "output_ignore_regex", None),
+                normalize = getattr(args, "normalize", False),
+                sample=True,
+            ),
+            AlignScore(sample=True),
+            AlignScore(target_is_claims=False, sample=True),
+            AlignScore(ignore_target=True, sample=True),
         ]
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
             if getattr(args, "use_claim_ue", False):
                 result += [OpenAIFactCheck(cache_path=args.cache_path, language=getattr(args, "language", "en"))]
         if args.task == "nmt":
             ignore_regex = getattr(args, "source_ignore_regex", None)
-            result += [Comet(source_ignore_regex = ignore_regex)]
+            result += [Comet(source_ignore_regex = ignore_regex),
+                       Comet(source_ignore_regex = ignore_regex, sample=True)]
     else:
         result = []
         for metric in generation_metrics:
diff --git a/src/lm_polygraph/generation_metrics/accuracy.py b/src/lm_polygraph/generation_metrics/accuracy.py
index 5c2478efb..a71c1b989 100644
--- a/src/lm_polygraph/generation_metrics/accuracy.py
+++ b/src/lm_polygraph/generation_metrics/accuracy.py
@@ -16,9 +16,13 @@ class AccuracyMetric(GenerationMetric):
     """
 
     def __init__(
-        self, target_ignore_regex=None, output_ignore_regex=None, normalize=False
+        self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False
     ):
-        super().__init__(["greedy_texts"], "sequence")
+        if sample:
+            super().__init__(["first_sample_texts"], "sequence")
+        else:
+            super().__init__(["greedy_texts"], "sequence")
+        self.sample = sample
         self.target_ignore_regex = (
             re.compile(target_ignore_regex) if target_ignore_regex else None
         )
@@ -33,6 +37,8 @@ def __init__(
             )
 
     def __str__(self):
+        if self.sample:
+            return "SampleAccuracy"
         return "Accuracy"
 
     def _score_single(self, output: str, target: str) -> int:
@@ -66,11 +72,14 @@ def __call__(
         Returns:
             np.ndarray: list of accuracies: 1 if generated text is equal to ground-truth and 0 otherwise.
         """
-        greedy_texts = stats["greedy_texts"]
+        if self.sample:
+            gen_texts = stats["first_sample_texts"]
+        else:
+            gen_texts = stats["greedy_texts"]
 
         result = []
 
-        for hyp, ref in zip(greedy_texts, target_texts):
+        for hyp, ref in zip(gen_texts, target_texts):
             ref = self._filter_text(ref, self.target_ignore_regex)
             hyp = self._filter_text(hyp, self.output_ignore_regex)
 
diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py
index 139b558e1..b47a080a3 100644
--- a/src/lm_polygraph/generation_metrics/alignscore.py
+++ b/src/lm_polygraph/generation_metrics/alignscore.py
@@ -19,8 +19,13 @@ def __init__(
         batch_size=16,
         target_is_claims=True,
         ignore_target=False,
+        sample: bool = False,
     ):
-        super().__init__(["greedy_texts", "input_texts"], "sequence")
+        if sample:
+            super().__init__(["first_sample_texts", "input_texts"], "sequence")
+        else:
+            super().__init__(["greedy_texts", "input_texts"], "sequence")
+        self.sample = sample
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.target_is_claims = target_is_claims
         self.batch_size = batch_size
@@ -41,6 +46,10 @@ def __str__(self):
             base += "OutputTarget"
         else:
             base += "TargetOutput"
+
+        if self.sample:
+            return f"Sample{base}"
+
         return base
 
     def __call__(
@@ -59,11 +68,15 @@ def __call__(
         Returns:
             np.ndarray: list of AlignScore Scores for each sample in input.
         """
-        greedy_texts = stats["greedy_texts"]
+        if self.sample:
+            gen_texts = stats["first_sample_texts"]
+        else:
+            gen_texts = stats["greedy_texts"]
+
         input_texts = stats["input_texts"]
 
         filtered_targets = [x if len(x.strip()) else "(empty)" for x in target_texts]
-        filtered_outputs = [x if len(x.strip()) else "(empty)" for x in greedy_texts]
+        filtered_outputs = [x if len(x.strip()) else "(empty)" for x in gen_texts]
         filtered_inputs = [x if len(x.strip()) else "(empty)" for x in input_texts]
 
         if self.ignore_target:
diff --git a/src/lm_polygraph/generation_metrics/bleu.py b/src/lm_polygraph/generation_metrics/bleu.py
index 641047495..dd9b19ae7 100644
--- a/src/lm_polygraph/generation_metrics/bleu.py
+++ b/src/lm_polygraph/generation_metrics/bleu.py
@@ -10,11 +10,17 @@ class BLEUMetric(GenerationMetric):
     Calculates BLEU metric between model-generated texts and ground truth texts.
     """
 
-    def __init__(self):
-        super().__init__(["greedy_texts"], "sequence")
+    def __init__(self, sample: bool = False):
+        if sample:
+            super().__init__(["first_sample_texts"], "sequence")
+        else:
+            super().__init__(["greedy_texts"], "sequence")
+        self.sample = sample
         self.scorer = BLEU(effective_order=True, lowercase=True)
 
     def __str__(self):
+        if self.sample:
+            return "SampleBLEU"
         return "BLEU"
 
     def _score_single(self, t1: str, t2: str):
@@ -37,9 +43,14 @@ def __call__(
         Returns:
             np.ndarray: list of BLEU Scores for each sample in input.
         """
+        if self.sample:
+            gen_texts = stats["first_sample_texts"]
+        else:
+            gen_texts = stats["greedy_texts"]
+
         return np.array(
             [
                 self._score_single(hyp, ref)
-                for hyp, ref in zip(stats["greedy_texts"], target_texts)
+                for hyp, ref in zip(gen_texts, target_texts)
             ]
         )
diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py
index 0fcd9b3e2..4d02b37a6 100644
--- a/src/lm_polygraph/generation_metrics/comet.py
+++ b/src/lm_polygraph/generation_metrics/comet.py
@@ -12,14 +12,20 @@ class Comet(GenerationMetric):
     between model-generated texts and ground truth texts.
     """
 
-    def __init__(self, source_ignore_regex=None, lang="en"):
-        super().__init__(["greedy_texts", "input_texts"], "sequence")
+    def __init__(self, source_ignore_regex=None, lang="en", sample: bool = False):
+        if sample:
+            super().__init__(["first_sample_texts", "input_texts"], "sequence")
+        else:
+            super().__init__(["greedy_texts", "input_texts"], "sequence")
+        self.sample = sample
         self.scorer = load("comet")
         self.source_ignore_regex = (
             re.compile(source_ignore_regex) if source_ignore_regex else None
         )
 
     def __str__(self):
+        if self.sample:
+            return "SampleComet"
         return "Comet"
 
     def _filter_text(self, text: str, ignore_regex: re.Pattern) -> str:
@@ -54,9 +60,15 @@ def __call__(
             self._filter_text(src, self.source_ignore_regex)
             for src in stats["input_texts"]
         ]
+
+        if self.sample:
+            gen_texts = stats["first_sample_texts"]
+        else:
+            gen_texts = stats["greedy_texts"]
+
         scores = np.array(
             self.scorer.compute(
-                predictions=stats["greedy_texts"],
+                predictions=gen_texts,
                 references=target_texts,
                 sources=sources,
             )["scores"]
diff --git a/src/lm_polygraph/generation_metrics/rouge.py b/src/lm_polygraph/generation_metrics/rouge.py
index e4f96a18d..86ac231e3 100644
--- a/src/lm_polygraph/generation_metrics/rouge.py
+++ b/src/lm_polygraph/generation_metrics/rouge.py
@@ -15,7 +15,7 @@ class RougeMetric(GenerationMetric):
     Calculates Rouge metric between model-generated texts and ground truth texts.
     """
 
-    def __init__(self, rouge_name):
+    def __init__(self, rouge_name, sample: bool = False):
         """
         Parameters:
             rouge_name (str): rouge metric type. Possible values:
@@ -23,11 +23,17 @@ def __init__(self, rouge_name):
                 * rouge2
                 * rougeL
         """
-        super().__init__(["greedy_texts"], "sequence")
+        if sample:
+            super().__init__(["first_sample_texts"], "sequence")
+        else:
+            super().__init__(["greedy_texts"], "sequence")
+        self.sample = sample
         self.rouge_name = rouge_name
         self.scorer = rouge_scorer.RougeScorer([rouge_name], use_stemmer=True)
 
     def __str__(self):
+        if self.sample:
+            return f"SampleRouge_{self.rouge_name}"
         return f"Rouge_{self.rouge_name}"
 
     def _score_single(self, t1: str, t2: str):
@@ -52,9 +58,14 @@ def __call__(
         Returns:
             np.ndarray: list of Rouge Scores for each sample in input.
         """
+        if self.sample:
+            gen_texts = stats["first_sample_texts"]
+        else:
+            gen_texts = stats["greedy_texts"]
+
         return np.array(
             [
                 self._score_single(hyp, ref)
-                for hyp, ref in zip(stats["greedy_texts"], target_texts)
+                for hyp, ref in zip(gen_texts, target_texts)
             ]
         )
diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index 1a2ea5721..29844b507 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -10,7 +10,7 @@
 )
 from .entropy import EntropyCalculator
 from .entropy import SampleEntropyCalculator
-from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator
+from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator, FirstSampleCalculator
 from .sample_alternatives_nli import SampleAlternativesNLICalculator
 from .greedy_alternatives_nli import (
     GreedyAlternativesNLICalculator,
diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index 2c74f7a1d..d00f4f6f5 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -206,3 +206,29 @@ def __call__(
             "sample_tokens_distributions": token_distributions,
             "sample_tokens_alternatives": alternatives,
         }
+
+class FirstSampleCalculator(StatCalculator):  # exposes the first element of each 'sample_texts' entry as 'first_sample_texts'
+    def __init__(self):
+        super().__init__(
+            [  # statistic returned by __call__
+                "first_sample_texts",
+            ],
+            [  # statistic read from `dependencies` in __call__
+                "sample_texts",
+            ]
+        )
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],  # not read by this calculator
+        model: WhiteboxModel,  # not read by this calculator
+        max_new_tokens: int = 100,  # not read by this calculator
+    ) -> Dict[str, np.ndarray]:
+        sample_texts = dependencies["sample_texts"]
+        first_sample_texts = [st[0] for st in sample_texts]  # NOTE(review): assumes every entry holds at least one sample - confirm
+
+        return {
+            "first_sample_texts": first_sample_texts,
+        }
+
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index 2c7ccd8f8..7c82caf80 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -67,6 +67,7 @@ def _register(calculator_class: StatCalculator):
         _register(SampleEntropyCalculator(top_k=entropy_top_k))
         _register(GreedyLMProbsCalculator())
         _register(SamplingGenerationCalculator(n_alternatives=n_ccp_alternatives))
+        _register(FirstSampleCalculator())
         _register(BartScoreCalculator())
         _register(ModelScoreCalculator())
         _register(EmbeddingsCalculator())

From 31b38e24caa676af55f941c7f5bb8be1902929af Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Wed, 11 Dec 2024 16:19:36 +0400
Subject: [PATCH 45/97] MaxSampledMaximumSequenceProbability

---
 src/lm_polygraph/estimators/__init__.py       |  1 +
 .../estimators/max_probability.py             | 31 +++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 5f75d7c74..b903b3cc3 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -8,6 +8,7 @@
 from .max_probability import (
     MaximumSequenceProbability,
     MaximumTokenProbability,
+    MaxSampledMaximumSequenceProbability,
 )
 from .claim_conditioned_probability import ClaimConditionedProbability
 from .token_entropy import MeanTokenEntropy, TokenEntropy
diff --git a/src/lm_polygraph/estimators/max_probability.py b/src/lm_polygraph/estimators/max_probability.py
index b8fe2afde..cbfc8ed32 100644
--- a/src/lm_polygraph/estimators/max_probability.py
+++ b/src/lm_polygraph/estimators/max_probability.py
@@ -61,6 +61,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         mp = [lp[0] for lp in stats["sample_log_probs"]]
 
         return -np.array(mp)
+    
+
+class MaxSampledMaximumSequenceProbability(Estimator):
+    """
+    Estimates the sequence-level uncertainty of a language model as the negated
+    maximum of the values stored in 'sample_log_probs' for each input.
+    Each value is presumably the log-probability of one sampled generation (to be confirmed).
+    Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel).
+    """
+
+    def __init__(self):
+        super().__init__(["sample_log_probs"], "sequence")
+
+    def __str__(self):
+        return "MaxSampledMaximumSequenceProbability"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Returns, for each input, the negated maximum of its entries in 'sample_log_probs'.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which must include:
+                * one collection of values per input in 'sample_log_probs'
+        Returns:
+            np.ndarray: negated maximum of 'sample_log_probs' for each input.
+                Higher values indicate more uncertain samples.
+        """
+        mp = [max(lp) for lp in stats["sample_log_probs"]]
+
+        return -np.array(mp)
+
 
 class MaximumTokenProbability(Estimator):
     """

From fca871de98f6264ca4c9308d5c8eac8e659565db Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 11 Dec 2024 16:35:20 +0400
Subject: [PATCH 46/97] Add average sample ue baseline, base manager params to
 all sentsar configs

---
 .../configs/polygraph_eval_coqa_sentsar.yaml  | 29 ++++++++++++++++++-
 .../polygraph_eval_gsm8k_sentsar_cot.yaml     | 28 ++++++++++++++++++
 .../configs/polygraph_eval_mmlu_sentsar.yaml  | 28 ++++++++++++++++++
 .../polygraph_eval_triviaqa_sentsar.yaml      | 28 ++++++++++++++++++
 .../polygraph_eval_wmt14_enfr_sentsar.yaml    | 29 ++++++++++++++++++-
 .../polygraph_eval_wmt14_fren_sentsar.yaml    | 28 ++++++++++++++++++
 .../polygraph_eval_wmt19_deen_sentsar.yaml    | 28 ++++++++++++++++++
 .../polygraph_eval_wmt19_ende_sentsar.yaml    | 28 ++++++++++++++++++
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  2 +-
 9 files changed, 225 insertions(+), 3 deletions(-)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 9e75ed4af..7af151dc6 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}'
 
 task: qa
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: coqa
 text_column: questions
 label_column: answers
@@ -74,6 +77,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -84,17 +90,22 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -105,6 +116,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -112,6 +126,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index b4eea7dcd..d13ccfc0f 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}'
 
 task: qa
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: [gsm8k, main]
 text_column: question
 label_column: answer
@@ -78,6 +81,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -88,6 +94,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -98,6 +107,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -108,6 +120,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -115,6 +130,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 755da5c74..7162070b0 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}'
 
 task: qa
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: [cais/mmlu, all]
 text_column: question
 label_column: answer
@@ -76,6 +79,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -86,6 +92,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -96,6 +105,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -106,6 +118,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -113,6 +128,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 1b40fb1b9..f9fa19928 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}'
 
 task: qa
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: [trivia_qa, rc.nocontext]
 text_column: question
 label_column: answer
@@ -77,6 +80,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -87,6 +93,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -97,6 +106,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -107,6 +119,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -114,6 +129,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
index b3505e7cc..38f283f9b 100644
--- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
@@ -13,6 +13,9 @@ device: cpu
 
 task: nmt
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: [wmt14, fr-en]
 text_column: en
 label_column: fr
@@ -76,6 +79,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -86,17 +92,22 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
   - module: lm_polygraph.estimators.gsu
     class_name: TokenSARGSU
     kwargs: {}
-  - module: lm_polygraph.estimators.gsu
 
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -107,6 +118,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -114,6 +128,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index b471f29f8..1dbed406e 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -13,6 +13,9 @@ device: cpu
 
 task: nmt
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: [wmt14, fr-en]
 text_column: fr
 label_column: en
@@ -76,6 +79,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -86,6 +92,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -96,6 +105,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -106,6 +118,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -113,6 +128,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index c9658242b..b0e766163 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -13,6 +13,9 @@ device: cpu
 
 task: nmt
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: [wmt19, de-en]
 text_column: de
 label_column: en
@@ -75,6 +78,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -85,6 +91,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -95,6 +104,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -105,6 +117,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -112,6 +127,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
index afd8f28f3..b52328b10 100644
--- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
@@ -13,6 +13,9 @@ device: cpu
 
 task: nmt
 
+base_manager: null
+overwrite_base_estimations: false
+
 dataset: [wmt19, de-en]
 text_column: en
 label_column: de
@@ -75,6 +78,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.max_probability
     class_name: MaximumSequenceProbability
     kwargs: {}
+  - module: lm_polygraph.estimators.max_probability
+    class_name: SampledMaximumSequenceProbability
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: SentenceSAR
     kwargs: {}
@@ -85,6 +91,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_sar
     class_name: TokenSAR
     kwargs: {}
+  - module: lm_polygraph.estimators.token_sar
+    class_name: SampledTokenSAR
+    kwargs: {}
   - module: lm_polygraph.estimators.sar
     class_name: SAR
     kwargs: {}
@@ -95,6 +104,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.perplexity
     class_name: Perplexity
     kwargs: {}
+  - module: lm_polygraph.estimators.perplexity
+    class_name: SampledPerplexity
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: PPLSAR
     kwargs: {}
@@ -105,6 +117,9 @@ additional_estimators:
   - module: lm_polygraph.estimators.token_entropy
     class_name: MeanTokenEntropy
     kwargs: {}
+  - module: lm_polygraph.estimators.token_entropy
+    class_name: SampledMeanTokenEntropy
+    kwargs: {}
   - module: lm_polygraph.estimators.sentence_sar
     class_name: MTESAR
     kwargs: {}
@@ -112,6 +127,19 @@ additional_estimators:
     class_name: MTEGSU
     kwargs: {}
 
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMaxprob
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AvePPL
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveTokenSAR
+    kwargs: {}
+  - module: lm_polygraph.estimators.average_ue
+    class_name: AveMTE
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index 4c4ea0daf..a04c9c672 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -13,7 +13,7 @@ device: cpu
 
 task: ats
 
-base_manager: /Users/romanvashurin/workspace/sar_enhancements/gsu/mistral7b_xsum.man
+base_manager: null
 overwrite_base_estimations: false
 
 dataset: xsum

From b9646f0e1fe49ae9496b97c37b0dc88a6b83ce0f Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Wed, 11 Dec 2024 17:24:42 +0400
Subject: [PATCH 47/97] add MaxSampledPerplexity

---
 src/lm_polygraph/estimators/__init__.py   |  5 ++++-
 src/lm_polygraph/estimators/perplexity.py | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index b903b3cc3..c51f08e0a 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -31,7 +31,10 @@
 from .num_sem_sets import NumSemSets
 from .semantic_entropy import SemanticEntropy
 from .semantic_entropy_token import SemanticEntropyToken
-from .perplexity import Perplexity
+from .perplexity import (
+    Perplexity, 
+    MaxSampledPerplexity,
+)
 from .mahalanobis_distance import MahalanobisDistanceSeq
 from .relative_mahalanobis_distance import RelativeMahalanobisDistanceSeq
 from .rde import RDESeq
diff --git a/src/lm_polygraph/estimators/perplexity.py b/src/lm_polygraph/estimators/perplexity.py
index d8c8e22b7..44fd9350f 100644
--- a/src/lm_polygraph/estimators/perplexity.py
+++ b/src/lm_polygraph/estimators/perplexity.py
@@ -27,3 +27,23 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         log_likelihoods = stats["sample_log_likelihoods"]
         ppl = [np.mean(sample_log_likelihoods[0]) for sample_log_likelihoods in log_likelihoods]
         return -np.array(ppl)
+
+class MaxSampledPerplexity(Estimator):
+    """Negated maximum, over samples, of the mean negative log-likelihood."""
+
+    def __init__(self):
+        super().__init__(["sample_log_likelihoods"], "sequence")
+
+    def __str__(self):
+        return "MaxSampledPerplexity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return one value per entry of ``stats["sample_log_likelihoods"]``."""
+        # Largest mean negative log-likelihood among each entry's samples.
+        max_ppl = [
+            max(-np.mean(sequence) for sequence in sample_log_likelihoods)
+            for sample_log_likelihoods in stats["sample_log_likelihoods"]
+        ]
+        # NOTE(review): the maximum is negated on return, so the score falls as
+        # the worst sample gets less likely -- confirm this sign is intended.
+        return -np.array(max_ppl)
\ No newline at end of file

From 6faa7763c6e56812468d9d4b8aea168bf2db1bd1 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 11 Dec 2024 18:17:55 +0400
Subject: [PATCH 48/97] Use common scorers for alignscores and comets

---
 scripts/polygraph_eval                        | 30 ++++++++++++++-----
 .../generation_metrics/alignscore.py          | 13 ++------
 src/lm_polygraph/generation_metrics/comet.py  |  5 ++--
 src/lm_polygraph/utils/manager.py             |  7 +++--
 4 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index ba60b5577..72c01c031 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -13,10 +13,12 @@ import logging
 
 log = logging.getLogger('lm_polygraph')
 
+from evaluate import load
 from lm_polygraph.utils.manager import UEManager
 from lm_polygraph.utils.dataset import Dataset
 from lm_polygraph.utils.model import WhiteboxModel, BlackboxModel, create_ensemble
 from lm_polygraph.utils.processor import Logger
+from lm_polygraph.generation_metrics.alignscore_utils import AlignScorer
 from lm_polygraph.generation_metrics import *
 from lm_polygraph.estimators import *
 from lm_polygraph.utils.openai_chat import OpenAIChat
@@ -441,6 +443,17 @@ def get_generation_metrics(args):
 
     generation_metrics = getattr(args, "generation_metrics", None)
     if not generation_metrics:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        batch_size = 16
+        ckpt_path="https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt"
+        align_scorer = AlignScorer(
+            model="roberta-large",
+            batch_size=batch_size,
+            device=device,
+            ckpt_path=ckpt_path,
+            evaluation_mode="nli_sp",
+        )
+
         result = [
             RougeMetric("rouge1"),
             RougeMetric("rouge2"),
@@ -451,9 +464,9 @@ def get_generation_metrics(args):
                 output_ignore_regex = getattr(args, "output_ignore_regex", None),
                 normalize = getattr(args, "normalize", False),
             ),
-            AlignScore(),
-            AlignScore(target_is_claims=False),
-            AlignScore(ignore_target=True),
+            AlignScore(align_scorer),
+            AlignScore(align_scorer, target_is_claims=False),
+            AlignScore(align_scorer, ignore_target=True),
             RougeMetric("rouge1", sample=True),
             RougeMetric("rouge2", sample=True),
             RougeMetric("rougeL", sample=True),
@@ -464,17 +477,18 @@ def get_generation_metrics(args):
                 normalize = getattr(args, "normalize", False),
                 sample=True,
             ),
-            AlignScore(sample=True),
-            AlignScore(target_is_claims=False, sample=True),
-            AlignScore(ignore_target=True, sample=True),
+            AlignScore(align_scorer, sample=True),
+            AlignScore(align_scorer, target_is_claims=False, sample=True),
+            AlignScore(align_scorer, ignore_target=True, sample=True),
         ]
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
             if getattr(args, "use_claim_ue", False):
                 result += [OpenAIFactCheck(cache_path=args.cache_path, language=getattr(args, "language", "en"))]
         if args.task == "nmt":
             ignore_regex = getattr(args, "source_ignore_regex", None)
-            result += [Comet(source_ignore_regex = ignore_regex),
-                       Comet(source_ignore_regex = ignore_regex, sample=True)]
+            comet_scorer = load("comet")
+            result += [Comet(comet_scorer, source_ignore_regex = ignore_regex),
+                       Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True)]
     else:
         result = []
         for metric in generation_metrics:
diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py
index b47a080a3..57c6d454b 100644
--- a/src/lm_polygraph/generation_metrics/alignscore.py
+++ b/src/lm_polygraph/generation_metrics/alignscore.py
@@ -14,9 +14,8 @@ class AlignScore(GenerationMetric):
 
     def __init__(
         self,
+        scorer,
         lang="en",
-        ckpt_path="https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt",
-        batch_size=16,
         target_is_claims=True,
         ignore_target=False,
         sample: bool = False,
@@ -26,17 +25,9 @@ def __init__(
         else:
             super().__init__(["greedy_texts", "input_texts"], "sequence")
         self.sample = sample
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.target_is_claims = target_is_claims
-        self.batch_size = batch_size
         self.ignore_target = ignore_target
-        self.scorer = AlignScorer(
-            model="roberta-large",
-            batch_size=batch_size,
-            device=device,
-            ckpt_path=ckpt_path,
-            evaluation_mode="nli_sp",
-        )
+        self.scorer = scorer
 
     def __str__(self):
         base = "AlignScore"
diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py
index 4d02b37a6..35c0f9ab4 100644
--- a/src/lm_polygraph/generation_metrics/comet.py
+++ b/src/lm_polygraph/generation_metrics/comet.py
@@ -1,6 +1,5 @@
 import re
 import numpy as np
-from evaluate import load
 
 from typing import List, Dict
 from .generation_metric import GenerationMetric
@@ -12,16 +11,16 @@ class Comet(GenerationMetric):
     between model-generated texts and ground truth texts.
     """
 
-    def __init__(self, source_ignore_regex=None, lang="en", sample: bool = False):
+    def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = False):
         if sample:
             super().__init__(["first_sample_texts", "input_texts"], "sequence")
         else:
             super().__init__(["greedy_texts", "input_texts"], "sequence")
         self.sample = sample
-        self.scorer = load("comet")
         self.source_ignore_regex = (
             re.compile(source_ignore_regex) if source_ignore_regex else None
         )
+        self.scorer = scorer
 
     def __str__(self):
         if self.sample:
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 545ed1421..c2e7cd98f 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -408,8 +408,11 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts):
         
         for key, val in self.stats.items():
             # Get corresponding batch from existing stats
-            val_batch = val[batch_i * self.batch_size : (batch_i + 1) * self.batch_size]
-            batch_stats[key] = val_batch
+            batch_start = batch_i * self.batch_size
+            batch_end = (batch_i + 1) * self.batch_size
+            if len(val) >= batch_end:
+                val_batch = val[batch_start:batch_end]
+                batch_stats[key] = val_batch
 
         for key, val in [
             ("input_texts", inp_texts),

From 52166f5396cb73cdefd2f1150118d613fd9c6f33 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 11 Dec 2024 18:35:07 +0400
Subject: [PATCH 49/97] Do not recalculate dependencies

---
 src/lm_polygraph/utils/manager.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index c2e7cd98f..f56f68fff 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -27,11 +27,12 @@
 
 def _order_calculators(
     stats: List[str],
+    existing_stats: Set[str],
     stat_calculators: Dict[str, StatCalculator],
     stat_dependencies: Dict[str, List[str]],
 ) -> Tuple[List[str], Set[str]]:
     ordered: List[str] = []
-    have_stats: Set[str] = set()
+    have_stats: Set[str] = set(existing_stats)
     while len(stats) > 0:
         stat = stats[0]
         if stat in have_stats:
@@ -339,10 +340,12 @@ def prepare_calculators(self):
         )
         
         # Only calculate stats that are not already calculated
-        stats = list(set(stats) - set(self.stats))
+        existing_stats = set(self.stats.keys())
+        stats = list(set(stats) - existing_stats)
 
         stats, have_stats = _order_calculators(
             stats,
+            existing_stats,
             stat_calculators_dict,
             stat_dependencies_dict,
         )
@@ -374,10 +377,11 @@ def prepare_calculators(self):
             else []
         )
 
-        train_stats = list(set(train_stats) - set(self.stats))
+        train_stats = list(set(train_stats) - existing_stats)
 
         train_stats, _ = _order_calculators(
             train_stats,
+            existing_stats,
             stat_calculators_dict,
             stat_dependencies_dict,
         )
@@ -392,10 +396,11 @@ def prepare_calculators(self):
             if s.startswith("background_train")
         ]
 
-        background_train_stats = list(set(background_train_stats) - set(self.stats))
+        background_train_stats = list(set(background_train_stats) - existing_stats)
 
         background_train_stats, _ = _order_calculators(
             background_train_stats,
+            existing_stats,
             stat_calculators_dict,
             stat_dependencies_dict,
         )

From 58c7f6d4cb956267bf9b1fab7444141a8204829b Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 11 Dec 2024 21:22:25 +0400
Subject: [PATCH 50/97] Consider sampling-based evaluation in gen metric
 wrappers

---
 .../generation_metrics/aggregated_metric.py          | 11 +++++++++--
 .../generation_metrics/preprocess_output_target.py   | 12 +++++++++---
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/lm_polygraph/generation_metrics/aggregated_metric.py b/src/lm_polygraph/generation_metrics/aggregated_metric.py
index bd20e9d93..17a05cc6f 100644
--- a/src/lm_polygraph/generation_metrics/aggregated_metric.py
+++ b/src/lm_polygraph/generation_metrics/aggregated_metric.py
@@ -11,6 +11,7 @@ class AggregatedMetric(GenerationMetric):
 
     def __init__(self, base_metric: GenerationMetric, aggregation: str = "max"):
         self.base_metric = base_metric
+        self.sample = base_metric.sample
         self.level = base_metric.level
         self.stats_dependencies = base_metric.stats_dependencies
         self.aggregation = aggregation
@@ -34,8 +35,14 @@ def __call__(
             np.ndarray: list of aggregated metric values for each sample in input.
         """
         metric_values = []
-        for i, (targets, greedy_text) in enumerate(
-            zip(target_texts, stats["greedy_texts"])
+
+        if self.sample:
+            gen_texts = stats["first_sample_texts"]
+        else:
+            gen_texts = stats["greedy_texts"]
+
+        for i, (targets, gen_text) in enumerate(
+            zip(target_texts, gen_texts)
         ):
             # truncate stats to only process one sample at a time
             truncated_stats = {
diff --git a/src/lm_polygraph/generation_metrics/preprocess_output_target.py b/src/lm_polygraph/generation_metrics/preprocess_output_target.py
index 8d3d56671..0e77415aa 100644
--- a/src/lm_polygraph/generation_metrics/preprocess_output_target.py
+++ b/src/lm_polygraph/generation_metrics/preprocess_output_target.py
@@ -12,6 +12,7 @@ class PreprocessOutputTarget(GenerationMetric):
 
     def __init__(self, base_metric, process_output_fn, process_target_fn):
         self.base_metric = getattr(base_metric, "base_metric", base_metric)
+        self.sample = base_metric.sample
         self.level = base_metric.level
         self.stats_dependencies = base_metric.stats_dependencies
         self.process_output_fn = process_output_fn
@@ -44,8 +45,13 @@ def __call__(
         stats_copy = {k: v for k, v in stats.items() if k in self.stats_dependencies}
         stats_copy = deepcopy(stats_copy)
 
-        stats_copy["greedy_texts"] = [
-            self.process_output_fn(output) for output in stats_copy["greedy_texts"]
-        ]
+        if self.sample:
+            stats_copy["first_sample_texts"] = [
+                self.process_output_fn(output) for output in stats_copy["first_sample_texts"]
+            ]
+        else:
+            stats_copy["greedy_texts"] = [
+                self.process_output_fn(output) for output in stats_copy["greedy_texts"]
+            ]
 
         return self.base_metric(stats_copy, processed_target_texts)

From 518de01d130b80b0205dc79e1548453e707057c6 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 13 Dec 2024 12:01:03 +0400
Subject: [PATCH 51/97] Correctly handle the case when last batch is not whole

---
 src/lm_polygraph/utils/manager.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index f56f68fff..45cf7c6a2 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -410,11 +410,12 @@ def prepare_calculators(self):
 
     def initiate_batch_stats(self, batch_i, inp_texts, target_texts):
         batch_stats: Dict[str, np.ndarray] = {}
-        
+        cur_batch_size = len(inp_texts)
+
         for key, val in self.stats.items():
             # Get corresponding batch from existing stats
-            batch_start = batch_i * self.batch_size
-            batch_end = (batch_i + 1) * self.batch_size
+            batch_start = batch_i * cur_batch_size
+            batch_end = (batch_i + 1) * cur_batch_size
             if len(val) >= batch_end:
                 val_batch = val[batch_start:batch_end]
                 batch_stats[key] = val_batch
@@ -422,10 +423,14 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts):
         for key, val in [
             ("input_texts", inp_texts),
             ("target_texts", target_texts),
-        ]:  
+        ]:
             if key not in batch_stats:
                 self.stats[key] += val
                 batch_stats[key] = val
+            else:
+                # Check that new stats will be calculated
+                # against the same input texts and targets
+                assert np.all(np.array(batch_stats[key]) == np.array(val))
 
         batch_stats["model"] = self.model
 

From c5795918b761efe68b06f42a164ab263cc784f17 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 13 Dec 2024 13:38:21 +0400
Subject: [PATCH 52/97] Use common batch size for full batches

---
 src/lm_polygraph/utils/manager.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 45cf7c6a2..ea27fed32 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -414,8 +414,12 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts):
 
         for key, val in self.stats.items():
             # Get corresponding batch from existing stats
-            batch_start = batch_i * cur_batch_size
+            batch_start = batch_i * self.batch_size
+            # If last batch is not full, we need to adjust the end index
             batch_end = (batch_i + 1) * cur_batch_size
+            # This will only be true if the calculation is based off
+            # existing manager. Otherwise, all stats will contain only
+            # values calculated in previous batches
             if len(val) >= batch_end:
                 val_batch = val[batch_start:batch_end]
                 batch_stats[key] = val_batch

From b67933cf92a5a9e96001e28cf0ce8bb8becf2910 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 13 Dec 2024 14:25:22 +0400
Subject: [PATCH 53/97] Fix MTESAR

---
 src/lm_polygraph/estimators/sentence_sar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py
index c2ff21395..e8f5278b9 100644
--- a/src/lm_polygraph/estimators/sentence_sar.py
+++ b/src/lm_polygraph/estimators/sentence_sar.py
@@ -338,7 +338,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             # Compute sentence relevance
             sent_relevance = R_s.sum(-1) / self.t
             # Compute SentenceSAR (Uncertainty Estimation) using PPL
-            E_s = -np.log(sent_relevance + entropy)
+            E_s = np.log(sent_relevance + entropy)
             sentenceSAR.append(E_s.mean())
 
         return np.array(sentenceSAR)

From 19535ee5fc515291627118424c668dc80e18e7cc Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 13 Dec 2024 16:04:48 +0400
Subject: [PATCH 54/97] Import average baselines, correct batch initiation

---
 src/lm_polygraph/estimators/__init__.py | 2 +-
 src/lm_polygraph/utils/manager.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index c51f08e0a..9eb981925 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -83,4 +83,4 @@
 from .linguistic_1s import Linguistic1S
 from .label_prob import LabelProb
 from .p_true_empirical import PTrueEmpirical
-from .average_ue import AveMaxprob
+from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index ea27fed32..e4494c9ab 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -416,7 +416,7 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts):
             # Get corresponding batch from existing stats
             batch_start = batch_i * self.batch_size
             # If last batch is not full, we need to adjust the end index
-            batch_end = (batch_i + 1) * cur_batch_size
+            batch_end = batch_start + cur_batch_size
             # This will only be true if the calculation is based off
             # existing manager. Otherwise, all stats will contain only
             # values calculated in previous batches

From 1b67bbd8188d5b910e29a781e90a5b4f0e882579 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 13 Dec 2024 17:13:27 +0400
Subject: [PATCH 55/97] Only check input stats for consistency

---
 src/lm_polygraph/utils/manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index e4494c9ab..7d9a32f24 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -431,9 +431,9 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts):
             if key not in batch_stats:
                 self.stats[key] += val
                 batch_stats[key] = val
-            else:
+            elif key == "input_texts":
                 # Check that new stats will be calculated
-                # against the same input texts and targets
+                # against the same input texts
                 assert np.all(np.array(batch_stats[key]) == np.array(val))
 
         batch_stats["model"] = self.model

From f034cbd2fafb82e92fed7c983fcec68269cbbf80 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 17 Dec 2024 13:12:45 +0400
Subject: [PATCH 56/97] Lighten the prr calculation, use logs as base for GSU
 and other new methods

---
 scripts/polygraph_eval                        |   4 -
 src/lm_polygraph/estimators/__init__.py       |   3 +-
 src/lm_polygraph/estimators/average_ue.py     |   8 +-
 src/lm_polygraph/estimators/gsu.py            |  77 +---------
 .../estimators/semantic_average_ue.py         | 131 ++++++++++++++++++
 src/lm_polygraph/utils/manager.py             |  15 +-
 6 files changed, 150 insertions(+), 88 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/semantic_average_ue.py

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 72c01c031..d6a373497 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -455,8 +455,6 @@ def get_generation_metrics(args):
         )
 
         result = [
-            RougeMetric("rouge1"),
-            RougeMetric("rouge2"),
             RougeMetric("rougeL"),
             BLEUMetric(),
             AccuracyMetric(
@@ -467,8 +465,6 @@ def get_generation_metrics(args):
             AlignScore(align_scorer),
             AlignScore(align_scorer, target_is_claims=False),
             AlignScore(align_scorer, ignore_target=True),
-            RougeMetric("rouge1", sample=True),
-            RougeMetric("rouge2", sample=True),
             RougeMetric("rougeL", sample=True),
             BLEUMetric(sample=True),
             AccuracyMetric(
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 9eb981925..06956ecc6 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -75,7 +75,7 @@
     #DistilOneSentenceSAR,
 )
 from .sar import SAR
-from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU, CCPGSU
+from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU
 from .renyi_neg import RenyiNeg
 from .fisher_rao import FisherRao
 from .verbalized_1s import Verbalized1S
@@ -84,3 +84,4 @@
 from .label_prob import LabelProb
 from .p_true_empirical import PTrueEmpirical
 from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE
+from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE
diff --git a/src/lm_polygraph/estimators/average_ue.py b/src/lm_polygraph/estimators/average_ue.py
index b03147a1f..a7748e9e6 100644
--- a/src/lm_polygraph/estimators/average_ue.py
+++ b/src/lm_polygraph/estimators/average_ue.py
@@ -25,7 +25,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_probs, sample_sentence_similarity in zip(
             batch_sample_log_probs, batch_sample_sentence_similarity
         ):
-            sample_probs = -np.exp(np.array(sample_log_probs))
+            sample_probs = -np.array(sample_log_probs)
 
             ave.append(sample_probs.mean())
 
@@ -50,7 +50,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_likelihoods, sample_sentence_similarity in zip(
             batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
-            ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+            ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
             ave.append(ppl.mean())
 
@@ -99,9 +99,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
-            tokenSAR = np.array(tokenSAR)
-            probs_token_sar = -np.exp(-tokenSAR)
-            ave.append(probs_token_sar.mean())
+            ave.append(np.mean(tokenSAR))
 
         return np.array(ave)
 
diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py
index 8aae841d1..4d58b8fee 100644
--- a/src/lm_polygraph/estimators/gsu.py
+++ b/src/lm_polygraph/estimators/gsu.py
@@ -37,7 +37,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_probs, sample_sentence_similarity in zip(
             batch_sample_log_probs, batch_sample_sentence_similarity
         ):
-            sample_probs = -np.exp(np.array(sample_log_probs))
+            sample_probs = -np.array(sample_log_probs)
             R_s = (
                 sample_probs
                 * sample_sentence_similarity
@@ -79,7 +79,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_likelihoods, sample_sentence_similarity in zip(
             batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
-            ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+            ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
             R_s = (
                 ppl
@@ -146,10 +146,8 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
-            tokenSAR = np.array(tokenSAR)
-            probs_token_sar = -np.exp(-tokenSAR)
             R_s = (
-                probs_token_sar
+                tokenSAR
                 * sample_sentence_similarity
             )
             E_s = R_s.sum(-1)
@@ -193,77 +191,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         ):
             # Use MTE for sentence relevance calculation
             R_s = sample_entropy * sample_sentence_similarity
-            
+
             # Compute sentence relevance by summing along the last axis
             E_s = R_s.sum(-1)
 
             GSU.append(E_s.mean())
 
         return np.array(GSU)
-
-
-class CCPGSU(Estimator):
-    def __init__(
-        self,
-        verbose: bool = False
-    ):
-        super().__init__(["sample_sentence_similarity",
-                          "sample_tokens",
-                          "sample_tokens_alternatives",
-                          "sample_tokens_alternatives_nli"], "sequence")
-        self.verbose = verbose
-
-    def __str__(self):
-        return "CCPGSU"
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the sentenceSAR for each sample in the input statistics.
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * corresponding log probabilities in 'sample_log_probs',
-                * matrix with cross-encoder similarities in 'sample_sentence_similarity'
-        Returns:
-            np.ndarray: float sentenceSAR for each sample in input statistics.
-                Higher values indicate more uncertain samples.
-        """
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-        batch_sample_tokens = stats["sample_tokens"]
-        batch_sample_tokens_alternatives = stats["sample_tokens_alternatives"]
-        batch_sample_tokens_alternatives_nli = stats["sample_tokens_alternatives_nli"]
-
-        GSU = []
-        for sample_sentence_similarity, \
-            samples_tokens, \
-            samples_tokens_alternatives, \
-            samples_tokens_alternatives_nli in zip(
-                batch_sample_sentence_similarity,
-                batch_sample_tokens,
-                batch_sample_tokens_alternatives,
-                batch_sample_tokens_alternatives_nli
-            ):
-            ccps = []
-            for sample_tokens, \
-                sample_tokens_alternatives, \
-                sample_tokens_alternatives_nli in zip(
-                    samples_tokens,
-                    samples_tokens_alternatives,
-                    samples_tokens_alternatives_nli
-                ):
-                ccp_stats = {
-                    "greedy_tokens": [sample_tokens],
-                    "greedy_tokens_alternatives": [sample_tokens_alternatives],
-                    "greedy_tokens_alternatives_nli": [sample_tokens_alternatives_nli]
-                }
-                ccps.append(ClaimConditionedProbability()(stats=ccp_stats)[0])
-
-            R_s = (
-                ccps
-                * sample_sentence_similarity
-            )
-            sent_relevance = R_s.sum(-1)
-
-            GSU.append(E_s.mean())
-
-        return np.array(GSU)
diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py
new file mode 100644
index 000000000..962ad91e9
--- /dev/null
+++ b/src/lm_polygraph/estimators/semantic_average_ue.py
@@ -0,0 +1,131 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+
+
+class SemanticAveMaxprob(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SemanticAveMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = -np.array(sample_log_probs)
+            weights = sample_sentence_similarity[0, :]
+            ave.append(np.average(sample_probs, weights=weights))
+
+        return np.array(ave)
+
+class SemanticAvePPL(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SemanticAvePPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        ):
+            ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+            weights = sample_sentence_similarity[0, :]
+
+            ave.append(np.average(ppl, weights=weights))
+
+        return np.array(ave)
+
+class SemanticAveTokenSAR(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(
+            [
+                "sample_sentence_similarity",
+                "sample_log_likelihoods",
+                "sample_token_similarity",
+            ],
+            "sequence",
+        )
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SemanticAveTokenSAR"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_token_similarity = stats["sample_token_similarity"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for batch_data in zip(
+            batch_sample_log_likelihoods,
+            batch_sample_token_similarity,
+            batch_sample_sentence_similarity,
+        ):
+            sample_log_likelihoods = batch_data[0]
+            sample_token_similarity = batch_data[1]
+            sample_sentence_similarity = batch_data[2]
+
+            tokenSAR = []
+            for log_likelihoods, token_similarity in zip(
+                sample_log_likelihoods, sample_token_similarity
+            ):
+                log_likelihoods = np.array(log_likelihoods)
+                R_t = 1 - token_similarity
+                R_t_norm = R_t / R_t.sum()
+                E_t = -log_likelihoods * R_t_norm
+                tokenSAR.append(E_t.sum())
+
+            weights = sample_sentence_similarity[0, :]
+
+            ave.append(np.average(tokenSAR, weights=weights))
+
+        return np.array(ave)
+
+class SemanticAveMTE(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SemanticAveMTE"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):
+            weights = sample_sentence_similarity[0, :]
+            ave.append(np.average(sample_entropy, weights=weights))
+
+        return np.array(ave)
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 7d9a32f24..db3c4a0ec 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -516,9 +516,13 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
         return self.metrics
 
     def eval_ue(self):
-        for (e_level, e_name), estimator_values in self.estimations.items():
-            for (gen_level, gen_name), generation_metric in self.gen_metrics.items():
-                for ue_metric in self.ue_metrics:
+        for (gen_level, gen_name), generation_metric in self.gen_metrics.items():
+            generation_metric = np.array(generation_metric)
+            for ue_metric in self.ue_metrics:
+                oracle_score = ue_metric(-generation_metric, generation_metric)
+                random_score = get_random_scores(ue_metric, generation_metric)
+
+                for (e_level, e_name), estimator_values in self.estimations.items():
                     if gen_level != e_level:
                         continue
                     if len(estimator_values) != len(generation_metric):
@@ -529,11 +533,12 @@ def eval_ue(self):
                     # TODO: Report how many nans!
                     # This is important to know for a user
                     ue, metric = _delete_nans(estimator_values, generation_metric)
+                    assert len(ue) == len(estimator_values)
+                    assert len(metric) == len(generation_metric)
+
                     if len(ue) == 0:
                         self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan
                     else:
-                        oracle_score = ue_metric(-metric, metric)
-                        random_score = get_random_scores(ue_metric, metric)
                         ue_metric_val = ue_metric(ue, metric)
                         self.metrics[e_level, e_name, gen_name, str(ue_metric)] = (
                             ue_metric_val

From 04b3d523fb3904f305e28e8a13c4e734764798bd Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 17 Dec 2024 13:24:55 +0400
Subject: [PATCH 57/97] Save first sample texts separately

---
 examples/configs/polygraph_eval_coqa_sentsar.yaml       | 1 +
 examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml  | 1 +
 examples/configs/polygraph_eval_mmlu_sentsar.yaml       | 1 +
 examples/configs/polygraph_eval_triviaqa_sentsar.yaml   | 1 +
 examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml | 1 +
 examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 1 +
 examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 1 +
 examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml | 1 +
 examples/configs/polygraph_eval_xsum_sentsar.yaml       | 1 +
 9 files changed, 9 insertions(+)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 7af151dc6..f7dcdc754 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -41,6 +41,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index d13ccfc0f..e5fbdd9c4 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -42,6 +42,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 target_ignore_regex: "(?s).*#### "
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 7162070b0..20437d081 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -43,6 +43,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index f9fa19928..6cf4fd248 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -43,6 +43,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
index 38f283f9b..89e058463 100644
--- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
@@ -41,6 +41,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 1dbed406e..7b4e95e49 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -41,6 +41,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index b0e766163..62fe84c26 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -41,6 +41,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
index b52328b10..cece970ac 100644
--- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
@@ -41,6 +41,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index a04c9c672..56e70c7cf 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -42,6 +42,7 @@ save_stats:
   - sample_sentence_similarity
   - sample_token_similarity
   - sample_entropy
+  - first_sample_texts
 entropy_top_k: 50
 
 train_dataset: null

From 3f98b5965a4b4bb264deeee8ea3eea658fc4597e Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 25 Dec 2024 16:01:55 +0400
Subject: [PATCH 58/97] Add degmat based on CE, log/exp differentiation for
 semantic methods and semantic median methods

---
 src/lm_polygraph/estimators/__init__.py       |   3 +-
 src/lm_polygraph/estimators/deg_mat.py        |  45 +++++
 src/lm_polygraph/estimators/gsu.py            |  42 ++++-
 .../estimators/semantic_average_ue.py         |  30 +++-
 .../estimators/semantic_median_ue.py          | 157 ++++++++++++++++++
 5 files changed, 268 insertions(+), 9 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/semantic_median_ue.py

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 06956ecc6..74ef07953 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -25,7 +25,7 @@
 from .monte_carlo_sequence_entropy import MonteCarloSequenceEntropy
 from .monte_carlo_normalized_sequence_entropy import MonteCarloNormalizedSequenceEntropy
 from .lexical_similarity import LexicalSimilarity
-from .deg_mat import DegMat
+from .deg_mat import DegMat, CEDegMat
 from .eccentricity import Eccentricity
 from .eig_val_laplacian import EigValLaplacian
 from .num_sem_sets import NumSemSets
@@ -85,3 +85,4 @@
 from .p_true_empirical import PTrueEmpirical
 from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE
 from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE
+from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE
diff --git a/src/lm_polygraph/estimators/deg_mat.py b/src/lm_polygraph/estimators/deg_mat.py
index 634884c63..f373a09d3 100644
--- a/src/lm_polygraph/estimators/deg_mat.py
+++ b/src/lm_polygraph/estimators/deg_mat.py
@@ -88,3 +88,48 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 print(f"generated answers: {answers}")
             res.append(self.U_DegMat(i, stats))
         return np.array(res)
+
+
+class CEDegMat(Estimator):
+    """
+    Estimates the sequence-level uncertainty of a language model following the method of
+    "The Degree Matrix" as provided in the paper https://arxiv.org/abs/2305.19187.
+    Works with both whitebox and blackbox models (initialized using
+    lm_polygraph.utils.model.BlackboxModel/WhiteboxModel).
+
+    Elements on diagonal of matrix D are sums of similarities between the particular number
+    (position in matrix) and other answers. Thus, it is an average pairwise distance
+    (lower values indicated smaller distance between answers which means greater uncertainty).
+    """
+
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_texts"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "CEDegMat"
+
+    def U_DegMat(self, W, answers):
+        # The Degree Matrix
+        D = np.diag(W.sum(axis=1))
+        return np.trace(len(answers) - D) / (len(answers) ** 2)
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """
+        Estimates the uncertainties for each sample in the input statistics.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
+                * generated samples in 'sample_texts',
+                * matrix with semantic similarities in 'semantic_matrix_entail'/'semantic_matrix_contra'
+        Returns:
+            np.ndarray: float uncertainty for each sample in input statistics.
+                Higher values indicate more uncertain samples.
+        """
+        res = []
+        for W, answers in zip(stats["sample_sentence_similarity"], stats["sample_texts"]):
+            res.append(self.U_DegMat(W, answers))
+        return np.array(res)
diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py
index 4d58b8fee..2969dca95 100644
--- a/src/lm_polygraph/estimators/gsu.py
+++ b/src/lm_polygraph/estimators/gsu.py
@@ -11,12 +11,17 @@ class MaxprobGSU(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        exp: bool = False
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
+        self.exp = exp
 
     def __str__(self):
-        return "MaxprobGSU"
+        if self.exp:
+            return "MaxprobGSUexp"
+        else:
+            return "MaxprobGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -38,12 +43,16 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             batch_sample_log_probs, batch_sample_sentence_similarity
         ):
             sample_probs = -np.array(sample_log_probs)
+            if self.exp:
+                sample_probs = -np.exp(-sample_probs)
             R_s = (
                 sample_probs
                 * sample_sentence_similarity
             )
             E_s = R_s.sum(-1)
 
+            E_s = E_s / sample_sentence_similarity.sum(-1)
+
             GSU.append(E_s.mean())
 
         return np.array(GSU)
@@ -52,13 +61,18 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 class PPLGSU(Estimator):
     def __init__(
         self,
-        verbose: bool = False
+        verbose: bool = False,
+        exp: bool = False
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
+        self.exp = exp
 
     def __str__(self):
-        return "PPLGSU"
+        if self.exp:
+            return "PPLGSUexp"
+        else:
+            return "PPLGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -81,12 +95,17 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         ):
             ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
+            if self.exp:
+                ppl = -np.exp(-ppl)
+
             R_s = (
                 ppl
                 * sample_sentence_similarity
             )
             E_s = R_s.sum(-1)
 
+            E_s = E_s / sample_sentence_similarity.sum(-1)
+
             GSU.append(E_s.mean())
 
         return np.array(GSU)
@@ -95,7 +114,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 class TokenSARGSU(Estimator):
     def __init__(
         self,
-        verbose: bool = False):
+        verbose: bool = False,
+        exp: bool = False
+    ):
         super().__init__(
             [
                 "sample_sentence_similarity",
@@ -105,9 +126,13 @@ def __init__(
             "sequence",
         )
         self.verbose = verbose
+        self.exp = exp
 
     def __str__(self):
-        return "TokenSARGSU"
+        if self.exp:
+            return "TokenSARGSUexp"
+        else:
+            return "TokenSARGSU"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -146,12 +171,17 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
+            if self.exp:
+                tokenSAR = -np.exp(-np.array(tokenSAR))
+
             R_s = (
                 tokenSAR
                 * sample_sentence_similarity
             )
             E_s = R_s.sum(-1)
 
+            E_s = E_s / sample_sentence_similarity.sum(-1)
+
             GSU.append(E_s.mean())
 
         return np.array(GSU)
@@ -195,6 +225,8 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             # Compute sentence relevance by summing along the last axis
             E_s = R_s.sum(-1)
 
+            E_s = E_s / sample_sentence_similarity.sum(-1)
+
             GSU.append(E_s.mean())
 
         return np.array(GSU)
diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py
index 962ad91e9..fb361d44d 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue.py
@@ -10,12 +10,17 @@ class SemanticAveMaxprob(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        exp: bool = False
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
+        self.exp = exp
 
     def __str__(self):
-        return "SemanticAveMaxprob"
+        if self.exp:
+            return "SemanticAveMaxprobexp"
+        else:
+            return "SemanticAveMaxprob"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_probs = stats["sample_log_probs"]
@@ -26,6 +31,8 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             batch_sample_log_probs, batch_sample_sentence_similarity
         ):
             sample_probs = -np.array(sample_log_probs)
+            if self.exp:
+                sample_probs = -np.exp(-sample_probs)
             weights = sample_sentence_similarity[0, :]
             ave.append(np.average(sample_probs, weights=weights))
 
@@ -35,12 +42,17 @@ class SemanticAvePPL(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        exp: bool = False
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
+        self.exp = exp
 
     def __str__(self):
-        return "SemanticAvePPL"
+        if self.exp:
+            return "SemanticAvePPLexp"
+        else:
+            return "SemanticAvePPL"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
@@ -51,6 +63,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
             ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+
+            if self.exp:
+                ppl = -np.exp(-ppl)
+
             weights = sample_sentence_similarity[0, :]
 
             ave.append(np.average(ppl, weights=weights))
@@ -61,6 +77,7 @@ class SemanticAveTokenSAR(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        exp: bool = False
     ):
         super().__init__(
             [
@@ -71,9 +88,13 @@ def __init__(
             "sequence",
         )
         self.verbose = verbose
+        self.exp = exp
 
     def __str__(self):
-        return "SemanticAveTokenSAR"
+        if self.exp:
+            return "SemanticAveTokenSARexp"
+        else:
+            return "SemanticAveTokenSAR"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
@@ -99,6 +120,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
+            
+            if self.exp:
+                tokenSAR = -np.exp(-np.array(tokenSAR))
 
             weights = sample_sentence_similarity[0, :]
 
diff --git a/src/lm_polygraph/estimators/semantic_median_ue.py b/src/lm_polygraph/estimators/semantic_median_ue.py
new file mode 100644
index 000000000..11b9beaac
--- /dev/null
+++ b/src/lm_polygraph/estimators/semantic_median_ue.py
@@ -0,0 +1,157 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+
+from wquantiles import median
+
+
+class SemanticMedianMaxprob(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticMedianMaxprobexp"
+        else:
+            return "SemanticMedianMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            sample_probs = -np.array(sample_log_probs)
+            if self.exp:
+                sample_probs = -np.exp(-sample_probs)
+            weights = sample_sentence_similarity[0, :]
+            ave.append(median(sample_probs, weights))
+
+        return np.array(ave)
+
+class SemanticMedianPPL(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticMedianPPLexp"
+        else:
+            return "SemanticMedianPPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        ):
+            ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+
+            if self.exp:
+                ppl = -np.exp(-ppl)
+
+            weights = sample_sentence_similarity[0, :]
+
+            ave.append(median(ppl, weights))
+
+        return np.array(ave)
+
+class SemanticMedianTokenSAR(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False
+    ):
+        super().__init__(
+            [
+                "sample_sentence_similarity",
+                "sample_log_likelihoods",
+                "sample_token_similarity",
+            ],
+            "sequence",
+        )
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticMedianTokenSARexp"
+        else:
+            return "SemanticMedianTokenSAR"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_token_similarity = stats["sample_token_similarity"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for batch_data in zip(
+            batch_sample_log_likelihoods,
+            batch_sample_token_similarity,
+            batch_sample_sentence_similarity,
+        ):
+            sample_log_likelihoods = batch_data[0]
+            sample_token_similarity = batch_data[1]
+            sample_sentence_similarity = batch_data[2]
+
+            tokenSAR = []
+            for log_likelihoods, token_similarity in zip(
+                sample_log_likelihoods, sample_token_similarity
+            ):
+                log_likelihoods = np.array(log_likelihoods)
+                R_t = 1 - token_similarity
+                R_t_norm = R_t / R_t.sum()
+                E_t = -log_likelihoods * R_t_norm
+                tokenSAR.append(E_t.sum())
+            
+            if self.exp:
+                tokenSAR = -np.exp(-np.array(tokenSAR))
+
+            weights = sample_sentence_similarity[0, :]
+
+            ave.append(median(np.array(tokenSAR), weights))
+
+        return np.array(ave)
+
+class SemanticMedianMTE(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SemanticMedianMTE"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        ave = []
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):
+            weights = sample_sentence_similarity[0, :]
+            ave.append(median(np.array(sample_entropy), weights))
+
+        return np.array(ave)

From 93aecdfbdcb7e16da2425c6f7f297c21ee723c93 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 25 Dec 2024 19:19:06 +0400
Subject: [PATCH 59/97] Add sample-based gen metrics from best samples

---
 .../generation_metrics/accuracy.py            | 22 +++++++++--
 .../generation_metrics/alignscore.py          | 23 ++++++++++--
 src/lm_polygraph/generation_metrics/bleu.py   | 22 +++++++++--
 src/lm_polygraph/generation_metrics/comet.py  | 24 ++++++++++--
 src/lm_polygraph/generation_metrics/rouge.py  | 23 ++++++++++--
 src/lm_polygraph/stat_calculators/__init__.py |  2 +-
 src/lm_polygraph/stat_calculators/sample.py   | 37 +++++++++++++++++++
 7 files changed, 135 insertions(+), 18 deletions(-)

diff --git a/src/lm_polygraph/generation_metrics/accuracy.py b/src/lm_polygraph/generation_metrics/accuracy.py
index a71c1b989..e3e4d13ed 100644
--- a/src/lm_polygraph/generation_metrics/accuracy.py
+++ b/src/lm_polygraph/generation_metrics/accuracy.py
@@ -19,10 +19,16 @@ def __init__(
         self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False
     ):
         if sample:
-            super().__init__(["first_sample_texts"], "sequence")
+            super().__init__([
+                "first_sample_texts",
+                "best_sample_texts",
+                "best_normalized_sample_texts",
+                "input_texts"],
+            "sequence")
         else:
             super().__init__(["greedy_texts"], "sequence")
         self.sample = sample
+        self.sample_strategy = sample_strategy
         self.target_ignore_regex = (
             re.compile(target_ignore_regex) if target_ignore_regex else None
         )
@@ -38,7 +44,10 @@ def __init__(
 
     def __str__(self):
         if self.sample:
-            return "SampleAccuracy"
+            if self.sample_strategy == "First":
+                return "SampleAccuracy"
+            else:
+                return f"{self.sample_strategy}SampleAccuracy"
         return "Accuracy"
 
     def _score_single(self, output: str, target: str) -> int:
@@ -73,7 +82,14 @@ def __call__(
             np.ndarray: list of accuracies: 1 if generated text is equal to ground-truth and 0 otherwise.
         """
         if self.sample:
-            gen_texts = stats["first_sample_texts"]
+            if self.sample_strategy == "First":
+                gen_texts = stats["first_sample_texts"]
+            elif self.sample_strategy == "Best":
+                gen_texts = stats["best_sample_texts"]
+            elif self.sample_strategy == "BestNormalized":
+                gen_texts = stats["best_normalized_sample_texts"]
+            else:
+                raise ValueError(f"Invalid sample strategy: {self.sample_strategy}")
         else:
             gen_texts = stats["greedy_texts"]
 
diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py
index 57c6d454b..a6bd49504 100644
--- a/src/lm_polygraph/generation_metrics/alignscore.py
+++ b/src/lm_polygraph/generation_metrics/alignscore.py
@@ -19,12 +19,19 @@ def __init__(
         target_is_claims=True,
         ignore_target=False,
         sample: bool = False,
+        sample_strategy: str = "First",
     ):
         if sample:
-            super().__init__(["first_sample_texts", "input_texts"], "sequence")
+            super().__init__([
+                "first_sample_texts",
+                "best_sample_texts",
+                "best_normalized_sample_texts",
+                "input_texts"],
+            "sequence")
         else:
             super().__init__(["greedy_texts", "input_texts"], "sequence")
         self.sample = sample
+        self.sample_strategy = sample_strategy
         self.target_is_claims = target_is_claims
         self.ignore_target = ignore_target
         self.scorer = scorer
@@ -39,7 +46,10 @@ def __str__(self):
             base += "TargetOutput"
 
         if self.sample:
-            return f"Sample{base}"
+            if self.sample_strategy == "First":
+                return f"Sample{base}"
+            else:
+                return f"{self.sample_strategy}Sample{base}"
 
         return base
 
@@ -60,7 +70,14 @@ def __call__(
             np.ndarray: list of AlignScore Scores for each sample in input.
         """
         if self.sample:
-            gen_texts = stats["first_sample_texts"]
+            if self.sample_strategy == "First":
+                gen_texts = stats["first_sample_texts"]
+            elif self.sample_strategy == "Best":
+                gen_texts = stats["best_sample_texts"]
+            elif self.sample_strategy == "BestNormalized":
+                gen_texts = stats["best_normalized_sample_texts"]
+            else:
+                raise ValueError(f"Invalid sample strategy: {self.sample_strategy}")
         else:
             gen_texts = stats["greedy_texts"]
 
diff --git a/src/lm_polygraph/generation_metrics/bleu.py b/src/lm_polygraph/generation_metrics/bleu.py
index dd9b19ae7..91b4b5098 100644
--- a/src/lm_polygraph/generation_metrics/bleu.py
+++ b/src/lm_polygraph/generation_metrics/bleu.py
@@ -12,15 +12,24 @@ class BLEUMetric(GenerationMetric):
 
     def __init__(self, sample: bool = False):
         if sample:
-            super().__init__(["first_sample_texts"], "sequence")
+            super().__init__([
+                "first_sample_texts",
+                "best_sample_texts",
+                "best_normalized_sample_texts",
+                "input_texts"],
+            "sequence")
         else:
             super().__init__(["greedy_texts"], "sequence")
         self.sample = sample
+        self.sample_strategy = sample_strategy
         self.scorer = BLEU(effective_order=True, lowercase=True)
 
     def __str__(self):
         if self.sample:
-            return "SampleBLEU"
+            if self.sample_strategy == "First":
+                return "SampleBLEU"
+            else:
+                return f"{self.sample_strategy}SampleBLEU"
         return "BLEU"
 
     def _score_single(self, t1: str, t2: str):
@@ -44,7 +53,14 @@ def __call__(
             np.ndarray: list of BLEU Scores for each sample in input.
         """
         if self.sample:
-            gen_texts = stats["first_sample_texts"]
+            if self.sample_strategy == "First":
+                gen_texts = stats["first_sample_texts"]
+            elif self.sample_strategy == "Best":
+                gen_texts = stats["best_sample_texts"]
+            elif self.sample_strategy == "BestNormalized":
+                gen_texts = stats["best_normalized_sample_texts"]
+            else:
+                raise ValueError(f"Invalid sample strategy: {self.sample_strategy}")
         else:
             gen_texts = stats["greedy_texts"]
 
diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py
index 35c0f9ab4..f91c833a3 100644
--- a/src/lm_polygraph/generation_metrics/comet.py
+++ b/src/lm_polygraph/generation_metrics/comet.py
@@ -11,12 +11,18 @@ class Comet(GenerationMetric):
     between model-generated texts and ground truth texts.
     """
 
-    def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = False):
+    def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = False, sample_strategy: str = "First"):
         if sample:
-            super().__init__(["first_sample_texts", "input_texts"], "sequence")
+            super().__init__([
+                "first_sample_texts",
+                "best_sample_texts",
+                "best_normalized_sample_texts",
+                "input_texts"],
+            "sequence")
         else:
             super().__init__(["greedy_texts", "input_texts"], "sequence")
         self.sample = sample
+        self.sample_strategy = sample_strategy
         self.source_ignore_regex = (
             re.compile(source_ignore_regex) if source_ignore_regex else None
         )
@@ -24,7 +30,10 @@ def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = F
 
     def __str__(self):
         if self.sample:
-            return "SampleComet"
+            if self.sample_strategy == "First":
+                return f"SampleComet"
+            else:
+                return f"{self.sample_strategy}SampleComet"
         return "Comet"
 
     def _filter_text(self, text: str, ignore_regex: re.Pattern) -> str:
@@ -61,7 +70,14 @@ def __call__(
         ]
 
         if self.sample:
-            gen_texts = stats["first_sample_texts"]
+            if self.sample_strategy == "First":
+                gen_texts = stats["first_sample_texts"]
+            elif self.sample_strategy == "Best":
+                gen_texts = stats["best_sample_texts"]
+            elif self.sample_strategy == "BestNormalized":
+                gen_texts = stats["best_normalized_sample_texts"]
+            else:
+                raise ValueError(f"Invalid sample strategy: {self.sample_strategy}")
         else:
             gen_texts = stats["greedy_texts"]
 
diff --git a/src/lm_polygraph/generation_metrics/rouge.py b/src/lm_polygraph/generation_metrics/rouge.py
index 86ac231e3..cea5201ec 100644
--- a/src/lm_polygraph/generation_metrics/rouge.py
+++ b/src/lm_polygraph/generation_metrics/rouge.py
@@ -15,7 +15,7 @@ class RougeMetric(GenerationMetric):
     Calculates Rouge metric between model-generated texts and ground truth texts.
     """
 
-    def __init__(self, rouge_name, sample: bool = False):
+    def __init__(self, rouge_name, sample: bool = False, sample_strategy: str = "First"):
         """
         Parameters:
             rouge_name (str): rouge metric type. Possible values:
@@ -24,16 +24,24 @@ def __init__(self, rouge_name, sample: bool = False):
                 * rougeL
         """
         if sample:
-            super().__init__(["first_sample_texts"], "sequence")
+            super().__init__([
+                "first_sample_texts",
+                "best_sample_texts",
+                "best_normalized_sample_texts"],
+            "sequence")
         else:
             super().__init__(["greedy_texts"], "sequence")
         self.sample = sample
+        self.sample_strategy = sample_strategy
         self.rouge_name = rouge_name
         self.scorer = rouge_scorer.RougeScorer([rouge_name], use_stemmer=True)
 
     def __str__(self):
         if self.sample:
-            return f"SampleRouge_{self.rouge_name}"
+            if self.sample_strategy == "First":
+                return f"SampleRouge_{self.rouge_name}"
+            else:
+                return f"{self.sample_strategy}SampleRouge_{self.rouge_name}"
         return f"Rouge_{self.rouge_name}"
 
     def _score_single(self, t1: str, t2: str):
@@ -59,7 +67,14 @@ def __call__(
             np.ndarray: list of Rouge Scores for each sample in input.
         """
         if self.sample:
-            gen_texts = stats["first_sample_texts"]
+            if self.sample_strategy == "First":
+                gen_texts = stats["first_sample_texts"]
+            elif self.sample_strategy == "Best":
+                gen_texts = stats["best_sample_texts"]
+            elif self.sample_strategy == "BestNormalized":
+                gen_texts = stats["best_normalized_sample_texts"]
+            else:
+                raise ValueError(f"Invalid sample strategy: {self.sample_strategy}")
         else:
             gen_texts = stats["greedy_texts"]
 
diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index 29844b507..1ba3b4057 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -10,7 +10,7 @@
 )
 from .entropy import EntropyCalculator
 from .entropy import SampleEntropyCalculator
-from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator, FirstSampleCalculator
+from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator, FirstSampleCalculator, BestSampleCalculator
 from .sample_alternatives_nli import SampleAlternativesNLICalculator
 from .greedy_alternatives_nli import (
     GreedyAlternativesNLICalculator,
diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index d00f4f6f5..b6d6b1553 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -232,3 +232,40 @@ def __call__(
             "first_sample_texts": first_sample_texts,
         }
 
+class BestSampleCalculator(StatCalculator):
+    """Selects, per input, the sampled text with the highest score.
+
+    Produces "best_sample_texts" (argmax of sample_log_probs) and
+    "best_normalized_sample_texts" (argmax of mean token log-likelihood).
+    """
+
+    def __init__(self):
+        stats = ["best_sample_texts", "best_normalized_sample_texts"]
+        dependencies = ["sample_texts", "sample_log_probs", "sample_log_likelihoods"]
+        super().__init__(stats, dependencies)
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        # texts, model and max_new_tokens are accepted but not used here.
+        best_texts, best_norm_texts = [], []
+        per_input = zip(
+            dependencies["sample_texts"],
+            dependencies["sample_log_probs"],
+            dependencies["sample_log_likelihoods"],
+        )
+        for candidates, seq_log_probs, token_lls in per_input:
+            # Raw criterion: largest entry of sample_log_probs.
+            best_texts.append(candidates[np.argmax(seq_log_probs)])
+            # Normalized criterion: largest mean of token log-likelihoods.
+            mean_lls = [np.mean(ll) for ll in token_lls]
+            best_norm_texts.append(candidates[np.argmax(mean_lls)])
+        return {
+            "best_sample_texts": best_texts,
+            "best_normalized_sample_texts": best_norm_texts,
+        }
+

From 65772794ce2ad9732bf2d8cae5837061c05a70fa Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 25 Dec 2024 19:19:09 +0400
Subject: [PATCH 60/97] Add sample-based gen metrics from best samples

---
 scripts/polygraph_eval | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index d6a373497..c21a66afb 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -465,6 +465,7 @@ def get_generation_metrics(args):
             AlignScore(align_scorer),
             AlignScore(align_scorer, target_is_claims=False),
             AlignScore(align_scorer, ignore_target=True),
+            # Sample-based metrics
             RougeMetric("rougeL", sample=True),
             BLEUMetric(sample=True),
             AccuracyMetric(
@@ -476,7 +477,34 @@ def get_generation_metrics(args):
             AlignScore(align_scorer, sample=True),
             AlignScore(align_scorer, target_is_claims=False, sample=True),
             AlignScore(align_scorer, ignore_target=True, sample=True),
+            # Best sample-based metrics
+            RougeMetric("rougeL", sample=True, sample_strategy="Best"),
+            BLEUMetric(sample=True, sample_strategy="Best"),
+            AccuracyMetric(
+                target_ignore_regex = getattr(args, "target_ignore_regex", None),
+                output_ignore_regex = getattr(args, "output_ignore_regex", None),
+                normalize = getattr(args, "normalize", False),
+                sample=True,
+                sample_strategy="Best",
+            ),
+            AlignScore(align_scorer, sample=True, sample_strategy="Best"),
+            AlignScore(align_scorer, target_is_claims=False, sample=True, sample_strategy="Best"),
+            AlignScore(align_scorer, ignore_target=True, sample=True, sample_strategy="Best"),
+            # Best normalized sample-based metrics
+            RougeMetric("rougeL", sample=True, sample_strategy="BestNormalized"),
+            BLEUMetric(sample=True, sample_strategy="BestNormalized"),
+            AccuracyMetric(
+                target_ignore_regex = getattr(args, "target_ignore_regex", None),
+                output_ignore_regex = getattr(args, "output_ignore_regex", None),
+                normalize = getattr(args, "normalize", False),
+                sample=True,
+                sample_strategy="BestNormalized",
+            ),
+            AlignScore(align_scorer, sample=True, sample_strategy="BestNormalized"),
+            AlignScore(align_scorer, target_is_claims=False, sample=True, sample_strategy="BestNormalized"),
+            AlignScore(align_scorer, ignore_target=True, sample=True, sample_strategy="BestNormalized"),
         ]
+
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
             if getattr(args, "use_claim_ue", False):
                 result += [OpenAIFactCheck(cache_path=args.cache_path, language=getattr(args, "language", "en"))]

From bc991b068ecd57ecd3a4e9506f0c7c175e052deb Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 25 Dec 2024 19:29:41 +0400
Subject: [PATCH 61/97] Save new stats in manager

---
 examples/configs/polygraph_eval_coqa_sentsar.yaml       | 2 ++
 examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml  | 2 ++
 examples/configs/polygraph_eval_mmlu_sentsar.yaml       | 2 ++
 examples/configs/polygraph_eval_triviaqa_sentsar.yaml   | 2 ++
 examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml | 2 ++
 examples/configs/polygraph_eval_xsum_sentsar.yaml       | 2 ++
 9 files changed, 18 insertions(+)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index f7dcdc754..828e2327a 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -42,6 +42,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index e5fbdd9c4..ea91a213f 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -43,6 +43,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 target_ignore_regex: "(?s).*#### "
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 20437d081..743904e1b 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -44,6 +44,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 6cf4fd248..4fd78ca81 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -44,6 +44,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
index 89e058463..2404e8822 100644
--- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
@@ -42,6 +42,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 7b4e95e49..6040bd6e7 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -42,6 +42,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 62fe84c26..58e5cee10 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -42,6 +42,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
index cece970ac..33bae1849 100644
--- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
@@ -42,6 +42,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index 56e70c7cf..1a4d971c6 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -43,6 +43,8 @@ save_stats:
   - sample_token_similarity
   - sample_entropy
   - first_sample_texts
+  - best_sample_texts
+  - best_normalized_sample_texts
 entropy_top_k: 50
 
 train_dataset: null

From 744b108e32f6e9e0516a472fd39885d7e8b78ee2 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 25 Dec 2024 19:42:40 +0400
Subject: [PATCH 62/97] Small fixes

---
 src/lm_polygraph/generation_metrics/accuracy.py     | 2 +-
 src/lm_polygraph/generation_metrics/bleu.py         | 2 +-
 src/lm_polygraph/utils/register_stat_calculators.py | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/generation_metrics/accuracy.py b/src/lm_polygraph/generation_metrics/accuracy.py
index e3e4d13ed..7f0f062da 100644
--- a/src/lm_polygraph/generation_metrics/accuracy.py
+++ b/src/lm_polygraph/generation_metrics/accuracy.py
@@ -16,7 +16,7 @@ class AccuracyMetric(GenerationMetric):
     """
 
     def __init__(
-        self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False
+        self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False, sample_strategy: str = "First"
     ):
         if sample:
             super().__init__([
diff --git a/src/lm_polygraph/generation_metrics/bleu.py b/src/lm_polygraph/generation_metrics/bleu.py
index 91b4b5098..34fee322a 100644
--- a/src/lm_polygraph/generation_metrics/bleu.py
+++ b/src/lm_polygraph/generation_metrics/bleu.py
@@ -10,7 +10,7 @@ class BLEUMetric(GenerationMetric):
     Calculates BLEU metric between model-generated texts and ground truth texts.
     """
 
-    def __init__(self, sample: bool = False):
+    def __init__(self, sample: bool = False, sample_strategy: str = "First"):
         if sample:
             super().__init__([
                 "first_sample_texts",
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index 7c82caf80..c2bae472e 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -68,6 +68,7 @@ def _register(calculator_class: StatCalculator):
         _register(GreedyLMProbsCalculator())
         _register(SamplingGenerationCalculator(n_alternatives=n_ccp_alternatives))
         _register(FirstSampleCalculator())
+        _register(BestSampleCalculator())
         _register(BartScoreCalculator())
         _register(ModelScoreCalculator())
         _register(EmbeddingsCalculator())

From 65f9513d442b656fb51ab783dd6491c42f15f759 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 26 Dec 2024 11:28:36 +0400
Subject: [PATCH 63/97] Add Comet against best

---
 scripts/polygraph_eval | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index c21a66afb..c64b9f992 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -512,7 +512,9 @@ def get_generation_metrics(args):
             ignore_regex = getattr(args, "source_ignore_regex", None)
             comet_scorer = load("comet")
             result += [Comet(comet_scorer, source_ignore_regex = ignore_regex),
-                       Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True)]
+                       Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True)
+                       Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"),
+                       Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")]
     else:
         result = []
         for metric in generation_metrics:

From 2d8fa7ebaa049be62bee7ce9a0e3ab08966a5bf1 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 26 Dec 2024 11:29:53 +0400
Subject: [PATCH 64/97] Fix

---
 scripts/polygraph_eval | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index c64b9f992..0eda6666c 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -512,7 +512,7 @@ def get_generation_metrics(args):
             ignore_regex = getattr(args, "source_ignore_regex", None)
             comet_scorer = load("comet")
             result += [Comet(comet_scorer, source_ignore_regex = ignore_regex),
-                       Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True)
+                       Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True),
                        Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"),
                        Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")]
     else:

From c5509ea479ae22e3756237e4e1c45a0d751fd52b Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Sun, 29 Dec 2024 09:08:41 +0400
Subject: [PATCH 65/97] Avesimilarity

---
 src/lm_polygraph/estimators/__init__.py       |   1 +
 .../semantic_average_ue_average_similarity.py | 224 ++++++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 74ef07953..d810dd6d9 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -85,4 +85,5 @@
 from .p_true_empirical import PTrueEmpirical
 from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE
 from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE
+from .semantic_average_ue_average_similarity import SemanticAveMaxprobAveSimilarity, SemanticAvePPLAveSimilarity, SemanticAveTokenSARAveSimilarity,SemanticAveMTEAveSimilarity
 from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE
diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
new file mode 100644
index 000000000..b214554bd
--- /dev/null
+++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
@@ -0,0 +1,224 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+
+
+class SemanticAveMaxprobAveSimilarity(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False,
+    ):
+        dependencies = ["sample_sentence_similarity", "sample_log_probs"]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticAveMaxprobAveSimilarityexp"
+        return "SemanticAveMaxprobAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Scale each sample's score by 1 / (its mean similarity to the others).
+
+        The score is -log_prob, or -exp(log_prob) when ``exp`` is set. Only
+        the value of the first sample of every input is returned.
+        """
+        batch_log_probs = stats["sample_log_probs"]
+        batch_similarity = stats["sample_sentence_similarity"]
+
+        enriched_metrics = []  # one array of scaled scores per input
+        for log_probs, sim in zip(batch_log_probs, batch_similarity):
+            # Negated log-probabilities (not probabilities).
+            scores = -np.array(log_probs)
+            if self.exp:
+                scores = -np.exp(-scores)
+
+            # Mean of every row of the similarity matrix without its
+            # diagonal (self-similarity) entry.
+            # NOTE(review): with a single sample the divisor below is zero.
+            row_averages = [
+                (np.sum(sim[i]) - sim[i][i]) / (len(sim[i]) - 1)
+                for i in range(sim.shape[0])
+            ]
+
+            # A zero average is replaced by 1e-10 before dividing.
+            enriched = [
+                score * (1 / (1e-10 if avg == 0 else avg))
+                for score, avg in zip(scores, row_averages)
+            ]
+            enriched_metrics.append(np.array(enriched))
+
+        # Return only metric for the first sample for prr calculation
+        first_elements = [metrics[0] for metrics in enriched_metrics]
+        return np.array(first_elements)
+
+class SemanticAvePPLAveSimilarity(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False,
+    ):
+        dependencies = ["sample_sentence_similarity", "sample_log_likelihoods"]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticAvePPLAveSimilarityexp"
+        return "SemanticAvePPLAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Scale each sample's score by 1 / (its mean similarity to the others).
+
+        With m the mean token log-likelihood of a sample, the score is -m, or
+        -exp(m) when ``exp`` is set. Only the first sample's value is returned.
+        """
+        batch_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_similarity = stats["sample_sentence_similarity"]
+
+        enriched_ppl = []  # one array of scaled scores per input
+        for sample_lls, sim in zip(batch_log_likelihoods, batch_similarity):
+            # Negated mean token log-likelihood of every sample.
+            ppl = -np.array([np.mean(token_ll) for token_ll in sample_lls])
+            if self.exp:
+                ppl = -np.exp(-ppl)
+
+            # Mean of every row of the similarity matrix without its diagonal
+            # entry. NOTE(review): a single sample makes the divisor zero.
+            row_averages = [
+                (np.sum(sim[i]) - sim[i][i]) / (len(sim[i]) - 1)
+                for i in range(sim.shape[0])
+            ]
+
+            # A zero average is replaced by 1e-10 before dividing.
+            enriched = [
+                ppl_value * (1 / (1e-10 if avg == 0 else avg))
+                for ppl_value, avg in zip(ppl, row_averages)
+            ]
+            enriched_ppl.append(np.array(enriched))
+
+        # Return only metric for the first sample for prr calculation
+        first_elements = [metrics[0] for metrics in enriched_ppl]
+        return np.array(first_elements)
+
+class SemanticAveTokenSARAveSimilarity(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False,
+    ):
+        dependencies = [
+            "sample_sentence_similarity",
+            "sample_log_likelihoods",
+            "sample_token_similarity",
+        ]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticAveTokenSARAveSimilarityexp"
+        return "SemanticAveTokenSARAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Scale each sample's score by 1 / (its mean similarity to the others).
+
+        Per sample, token negative log-likelihoods are weighted by
+        (1 - token similarity), normalized to sum to one, and summed; with
+        ``exp`` set the resulting sum s is mapped to -exp(-s).
+
+        Parameters:
+            stats: must hold "sample_log_likelihoods",
+                "sample_token_similarity" and "sample_sentence_similarity".
+        Returns:
+            np.ndarray: the scaled score of the first sample of every input.
+        """
+        batch_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_token_similarity = stats["sample_token_similarity"]
+        batch_similarity = stats["sample_sentence_similarity"]
+
+        enriched_tokenSAR = []  # one array of scaled scores per input
+        for sample_lls, sample_token_sims, sim in zip(
+            batch_log_likelihoods, batch_token_similarity, batch_similarity
+        ):
+            tokenSAR = []
+            for token_lls, token_sim in zip(sample_lls, sample_token_sims):
+                token_lls = np.array(token_lls)
+                # Token weights: (1 - similarity), normalized to sum to one.
+                relevance = 1 - token_sim
+                weights = relevance / relevance.sum()
+                weighted = -token_lls * weights
+                tokenSAR.append(weighted.sum())
+
+            # Optional mapping of every score s to -exp(-s).
+            if self.exp:
+                tokenSAR = -np.exp(-np.array(tokenSAR))
+
+            # Mean of every row of the similarity matrix without its
+            # diagonal (self-similarity) entry.
+            # NOTE(review): with a single sample the divisor below is zero.
+            row_averages = [
+                (np.sum(sim[i]) - sim[i][i]) / (len(sim[i]) - 1)
+                for i in range(sim.shape[0])
+            ]
+
+            # A zero average is replaced by 1e-10 before dividing.
+            enriched = [
+                sar_value * (1 / (1e-10 if avg == 0 else avg))
+                for sar_value, avg in zip(tokenSAR, row_averages)
+            ]
+            enriched_tokenSAR.append(np.array(enriched))
+        # Return only metric for the first sample for prr calculation
+
+        first_elements = [metrics[0] for metrics in enriched_tokenSAR]
+        return np.array(first_elements)
+
+
+class SemanticAveMTEAveSimilarity(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        dependencies = ["sample_sentence_similarity", "sample_entropy"]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SemanticAveMTEAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Scale each sample's entropy by 1 / (its mean similarity to the others).
+
+        Only the value of the first sample of every input is returned.
+        """
+        batch_entropy = stats["sample_entropy"]
+        batch_similarity = stats["sample_sentence_similarity"]
+
+        enriched_entropy = []  # one array of scaled entropies per input
+        for entropies, sim in zip(batch_entropy, batch_similarity):
+            # Mean of every row of the similarity matrix without its
+            # diagonal (self-similarity) entry.
+            # NOTE(review): with a single sample the divisor below is zero.
+            row_averages = [
+                (np.sum(sim[i]) - sim[i][i]) / (len(sim[i]) - 1)
+                for i in range(sim.shape[0])
+            ]
+
+            # A zero average is replaced by 1e-10 before dividing.
+            enriched = [
+                entropy * (1 / (1e-10 if avg == 0 else avg))
+                for entropy, avg in zip(entropies, row_averages)
+            ]
+            enriched_entropy.append(np.array(enriched))
+        # Return only metric for the first sample for prr calculation
+        first_elements = [metrics[0] for metrics in enriched_entropy]
+        return np.array(first_elements)
+
+

From fb601db6a080d2136c2c5c0d241072f340e24011 Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Thu, 2 Jan 2025 13:56:57 +0400
Subject: [PATCH 66/97] UE metric enriched with average dissimilarity

---
 src/lm_polygraph/estimators/__init__.py       |  11 +-
 .../semantic_average_ue_average_similarity.py | 265 ++++++++++++++++++
 2 files changed, 275 insertions(+), 1 deletion(-)

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index d810dd6d9..7b1cfe0c4 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -85,5 +85,14 @@
 from .p_true_empirical import PTrueEmpirical
 from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE
 from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE
-from .semantic_average_ue_average_similarity import SemanticAveMaxprobAveSimilarity, SemanticAvePPLAveSimilarity, SemanticAveTokenSARAveSimilarity,SemanticAveMTEAveSimilarity
+from .semantic_average_ue_average_similarity import (
+    SemanticAveMaxprobAveSimilarity, 
+    SemanticAvePPLAveSimilarity, 
+    SemanticAveTokenSARAveSimilarity,
+    SemanticAveMTEAveSimilarity,
+    SemanticEnrichedPPLAveDissimilarity,
+    SemanticEnrichedTokenSARAveDissimilarity ,
+    SemanticEnrichedMaxprobAveDissimilarity,
+    SemanticEnrichedMTEAveDissimilarity,
+    AveDissimilarity)
 from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE
diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
index b214554bd..f72b6bb13 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
@@ -57,6 +57,62 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         first_elements = [metrics[0] for metrics in enriched_metrics]
         return np.array(first_elements)
 
+class SemanticEnrichedMaxprobAveDissimilarity(Estimator):  # -log p of the first sample x its mean dissimilarity to the other samples
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticEnrichedMaxprobAveDissimilarityexp"
+        else:
+            return "SemanticEnrichedMaxprobAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        enriched_metrics = []  # one array of per-sample scores for each input in the batch
+
+        for sample_log_probs, sample_sentence_similarity in zip(
+            batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            # Step 1: Negate the per-sample log-probs (despite the name, not probabilities)
+            sample_probs = -np.array(sample_log_probs)
+            if self.exp:
+                sample_probs = -np.exp(-sample_probs)  # i.e. -exp(log_prob)
+
+            # Step 2: Compute row-wise sum of dissimilarities (1 - g)
+            row_dissimilarities = []
+            for i in range(sample_sentence_similarity.shape[0]):
+                row = sample_sentence_similarity[i]
+                sum_dissimilarities = np.sum(1 - row) - (1 - row[i])  # Exclude self-similarity
+                row_dissimilarities.append(sum_dissimilarities)
+
+            # Step 3: Normalize by (M - 1); NOTE(review): M == 1 divides by zero here
+            normalized_dissimilarities = [
+                dissim / (len(sample_sentence_similarity) - 1)
+                for dissim in row_dissimilarities
+            ]
+
+            # Step 4: Multiply each sample's score by its average dissimilarity
+            enriched_sample_metrics = []
+            for prob, dissim in zip(sample_probs, normalized_dissimilarities):
+                enriched_metric = prob * dissim
+                enriched_sample_metrics.append(enriched_metric)
+
+            enriched_metrics.append(np.array(enriched_sample_metrics))
+
+        # Return only metric for the first sample for PRR calculation
+        first_elements = [metrics[0] for metrics in enriched_metrics]
+        return np.array(first_elements)
+
+
 class SemanticAvePPLAveSimilarity(Estimator):
     def __init__(
         self,
@@ -107,6 +163,59 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         first_elements = [metrics[0] for metrics in enriched_ppl]
         return np.array(first_elements)
 
+class SemanticEnrichedPPLAveDissimilarity(Estimator):  # mean token NLL of the first sample x its mean dissimilarity to the other samples
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticEnrichedPPLAveDissimilarityexp"
+        else:
+            return "SemanticEnrichedPPLAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        enriched_ppl = []  # one array of per-sample scores for each input in the batch
+
+        for sample_log_likelihoods, sample_sentence_similarity in zip(
+            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        ):
+            # Step 1: Negative mean token log-likelihood of every sample
+            ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
+            if self.exp:
+                ppl = -np.exp(-ppl)  # i.e. -exp(mean token log-likelihood)
+
+            # Step 2: Compute row-wise average dissimilarity (1 - g)
+            row_averages = []
+            for i in range(sample_sentence_similarity.shape[0]):
+                row = sample_sentence_similarity[i]
+                # Average over the other samples; NOTE(review): a single sample divides by zero
+                average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
+                row_averages.append(average_dissimilarity)
+
+            # Step 3: Enrich each PPL independently by scaling with the average dissimilarity
+            enriched_sample_ppl = []
+            for i, (ppl_value, avg_dissimilarity) in enumerate(zip(ppl, row_averages)):
+                if avg_dissimilarity == 0:
+                    avg_dissimilarity = 1e-10  # NOTE(review): no division follows; an exact 0 just becomes a tiny factor
+                enriched_value = ppl_value * avg_dissimilarity
+                enriched_sample_ppl.append(enriched_value)
+
+            enriched_ppl.append(np.array(enriched_sample_ppl))  # Collect enriched PPL values
+
+        # Return only metric for the first sample for PRR calculation
+        first_elements = [metrics[0] for metrics in enriched_ppl]
+        return np.array(first_elements)
+
+
 class SemanticAveTokenSARAveSimilarity(Estimator):
     def __init__(
         self,
@@ -181,6 +290,80 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(first_elements)
 
 
+class SemanticEnrichedTokenSARAveDissimilarity(Estimator):  # TokenSAR of the first sample x its mean dissimilarity to the other samples
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False
+    ):
+        super().__init__(
+            [
+                "sample_sentence_similarity",
+                "sample_log_likelihoods",
+                "sample_token_similarity",
+            ],
+            "sequence",
+        )
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "SemanticEnrichedTokenSARAveDissimilarityexp"
+        else:
+            return "SemanticEnrichedTokenSARAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_token_similarity = stats["sample_token_similarity"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        enriched_tokenSAR = []  # one array of per-sample scores for each input in the batch
+
+        for batch_data in zip(
+            batch_sample_log_likelihoods,
+            batch_sample_token_similarity,
+            batch_sample_sentence_similarity,
+        ):
+            sample_log_likelihoods = batch_data[0]
+            sample_token_similarity = batch_data[1]
+            sample_sentence_similarity = batch_data[2]
+
+            tokenSAR = []  # per sample: token NLLs weighted by normalised (1 - token similarity), summed
+            for log_likelihoods, token_similarity in zip(
+                sample_log_likelihoods, sample_token_similarity
+            ):
+                log_likelihoods = np.array(log_likelihoods)
+                R_t = 1 - token_similarity
+                R_t_norm = R_t / R_t.sum()
+                E_t = -log_likelihoods * R_t_norm
+                tokenSAR.append(E_t.sum())
+
+            if self.exp:
+                tokenSAR = -np.exp(-np.array(tokenSAR))
+
+            # Row-wise average dissimilarity (1 - g) to the other samples; a single sample divides by zero
+            row_averages = []
+            for i in range(sample_sentence_similarity.shape[0]):
+                row = sample_sentence_similarity[i]
+                average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
+                row_averages.append(average_dissimilarity)
+
+            # Multiply each sample's tokenSAR by its average dissimilarity
+            enriched_sample_tokenSAR = []
+            for i, (sar_value, avg_dissimilarity) in enumerate(zip(tokenSAR, row_averages)):
+                if avg_dissimilarity == 0:
+                    avg_dissimilarity = 1e-10  # NOTE(review): no division follows; an exact 0 just becomes a tiny factor
+                enriched_value = sar_value * avg_dissimilarity
+                enriched_sample_tokenSAR.append(enriched_value)
+
+            enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR))
+        # Return only metric for the first sample for PRR calculation
+
+        first_elements = [metrics[0] for metrics in enriched_tokenSAR]
+        return np.array(first_elements)
+
+
 class SemanticAveMTEAveSimilarity(Estimator):
     def __init__(
         self,
@@ -222,3 +405,85 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(first_elements)
 
 
+
+class SemanticEnrichedMTEAveDissimilarity(Estimator):  # sample_entropy of the first sample x its mean dissimilarity to the other samples
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "SemanticEnrichedMTEAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        enriched_entropy = []  # one array of per-sample scores for each input in the batch
+
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):
+            # Row-wise average dissimilarity (1 - g) to the other samples; a single sample divides by zero
+            row_averages = []
+            for i in range(sample_sentence_similarity.shape[0]):
+                row = sample_sentence_similarity[i]
+                average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
+                row_averages.append(average_dissimilarity)
+
+            # Multiply each sample's entropy value by its average dissimilarity
+            enriched_sample_entropy = []
+            for i, (entropy, avg_dissimilarity) in enumerate(zip(sample_entropy, row_averages)):
+                if avg_dissimilarity == 0:
+                    avg_dissimilarity = 1e-10  # NOTE(review): no division follows; an exact 0 just becomes a tiny factor
+                enriched_value = entropy * avg_dissimilarity
+                enriched_sample_entropy.append(enriched_value)
+
+            enriched_entropy.append(np.array(enriched_sample_entropy))
+        # Return only metric for the first sample for PRR calculation
+        first_elements = [metrics[0] for metrics in enriched_entropy]
+        return np.array(first_elements)
+
+
+
+class AveDissimilarity(Estimator):  # mean dissimilarity (1 - similarity) of the first sample to the other samples
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "AveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_entropy = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+
+        enriched_entropy = []  # despite the name: per-input arrays of average dissimilarities
+
+        for sample_entropy, sample_sentence_similarity in zip(
+            batch_sample_entropy, batch_sample_sentence_similarity
+        ):
+            # Row-wise average dissimilarity (1 - g) to the other samples; a single sample divides by zero
+            row_averages = []
+            for i in range(sample_sentence_similarity.shape[0]):
+                row = sample_sentence_similarity[i]
+                average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
+                row_averages.append(average_dissimilarity)
+
+            # The entropy values are only iterated alongside; they never enter the score
+            enriched_sample_entropy = []
+            for i, (entropy, avg_dissimilarity) in enumerate(zip(sample_entropy, row_averages)):
+                if avg_dissimilarity == 0:
+                    avg_dissimilarity = 1e-10  # NOTE(review): no division follows; an exact 0 is reported as 1e-10
+                enriched_value = avg_dissimilarity
+                enriched_sample_entropy.append(enriched_value)
+
+            enriched_entropy.append(np.array(enriched_sample_entropy))
+        # Return only metric for the first sample for PRR calculation
+        first_elements = [metrics[0] for metrics in enriched_entropy]
+        return np.array(first_elements)

From ab5f055d76c8d67ff1e610e0de94287ec77938bc Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 9 Jan 2025 14:10:32 +0400
Subject: [PATCH 67/97] Set sample selection strategy for sample-focused
 methods, add greedy-focused semantically-enriched methods

---
 src/lm_polygraph/estimators/__init__.py       |  19 +-
 src/lm_polygraph/estimators/common.py         |  19 +
 ..._semantic_average_ue_average_similarity.py | 349 ++++++++++++++++++
 .../estimators/max_probability.py             |  41 +-
 src/lm_polygraph/estimators/perplexity.py     |  31 +-
 .../estimators/semantic_average_ue.py         |  59 ++-
 .../semantic_average_ue_average_similarity.py | 162 +++++---
 .../estimators/semantic_median_ue.py          |  60 +--
 src/lm_polygraph/estimators/token_entropy.py  |  10 +-
 src/lm_polygraph/estimators/token_sar.py      |  11 +-
 src/lm_polygraph/stat_calculators/__init__.py |   1 +
 .../stat_calculators/greedy_similarity.py     |  80 ++++
 src/lm_polygraph/stat_calculators/sample.py   |   7 +-
 .../utils/register_stat_calculators.py        |   1 +
 14 files changed, 695 insertions(+), 155 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
 create mode 100644 src/lm_polygraph/stat_calculators/greedy_similarity.py

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 7b1cfe0c4..9d3009913 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -7,11 +7,11 @@
 from .claim.pointwise_mutual_information import PointwiseMutualInformationClaim
 from .max_probability import (
     MaximumSequenceProbability,
+    SampledMaximumSequenceProbability,
     MaximumTokenProbability,
-    MaxSampledMaximumSequenceProbability,
 )
 from .claim_conditioned_probability import ClaimConditionedProbability
-from .token_entropy import MeanTokenEntropy, TokenEntropy
+from .token_entropy import MeanTokenEntropy, TokenEntropy, SampledMeanTokenEntropy
 from .pointwise_mutual_information import (
     MeanPointwiseMutualInformation,
     PointwiseMutualInformation,
@@ -32,8 +32,7 @@
 from .semantic_entropy import SemanticEntropy
 from .semantic_entropy_token import SemanticEntropyToken
 from .perplexity import (
-    Perplexity, 
-    MaxSampledPerplexity,
+    Perplexity, SampledPerplexity
 )
 from .mahalanobis_distance import MahalanobisDistanceSeq
 from .relative_mahalanobis_distance import RelativeMahalanobisDistanceSeq
@@ -65,7 +64,7 @@
     PESrmi,
     PESrmiabs,
 )
-from .token_sar import TokenSAR
+from .token_sar import TokenSAR, SampledTokenSAR
 from .sentence_sar import (
     SentenceSAR,
 #    OtherSentenceSAR,
@@ -95,4 +94,14 @@
     SemanticEnrichedMaxprobAveDissimilarity,
     SemanticEnrichedMTEAveDissimilarity,
     AveDissimilarity)
+from .greedy_semantic_average_ue_average_similarity import (
+    GreedySemanticAveMaxprobAveSimilarity, 
+    GreedySemanticAvePPLAveSimilarity, 
+    GreedySemanticAveTokenSARAveSimilarity,
+    GreedySemanticAveMTEAveSimilarity,
+    GreedySemanticEnrichedPPLAveDissimilarity,
+    GreedySemanticEnrichedTokenSARAveDissimilarity ,
+    GreedySemanticEnrichedMaxprobAveDissimilarity,
+    GreedySemanticEnrichedMTEAveDissimilarity,
+)
 from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE
diff --git a/src/lm_polygraph/estimators/common.py b/src/lm_polygraph/estimators/common.py
index 0a10c414c..72e2142e7 100644
--- a/src/lm_polygraph/estimators/common.py
+++ b/src/lm_polygraph/estimators/common.py
@@ -29,3 +29,22 @@ def _compute_Jaccard_score(lst):
 
 def compute_sim_score(answers, affinity, similarity_score):
     return _compute_Jaccard_score(answers)
+
+def sample_strategy_to_prefix(sample_strategy):
+    # Estimator-name prefix for a strategy: "" for "first", CamelCase otherwise.
+    if sample_strategy == "first":
+        return ""
+    if sample_strategy not in ["best", "best_normalized"]:
+        raise ValueError(f"Unknown sample strategy: {sample_strategy}")
+    return "".join(word.capitalize() for word in sample_strategy.split("_"))
+
+def best_sample_ids(sample_strategy, stats):
+    # Per-input index of the sample to score under the given strategy.
+    n_inputs = len(stats["sample_log_probs"])
+    if sample_strategy == "first":
+        return [0] * n_inputs
+    if sample_strategy == "best":
+        return stats["best_sample_text_ids"]
+    if sample_strategy == "best_normalized":
+        return stats["best_normalized_sample_text_ids"]
+    raise ValueError(f"Unknown sample strategy: {sample_strategy}")
diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
new file mode 100644
index 000000000..5b7a8ab99
--- /dev/null
+++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
@@ -0,0 +1,349 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
+
+
+class GreedySemanticAveMaxprobAveSimilarity(Estimator):
+    """Negative greedy sequence log-likelihood divided by the mean of the
+    input's ``greedy_sentence_similarity`` values; with ``exp=True`` the
+    negative sequence probability is scaled instead."""
+
+    def __init__(self, verbose: bool = False, exp: bool = False):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "GreedySemanticAveMaxprobAveSimilarityexp"
+        else:
+            return "GreedySemanticAveMaxprobAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+        batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]])
+
+        enriched_metrics = []  # one scalar score per input in the batch
+        for greedy_ll, greedy_sentence_similarity in zip(
+            batch_lls, batch_greedy_sentence_similarity
+        ):
+            # Negative sequence log-likelihood (higher = less likely output)
+            prob = -greedy_ll
+            if self.exp:
+                prob = -np.exp(-prob)
+
+            # Mean over every entry of this input's similarity values.
+            # NOTE(review): presumably greedy-vs-sample similarities; confirm.
+            ave_similarity = np.mean(greedy_sentence_similarity)
+
+            # Scale the score by 1 / mean similarity
+            if ave_similarity == 0:
+                ave_similarity = 1e-10  # Avoid division by zero
+
+            enriched_metric = prob * (1 / ave_similarity)
+            enriched_metrics.append(enriched_metric)
+
+        return np.array(enriched_metrics)
+
+
+class GreedySemanticAveMaxprobAveDissimilarity(Estimator):
+    """Negative greedy sequence log-likelihood multiplied by the mean of
+    ``1 - greedy_sentence_similarity`` for the input; with ``exp=True`` the
+    negative sequence probability is scaled instead."""
+
+    def __init__(self, verbose: bool = False, exp: bool = False):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "GreedySemanticAveMaxprobAveDissimilarityexp"
+        else:
+            return "GreedySemanticAveMaxprobAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+        batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]])
+
+        enriched_metrics = []  # one scalar score per input in the batch
+        for greedy_ll, greedy_sentence_similarity in zip(
+            batch_lls, batch_greedy_sentence_similarity
+        ):
+            # Negative sequence log-likelihood (higher = less likely output)
+            prob = -greedy_ll
+            if self.exp:
+                prob = -np.exp(-prob)
+
+            # Mean of (1 - similarity) over every entry of this input's values.
+            # NOTE(review): presumably greedy-vs-sample similarities; confirm.
+            ave_dissimilarity = np.mean(1 - greedy_sentence_similarity)
+
+            enriched_metric = prob * ave_dissimilarity
+            enriched_metrics.append(enriched_metric)
+
+        return np.array(enriched_metrics)
+
+
+class GreedySemanticAvePPLAveSimilarity(Estimator):  # mean token NLL of the greedy output / mean greedy_sentence_similarity
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "GreedySemanticAvePPLAveSimilarityexp"
+        else:
+            return "GreedySemanticAvePPLAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_ppl = []  # one scalar score per input in the batch
+
+        for greedy_log_likelihoods, greedy_sentence_similarity in zip(
+            batch_greedy_log_likelihoods, batch_greedy_sentence_similarity
+        ):
+            # Negative mean token log-likelihood of the greedy output
+            ppl = -np.mean(greedy_log_likelihoods)
+            if self.exp:
+                ppl = -np.exp(-ppl)  # i.e. -exp(mean token log-likelihood)
+
+            # Plain mean over every entry of this input's similarity values (nothing is excluded)
+            avg_similarity = np.mean(greedy_sentence_similarity)
+
+            # Scale the score by 1 / mean similarity
+            if avg_similarity == 0:
+                avg_similarity = 1e-10  # Avoid division by zero
+
+            enriched_value = ppl * (1 / avg_similarity)
+            enriched_ppl.append(enriched_value)
+
+        return np.array(enriched_ppl)
+
+
+class GreedySemanticAvePPLAveDissimilarity(Estimator):  # mean token NLL of the greedy output x mean (1 - greedy_sentence_similarity)
+    def __init__(
+        self,
+        verbose: bool = False,
+        exp: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "GreedySemanticAvePPLAveDissimilarityexp"
+        else:
+            return "GreedySemanticAvePPLAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_ppl = []  # one scalar score per input in the batch
+
+        for greedy_log_likelihoods, greedy_sentence_similarity in zip(
+            batch_greedy_log_likelihoods, batch_greedy_sentence_similarity
+        ):
+            # Negative mean token log-likelihood of the greedy output
+            ppl = -np.mean(greedy_log_likelihoods)
+            if self.exp:
+                ppl = -np.exp(-ppl)  # i.e. -exp(mean token log-likelihood)
+
+            # Plain mean of (1 - similarity) over every entry of this input's values (nothing is excluded)
+            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
+
+            enriched_value = ppl *  avg_dissimilarity
+            enriched_ppl.append(enriched_value)
+
+        return np.array(enriched_ppl)
+
+
+class GreedySemanticAveTokenSARAveSimilarity(Estimator):
+    """TokenSAR of the greedy output divided by the mean of the input's
+    ``greedy_sentence_similarity`` values (``exp=True`` scales
+    ``-exp(-TokenSAR)`` instead)."""
+
+    def __init__(self, verbose: bool = False, exp: bool = False):
+        super().__init__(
+            [
+                "greedy_sentence_similarity",
+                "greedy_log_likelihoods",
+                "token_similarity",  # read in __call__, so it must be requested
+            ],
+            "sequence",
+        )
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "GreedySemanticAveTokenSARAveSimilarityexp"
+        else:
+            return "GreedySemanticAveTokenSARAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
+        batch_greedy_token_similarity = stats["token_similarity"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_tokenSAR = []
+
+        for log_likelihoods, token_similarity, greedy_sentence_similarity in zip(
+            batch_greedy_log_likelihoods,
+            batch_greedy_token_similarity,
+            batch_greedy_sentence_similarity,
+        ):
+            # Weight each token's negative log-likelihood by its normalised
+            # relevance (1 - token similarity) and sum over the sequence.
+            log_likelihoods = np.array(log_likelihoods)
+            R_t = 1 - token_similarity
+            R_t_norm = R_t / R_t.sum()
+            E_t = -log_likelihoods * R_t_norm
+            # A fresh scalar per input; nothing is accumulated across inputs.
+            tokenSAR = E_t.sum()
+
+            if self.exp:
+                tokenSAR = -np.exp(-tokenSAR)
+
+            # Mean over every entry of this input's similarity values.
+            avg_similarity = np.mean(greedy_sentence_similarity)
+
+            # Scale the score by 1 / mean similarity
+            if avg_similarity == 0:
+                avg_similarity = 1e-10  # Avoid division by zero
+
+            enriched_value = tokenSAR * (1 / avg_similarity)
+            enriched_tokenSAR.append(enriched_value)
+
+        return np.array(enriched_tokenSAR)
+
+
+class GreedySemanticAveTokenSARAveDissimilarity(Estimator):
+    """TokenSAR of the greedy output multiplied by the mean of
+    ``1 - greedy_sentence_similarity`` for the input (``exp=True`` scales
+    ``-exp(-TokenSAR)`` instead)."""
+
+    def __init__(self, verbose: bool = False, exp: bool = False):
+        super().__init__(
+            [
+                "greedy_sentence_similarity",
+                "greedy_log_likelihoods",
+                "token_similarity",  # read in __call__, so it must be requested
+            ],
+            "sequence",
+        )
+        self.verbose = verbose
+        self.exp = exp
+
+    def __str__(self):
+        if self.exp:
+            return "GreedySemanticAveTokenSARAveDissimilarityexp"
+        else:
+            return "GreedySemanticAveTokenSARAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
+        batch_greedy_token_similarity = stats["token_similarity"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_tokenSAR = []
+
+        for log_likelihoods, token_similarity, greedy_sentence_similarity in zip(
+            batch_greedy_log_likelihoods,
+            batch_greedy_token_similarity,
+            batch_greedy_sentence_similarity,
+        ):
+            # Weight each token's negative log-likelihood by its normalised
+            # relevance (1 - token similarity) and sum over the sequence.
+            log_likelihoods = np.array(log_likelihoods)
+            R_t = 1 - token_similarity
+            R_t_norm = R_t / R_t.sum()
+            E_t = -log_likelihoods * R_t_norm
+            # A fresh scalar per input; nothing is accumulated across inputs.
+            tokenSAR = E_t.sum()
+
+            if self.exp:
+                tokenSAR = -np.exp(-tokenSAR)
+
+            # Mean of (1 - similarity) over every entry of this input's values.
+            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
+
+            enriched_value = tokenSAR * avg_dissimilarity
+            enriched_tokenSAR.append(enriched_value)
+
+        return np.array(enriched_tokenSAR)
+
+
+class GreedySemanticAveMTEAveSimilarity(Estimator):
+    """Mean token entropy of the greedy output divided by the mean of the
+    input's ``greedy_sentence_similarity`` values."""
+
+    def __init__(self, verbose: bool = False):
+        super().__init__(["greedy_sentence_similarity", "entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySemanticAveMTEAveSimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_entropy = stats["entropy"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_entropy = []
+
+        for greedy_entropy, greedy_sentence_similarity in zip(
+            batch_greedy_entropy, batch_greedy_sentence_similarity
+        ):
+            # Mean over every entry of this input's similarity values.
+            avg_similarity = np.mean(greedy_sentence_similarity)
+            if avg_similarity == 0:
+                avg_similarity = 1e-10  # Avoid division by zero
+
+            # NOTE(review): stats["entropy"] presumably holds per-token values; taking
+            # the mean gives one scalar per input (a no-op if it is already scalar).
+            enriched_value = np.mean(greedy_entropy) * (1 / avg_similarity)
+            enriched_entropy.append(enriched_value)
+
+        return np.array(enriched_entropy)
+
+
+class GreedySemanticEnrichedMTEAveDissimilarity(Estimator):
+    """Mean token entropy of the greedy output multiplied by the mean of
+    ``1 - greedy_sentence_similarity`` for the input."""
+
+    def __init__(self, verbose: bool = False):
+        super().__init__(["greedy_sentence_similarity", "entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySemanticEnrichedMTEAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_entropy = stats["entropy"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_entropy = []
+
+        for greedy_entropy, greedy_sentence_similarity in zip(
+            batch_greedy_entropy, batch_greedy_sentence_similarity
+        ):
+            # Mean of (1 - similarity) over every entry of this input's values.
+            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
+            # NOTE(review): "entropy" presumably per-token; the mean gives one scalar per input.
+            enriched_value = np.mean(greedy_entropy) * avg_dissimilarity
+            enriched_entropy.append(enriched_value)
+
+        return np.array(enriched_entropy)
diff --git a/src/lm_polygraph/estimators/max_probability.py b/src/lm_polygraph/estimators/max_probability.py
index cbfc8ed32..406021cd6 100644
--- a/src/lm_polygraph/estimators/max_probability.py
+++ b/src/lm_polygraph/estimators/max_probability.py
@@ -3,6 +3,7 @@
 from typing import Dict
 
 from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
 
 
 class MaximumSequenceProbability(Estimator):
@@ -41,41 +42,13 @@ class SampledMaximumSequenceProbability(Estimator):
     Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel).
     """
 
-    def __init__(self):
+    def __init__(self, sample_strategy: str = "first"):
         super().__init__(["sample_log_probs"], "sequence")
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SampledMaximumSequenceProbability"
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        """
-        Estimates the minus log-probability of each sample in input statistics.
-
-        Parameters:
-            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
-                * log p(y_i | y_<i, x) in 'greedy_log_likelihoods'
-        Returns:
-            np.ndarray: minus log probabilities for each sample.
-                Higher values indicate more uncertain samples.
-        """
-        mp = [lp[0] for lp in stats["sample_log_probs"]]
+        return sample_strategy_to_prefix(self.sample_strategy) + "SampledMaximumSequenceProbability"
 
-        return -np.array(mp)
-    
-
-class MaxSampledMaximumSequenceProbability(Estimator):
-    """
-    Estimates the sequence-level uncertainty of a language model by calculating the
-    log-probability of the generation with minus sign.
-    It is calculated as the sum of log-probabilities in each token.
-    Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel).
-    """
-
-    def __init__(self):
-        super().__init__(["sample_log_probs"], "sequence")
-
-    def __str__(self):
-        return "MaxSampledMaximumSequenceProbability"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -88,7 +61,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             np.ndarray: minus log probabilities for each sample.
                 Higher values indicate more uncertain samples.
         """
-        mp = [max(lp) for lp in stats["sample_log_probs"]]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
+
+        mp = []
+        for best_id, sample_log_probs in zip(sample_ids, stats["sample_log_probs"]):
+            mp.append(sample_log_probs[best_id])
 
         return -np.array(mp)
 
diff --git a/src/lm_polygraph/estimators/perplexity.py b/src/lm_polygraph/estimators/perplexity.py
index 44fd9350f..4581b7c39 100644
--- a/src/lm_polygraph/estimators/perplexity.py
+++ b/src/lm_polygraph/estimators/perplexity.py
@@ -3,6 +3,7 @@
 from typing import Dict
 
 from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
 
 
 class Perplexity(Estimator):
@@ -17,33 +18,19 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array([-np.mean(ll) for ll in log_likelihoods])
 
 class SampledPerplexity(Estimator):
-    def __init__(self):
+    def __init__(self, sample_strategy: str = "first"):
         super().__init__(["sample_log_likelihoods"], "sequence")
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SampledPerplexity"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SampledPerplexity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         log_likelihoods = stats["sample_log_likelihoods"]
-        ppl = [np.mean(sample_log_likelihoods[0]) for sample_log_likelihoods in log_likelihoods]
-        return -np.array(ppl)
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
-class MaxSampledPerplexity(Estimator):
-    def init(self):
-        super().init(["sample_log_likelihoods"], "sequence")
+        ppl = []
+        for best_id, sample_log_likelihoods in zip(sample_ids, log_likelihoods):
+            ppl.append(np.mean(sample_log_likelihoods[best_id]))
 
-    def str(self):
-        return "MaxSampledPerplexity"
-
-    def call(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        log_likelihoods = stats["sample_log_likelihoods"]
-        
-        ppl_per_sample = [
-            [-np.mean(sequence) for sequence in sample_log_likelihoods]
-            for sample_log_likelihoods in log_likelihoods
-        ]
-
-        # Find the maximum perplexity for each set of samples
-        max_ppl = [max(ppl_sample) for ppl_sample in ppl_per_sample]
-
-        return -np.array(max_ppl)
\ No newline at end of file
+        return -np.array(ppl)
diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py
index fb361d44d..d58489d99 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue.py
@@ -4,36 +4,42 @@
 from copy import deepcopy
 
 from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
 
 
 class SemanticAveMaxprob(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticAveMaxprobexp"
+            base = "SemanticAveMaxprobexp"
         else:
-            return "SemanticAveMaxprob"
+            base = "SemanticAveMaxprob"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_probs = stats["sample_log_probs"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
-        for sample_log_probs, sample_sentence_similarity in zip(
-            batch_sample_log_probs, batch_sample_sentence_similarity
+        for best_id, sample_log_probs, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity
         ):
             sample_probs = -np.array(sample_log_probs)
             if self.exp:
                 sample_probs = -np.exp(-sample_probs)
-            weights = sample_sentence_similarity[0, :]
+
+            weights = sample_sentence_similarity[best_id, :]
             ave.append(np.average(sample_probs, weights=weights))
 
         return np.array(ave)
@@ -42,32 +48,36 @@ class SemanticAvePPL(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticAvePPLexp"
+            base = "SemanticAvePPLexp"
         else:
-            return "SemanticAvePPL"
+            base = "SemanticAvePPL"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
-        for sample_log_likelihoods, sample_sentence_similarity in zip(
-            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        for best_id, sample_log_likelihoods, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
             ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
             if self.exp:
                 ppl = -np.exp(-ppl)
 
-            weights = sample_sentence_similarity[0, :]
+            weights = sample_sentence_similarity[best_id, :]
 
             ave.append(np.average(ppl, weights=weights))
 
@@ -77,7 +87,8 @@ class SemanticAveTokenSAR(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(
             [
@@ -89,27 +100,32 @@ def __init__(
         )
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticAveTokenSARexp"
+            base = "SemanticAveTokenSARexp"
         else:
-            return "SemanticAveTokenSAR"
+            base = "SemanticAveTokenSAR"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_token_similarity = stats["sample_token_similarity"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
         for batch_data in zip(
             batch_sample_log_likelihoods,
             batch_sample_token_similarity,
             batch_sample_sentence_similarity,
+            sample_ids,
         ):
             sample_log_likelihoods = batch_data[0]
             sample_token_similarity = batch_data[1]
             sample_sentence_similarity = batch_data[2]
+            best_id = batch_data[3]
 
             tokenSAR = []
             for log_likelihoods, token_similarity in zip(
@@ -124,7 +140,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             if self.exp:
                 tokenSAR = -np.exp(-np.array(tokenSAR))
 
-            weights = sample_sentence_similarity[0, :]
+            weights = sample_sentence_similarity[best_id, :]
 
             ave.append(np.average(tokenSAR, weights=weights))
 
@@ -134,22 +150,25 @@ class SemanticAveMTE(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
         self.verbose = verbose
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SemanticAveMTE"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticAveMTE"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_entropy = stats["sample_entropy"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
-        for sample_entropy, sample_sentence_similarity in zip(
-            batch_sample_entropy, batch_sample_sentence_similarity
+        for best_id, sample_entropy, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_entropy, batch_sample_sentence_similarity
         ):
-            weights = sample_sentence_similarity[0, :]
+            weights = sample_sentence_similarity[best_id, :]
             ave.append(np.average(sample_entropy, weights=weights))
 
         return np.array(ave)
diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
index f72b6bb13..2a0f0b617 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
@@ -4,27 +4,32 @@
 from copy import deepcopy
 
 from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
 
 
 class SemanticAveMaxprobAveSimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticAveMaxprobAveSimilarityexp"
+            base = "SemanticAveMaxprobAveSimilarityexp"
         else:
-            return "SemanticAveMaxprobAveSimilarity"
+            base = "SemanticAveMaxprobAveSimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_probs = stats["sample_log_probs"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_metrics = []  # To store enriched metrics for each sample
 
@@ -53,29 +58,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 enriched_sample_metrics.append(enriched_metric)
 
             enriched_metrics.append(np.array(enriched_sample_metrics))
-        # Return only metric for the first sample for prr calculation
-        first_elements = [metrics[0] for metrics in enriched_metrics]
-        return np.array(first_elements)
+
+        # Return only the metric of the selected (best_id) sample, used for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_metrics):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 class SemanticEnrichedMaxprobAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticEnrichedMaxprobAveDissimilarityexp"
+            base = "SemanticEnrichedMaxprobAveDissimilarityexp"
         else:
-            return "SemanticEnrichedMaxprobAveDissimilarity"
+            base = "SemanticEnrichedMaxprobAveDissimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_probs = stats["sample_log_probs"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_metrics = []  # To store enriched metrics for each sample
 
@@ -108,30 +121,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             enriched_metrics.append(np.array(enriched_sample_metrics))
 
-        # Return only metric for the first sample for PRR calculation
-        first_elements = [metrics[0] for metrics in enriched_metrics]
-        return np.array(first_elements)
+        # Return only metric for the best sample for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_metrics):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 
 class SemanticAvePPLAveSimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticAvePPLAveSimilarityexp"
+            base = "SemanticAvePPLAveSimilarityexp"
         else:
-            return "SemanticAvePPLAveSimilarity"
+            base = "SemanticAvePPLAveSimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_ppl = []  # To store enriched PPL for each sample
 
@@ -159,29 +179,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 enriched_sample_ppl.append(enriched_value)
 
             enriched_ppl.append(np.array(enriched_sample_ppl))  # Collect enriched PPL values
-        # Return only metric for the first sample for prr calculation
-        first_elements = [metrics[0] for metrics in enriched_ppl]
-        return np.array(first_elements)
+
+        # Return only the metric of the selected (best_id) sample, used for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_ppl):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 class SemanticEnrichedPPLAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,  
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticEnrichedPPLAveDissimilarityexp"
+            base = "SemanticEnrichedPPLAveDissimilarityexp"
         else:
-            return "SemanticEnrichedPPLAveDissimilarity"
+            base = "SemanticEnrichedPPLAveDissimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_ppl = []  # To store enriched PPL for each sample
 
@@ -211,16 +239,20 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             enriched_ppl.append(np.array(enriched_sample_ppl))  # Collect enriched PPL values
 
-        # Return only metric for the first sample for PRR calculation
-        first_elements = [metrics[0] for metrics in enriched_ppl]
-        return np.array(first_elements)
+        # Return only metric for the best sample for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_ppl):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 
 class SemanticAveTokenSARAveSimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(
             [
@@ -232,17 +264,20 @@ def __init__(
         )
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticAveTokenSARAveSimilarityexp"
+            base = "SemanticAveTokenSARAveSimilarityexp"
         else:
-            return "SemanticAveTokenSARAveSimilarity"
+            base = "SemanticAveTokenSARAveSimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_token_similarity = stats["sample_token_similarity"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_tokenSAR = []
 
@@ -284,17 +319,21 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 enriched_sample_tokenSAR.append(enriched_value)
 
             enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR))
-        # Return only metric for the first sample for prr calculation
 
-        first_elements = [metrics[0] for metrics in enriched_tokenSAR]
-        return np.array(first_elements)
+        # Return only the metric of the selected (best_id) sample, used for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_tokenSAR):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 
 class SemanticEnrichedTokenSARAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(
             [
@@ -306,17 +345,20 @@ def __init__(
         )
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticEnrichedTokenSARAveDissimilarityexp"
+            base = "SemanticEnrichedTokenSARAveDissimilarityexp"
         else:
-            return "SemanticEnrichedTokenSARAveDissimilarity"
+            base = "SemanticEnrichedTokenSARAveDissimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_token_similarity = stats["sample_token_similarity"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_tokenSAR = []
 
@@ -358,26 +400,32 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 enriched_sample_tokenSAR.append(enriched_value)
 
             enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR))
-        # Return only metric for the first sample for PRR calculation
 
-        first_elements = [metrics[0] for metrics in enriched_tokenSAR]
-        return np.array(first_elements)
+        # Return only metric for the best sample for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_tokenSAR):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 
 class SemanticAveMTEAveSimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
         self.verbose = verbose
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SemanticAveMTEAveSimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticAveMTEAveSimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_entropy = stats["sample_entropy"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_entropy = []
 
@@ -400,9 +448,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 enriched_sample_entropy.append(enriched_value)
 
             enriched_entropy.append(np.array(enriched_sample_entropy))
-        # Return only metric for the first sample for prr calculation
-        first_elements = [metrics[0] for metrics in enriched_entropy]
-        return np.array(first_elements)
+
+        # Return only the metric of the selected (best_id) sample, used for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_entropy):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 
 
@@ -410,16 +462,19 @@ class SemanticEnrichedMTEAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
         self.verbose = verbose
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SemanticEnrichedMTEAveDissimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTEAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_entropy = stats["sample_entropy"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_entropy = []
 
@@ -442,9 +497,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 enriched_sample_entropy.append(enriched_value)
 
             enriched_entropy.append(np.array(enriched_sample_entropy))
-        # Return only metric for the first sample for PRR calculation
-        first_elements = [metrics[0] for metrics in enriched_entropy]
-        return np.array(first_elements)
+
+        # Return only metric for the best sample for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_entropy):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
 
 
 
@@ -452,16 +511,19 @@ class AveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
         self.verbose = verbose
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "AveDissimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + "AveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_entropy = stats["sample_entropy"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         enriched_entropy = []
 
@@ -484,6 +546,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 enriched_sample_entropy.append(enriched_value)
 
             enriched_entropy.append(np.array(enriched_sample_entropy))
-        # Return only metric for the first sample for PRR calculation
-        first_elements = [metrics[0] for metrics in enriched_entropy]
-        return np.array(first_elements)
+
+        # Return only metric for the best sample for PRR calculation
+        best_elements = []
+        for best_id, metrics in zip(sample_ids, enriched_entropy):
+            best_elements.append(metrics[best_id])
+
+        return np.array(best_elements)
diff --git a/src/lm_polygraph/estimators/semantic_median_ue.py b/src/lm_polygraph/estimators/semantic_median_ue.py
index 11b9beaac..0ec7d3274 100644
--- a/src/lm_polygraph/estimators/semantic_median_ue.py
+++ b/src/lm_polygraph/estimators/semantic_median_ue.py
@@ -4,6 +4,7 @@
 from copy import deepcopy
 
 from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
 
 from wquantiles import median
 
@@ -12,30 +13,34 @@ class SemanticMedianMaxprob(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticMedianMaxprobexp"
+            base = "SemanticMedianMaxprobexp"
         else:
-            return "SemanticMedianMaxprob"
+            base = "SemanticMedianMaxprob"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_probs = stats["sample_log_probs"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
-        for sample_log_probs, sample_sentence_similarity in zip(
-            batch_sample_log_probs, batch_sample_sentence_similarity
+        for best_id, sample_log_probs, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity
         ):
             sample_probs = -np.array(sample_log_probs)
             if self.exp:
                 sample_probs = -np.exp(-sample_probs)
-            weights = sample_sentence_similarity[0, :]
+            weights = sample_sentence_similarity[best_id, :]
             ave.append(median(sample_probs, weights))
 
         return np.array(ave)
@@ -44,32 +49,36 @@ class SemanticMedianPPL(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticMedianPPLexp"
+            base = "SemanticMedianPPLexp"
         else:
-            return "SemanticMedianPPL"
+            base = "SemanticMedianPPL"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
-        for sample_log_likelihoods, sample_sentence_similarity in zip(
-            batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        for best_id, sample_log_likelihoods, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
             ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
 
             if self.exp:
                 ppl = -np.exp(-ppl)
 
-            weights = sample_sentence_similarity[0, :]
+            weights = sample_sentence_similarity[best_id, :]
 
             ave.append(median(ppl, weights))
 
@@ -79,7 +88,8 @@ class SemanticMedianTokenSAR(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False
+        exp: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(
             [
@@ -91,27 +101,32 @@ def __init__(
         )
         self.verbose = verbose
         self.exp = exp
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
         if self.exp:
-            return "SemanticMedianTokenSARexp"
+            base = "SemanticMedianTokenSARexp"
         else:
-            return "SemanticMedianTokenSAR"
+            base = "SemanticMedianTokenSAR"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_token_similarity = stats["sample_token_similarity"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
         for batch_data in zip(
             batch_sample_log_likelihoods,
             batch_sample_token_similarity,
             batch_sample_sentence_similarity,
+            sample_ids,
         ):
             sample_log_likelihoods = batch_data[0]
             sample_token_similarity = batch_data[1]
             sample_sentence_similarity = batch_data[2]
+            best_id = batch_data[3]
 
             tokenSAR = []
             for log_likelihoods, token_similarity in zip(
@@ -122,11 +137,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
-            
+
             if self.exp:
                 tokenSAR = -np.exp(-np.array(tokenSAR))
 
-            weights = sample_sentence_similarity[0, :]
+            weights = sample_sentence_similarity[best_id, :]
 
             ave.append(median(np.array(tokenSAR), weights))
 
@@ -136,22 +151,25 @@ class SemanticMedianMTE(Estimator):
     def __init__(
         self,
         verbose: bool = False,
+        sample_strategy: str = "first"
     ):
         super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
         self.verbose = verbose
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SemanticMedianMTE"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticMedianMTE"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_entropy = stats["sample_entropy"]
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         ave = []
-        for sample_entropy, sample_sentence_similarity in zip(
-            batch_sample_entropy, batch_sample_sentence_similarity
+        for best_id, sample_entropy, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_entropy, batch_sample_sentence_similarity
         ):
-            weights = sample_sentence_similarity[0, :]
+            weights = sample_sentence_similarity[best_id, :]
             ave.append(median(np.array(sample_entropy), weights))
 
         return np.array(ave)
diff --git a/src/lm_polygraph/estimators/token_entropy.py b/src/lm_polygraph/estimators/token_entropy.py
index 9e1d080dd..059934c12 100644
--- a/src/lm_polygraph/estimators/token_entropy.py
+++ b/src/lm_polygraph/estimators/token_entropy.py
@@ -3,6 +3,7 @@
 from typing import Dict
 
 from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
 
 
 class MeanTokenEntropy(Estimator):
@@ -40,11 +41,12 @@ class SampledMeanTokenEntropy(Estimator):
     Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel).
     """
 
-    def __init__(self):
+    def __init__(self, sample_strategy: str = "first"):
         super().__init__(["sample_entropy"], "sequence")
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SampledMeanTokenEntropy"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SampledMeanTokenEntropy"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -58,7 +60,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 Higher values indicate more uncertain samples.
         """
         entropy = stats["sample_entropy"]
-        return np.array([e[0] for e in entropy])
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
+
+        return np.array([e[best_id] for e, best_id in zip(entropy, sample_ids)])
 
 
 class TokenEntropy(Estimator):
diff --git a/src/lm_polygraph/estimators/token_sar.py b/src/lm_polygraph/estimators/token_sar.py
index 1a3e715c6..c2695970d 100644
--- a/src/lm_polygraph/estimators/token_sar.py
+++ b/src/lm_polygraph/estimators/token_sar.py
@@ -3,6 +3,7 @@
 from typing import Dict
 
 from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
 
 
 class TokenSAR(Estimator):
@@ -58,12 +59,13 @@ class SampledTokenSAR(Estimator):
     This method calculates the weighted sum of log_likelihoods with weights computed using token relevance.
     """
 
-    def __init__(self, verbose: bool = False):
+    def __init__(self, verbose: bool = False, sample_strategy: str = "first"):
         super().__init__(["sample_token_similarity", "sample_log_likelihoods"], "sequence")
         self.verbose = verbose
+        self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return "SampledTokenSAR"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SampledTokenSAR"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
@@ -79,14 +81,17 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         """
         batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
         batch_sample_token_similarity = stats["sample_token_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
 
         result = []
         for batch_data in zip(
             batch_sample_log_likelihoods,
             batch_sample_token_similarity,
+            sample_ids,
         ):
             sample_log_likelihoods = batch_data[0]
             sample_token_similarity = batch_data[1]
+            best_id = batch_data[2]
 
             tokenSAR = []
             for log_likelihoods, token_similarity in zip(
@@ -97,6 +102,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
-            result.append(tokenSAR[0])
+            result.append(tokenSAR[best_id])
 
         return np.array(result)
diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index 1ba3b4057..f8c3fa554 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -24,3 +24,4 @@
 from .cross_encoder_similarity import CrossEncoderSimilarityMatrixCalculator
 from .extract_claims import ClaimsExtractor
 from .semantic_classes import SemanticClassesCalculator
+from .greedy_similarity import GreedySimilarityCalculator
diff --git a/src/lm_polygraph/stat_calculators/greedy_similarity.py b/src/lm_polygraph/stat_calculators/greedy_similarity.py
new file mode 100644
index 000000000..04eae17bf
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/greedy_similarity.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+import itertools
+from typing import Dict, List
+from tqdm import tqdm
+
+from .stat_calculator import StatCalculator
+from sentence_transformers import CrossEncoder
+from lm_polygraph.utils.model import WhiteboxModel
+
+
+class GreedySimilarityCalculator(StatCalculator):
+    """
+    Calculates the cross-encoder similarity between greedy sequence and sampled sequences.
+    """
+
+    def __init__(self, nli_model):
+        super().__init__(
+            [
+                "greedy_sentence_similarity",
+            ],
+            ["input_texts", "sample_tokens", "sample_texts", "greedy_tokens", "greedy_texts"],
+        )
+
+        self.crossencoder_setup = False
+        self.nli_model = nli_model
+
+    def _setup(self, device="cuda"):
+        self.crossencoder = CrossEncoder(
+            "cross-encoder/stsb-roberta-large", device=device
+        )
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        device = model.device()
+        tokenizer = model.tokenizer
+
+        if not self.crossencoder_setup:
+            self._setup(device=device)
+            self.crossencoder_setup = True
+
+        batch_sample_tokens = dependencies["sample_tokens"]
+        batch_texts = dependencies["sample_texts"]
+        deberta_batch_size = (
+            self.nli_model.batch_size
+        )
+        batch_input_texts = dependencies["input_texts"]
+        batch_greedy_tokens = dependencies["greedy_tokens"]
+        batch_greedy_texts = dependencies["greedy_texts"]
+
+        special_tokens = list(model.tokenizer.added_tokens_decoder.keys())
+
+        batch_pairs = []
+        batch_invs = []
+        batch_counts = []
+        for texts, greedy_text in zip(batch_texts, batch_greedy_texts):
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unique outputs
+            unique_texts, inv = np.unique(texts, return_inverse=True)
+            batch_pairs.append(list(itertools.product([greedy_text], unique_texts)))
+            batch_invs.append(inv)
+
+        sim_arrays = []
+        for i, pairs in tqdm(enumerate(batch_pairs)):
+            sim_scores = self.crossencoder.predict(pairs, batch_size=deberta_batch_size)
+
+            inv = batch_invs[i]
+
+            sim_arrays.append(sim_scores[inv])
+
+        sim_arrays = np.stack(sim_arrays)
+
+        return {
+            "greedy_sentence_similarity": sim_arrays,
+        }
diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index b6d6b1553..4217000c6 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -254,18 +254,23 @@ def __call__(
         max_new_tokens: int = 100,
     ) -> Dict[str, np.ndarray]:
         best_sample_texts = []
+        best_sample_text_ids = []
         best_normalized_sample_texts = []
+        best_normalized_sample_text_ids = []
 
         for batch_i, (sample_texts, sample_log_probs, sample_log_likelihoods) in enumerate(zip(dependencies["sample_texts"], dependencies["sample_log_probs"], dependencies["sample_log_likelihoods"])):
             best_i = np.argmax(sample_log_probs)
             best_sample_texts.append(sample_texts[best_i])
+            best_sample_text_ids.append(best_i)
 
             ppls = [np.mean(ll) for ll in sample_log_likelihoods]
             best_ppl_i = np.argmax(ppls)
             best_normalized_sample_texts.append(sample_texts[best_ppl_i])
+            best_normalized_sample_text_ids.append(best_ppl_i)
 
         return {
             "best_sample_texts": best_sample_texts,
+            "best_sample_text_ids": best_sample_text_ids,
             "best_normalized_sample_texts": best_normalized_sample_texts,
+            "best_normalized_sample_text_ids": best_normalized_sample_text_ids,
         }
-
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index c2bae472e..926c6b164 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -74,6 +74,7 @@ def _register(calculator_class: StatCalculator):
         _register(EmbeddingsCalculator())
         _register(EnsembleTokenLevelDataCalculator())
         _register(CrossEncoderSimilarityMatrixCalculator(nli_model=nli_model))
+        _register(GreedySimilarityCalculator(nli_model=nli_model))
         _register(GreedyAlternativesNLICalculator(nli_model=nli_model))
         _register(SampleAlternativesNLICalculator(nli_model=nli_model))
         _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model))

From f973da89bfce1e6bdc918b5377a839d6f7a4144c Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 9 Jan 2025 22:12:28 +0400
Subject: [PATCH 68/97] Fix class names

---
 src/lm_polygraph/estimators/__init__.py                     | 3 ++-
 .../greedy_semantic_average_ue_average_similarity.py        | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 9d3009913..357816c71 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -93,7 +93,8 @@
     SemanticEnrichedTokenSARAveDissimilarity ,
     SemanticEnrichedMaxprobAveDissimilarity,
     SemanticEnrichedMTEAveDissimilarity,
-    AveDissimilarity)
+    AveDissimilarity
+)
 from .greedy_semantic_average_ue_average_similarity import (
     GreedySemanticAveMaxprobAveSimilarity, 
     GreedySemanticAvePPLAveSimilarity, 
diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
index 5b7a8ab99..7b23c52de 100644
--- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
@@ -50,7 +50,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(enriched_metrics)
 
 
-class GreedySemanticAveMaxprobAveDissimilarity(Estimator):
+class GreedySemanticEnrichedMaxprobAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -132,7 +132,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(enriched_ppl)
 
 
-class GreedySemanticAvePPLAveDissimilarity(Estimator):
+class GreedySemanticEnrichedPPLAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -231,7 +231,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(enriched_tokenSAR)
 
 
-class GreedySemanticAveTokenSARAveDissimilarity(Estimator):
+class GreedySemanticEnrichedTokenSARAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,

From 5e7df32486d9a174747dabbe8702280b2d64bee4 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 10 Jan 2025 15:15:38 +0400
Subject: [PATCH 69/97] Fix naming

---
 ..._semantic_average_ue_average_similarity.py | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
index 7b23c52de..9cf501ffe 100644
--- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
@@ -38,11 +38,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             # Compute row-wise average similarity, excluding self-similarity
             # Diagonal contains self-similarities
-            ave_similarity = np.mean(greedy_sentence_similarity)
+            avg_similarity = np.mean(greedy_sentence_similarity)
 
             # Enrich each metric by scaling it by 1/row_average
-            if ave_similarity == 0:
-                ave_similarity = 1e-10  # Avoid division by zero
+            if avg_similarity == 0:
+                avg_similarity = 1e-10  # Avoid division by zero
 
             enriched_metric = prob * (1 / avg_similarity)
             enriched_metrics.append(enriched_metric)
@@ -62,9 +62,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            return "GreedySemanticAveMaxprobAveDissimilarityexp"
+            return "GreedySemanticEnrichedMaxprobAveDissimilarityexp"
         else:
-            return "GreedySemanticAveMaxprobAveDissimilarity"
+            return "GreedySemanticEnrichedMaxprobAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
@@ -81,7 +81,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             # Compute row-wise average similarity, excluding self-similarity
             # Diagonal contains self-similarities
-            ave_dissimilarity = np.mean(1 - greedy_sentence_similarity)
+            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
 
             enriched_metric = prob * avg_dissimilarity
             enriched_metrics.append(enriched_metric)
@@ -144,9 +144,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            return "GreedySemanticAvePPLAveDissimilarityexp"
+            return "GreedySemanticEnrichedPPLAveDissimilarityexp"
         else:
-            return "GreedySemanticAvePPLAveDissimilarity"
+            return "GreedySemanticEnrichedPPLAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
@@ -213,7 +213,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             R_t = 1 - token_similarity
             R_t_norm = R_t / R_t.sum()
             E_t = -log_likelihoods * R_t_norm
-            tokenSAR.append(E_t.sum())
+            tokenSAR = E_t.sum()
 
             if self.exp:
                 tokenSAR = -np.exp(-np.array(tokenSAR))
@@ -249,9 +249,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            return "GreedySemanticAveTokenSARAveDissimilarityexp"
+            return "GreedySemanticEnrichedTokenSARAveDissimilarityexp"
         else:
-            return "GreedySemanticAveTokenSARAveDissimilarity"
+            return "GreedySemanticEnrichedTokenSARAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
@@ -273,7 +273,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             R_t = 1 - token_similarity
             R_t_norm = R_t / R_t.sum()
             E_t = -log_likelihoods * R_t_norm
-            tokenSAR.append(E_t.sum())
+            tokenSAR = E_t.sum()
 
             if self.exp:
                 tokenSAR = -np.exp(-np.array(tokenSAR))
@@ -313,8 +313,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             # Enrich each PPL independently by scaling with 1/row_average
             if avg_similarity == 0:
                 avg_similarity = 1e-10  # Avoid division by zero
-
-            enriched_value = greedy_entropy * (1 / avg_similarity)
+            
+            entropy = np.mean(greedy_entropy)
+            enriched_value = entropy * (1 / avg_similarity)
             enriched_entropy.append(enriched_value)
 
         return np.array(enriched_entropy)
@@ -342,8 +343,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         ):
             #  Compute row-wise average similarity, excluding self-similarity
             avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
-
-            enriched_value = greedy_entropy * avg_dissimilarity
+            
+            entropy = np.mean(greedy_entropy)
+            enriched_value = entropy * avg_dissimilarity
             enriched_entropy.append(enriched_value)
 
         return np.array(enriched_entropy)

From 7cf569e6dc597e7aa793fa2d55b85c67ade628dd Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 10 Jan 2025 15:32:25 +0400
Subject: [PATCH 70/97] Add missing stats to experimental configs

---
 .../configs/polygraph_eval_coqa_sentsar.yaml  |   3 +
 .../polygraph_eval_gsm8k_sentsar_cot.yaml     |   3 +
 .../configs/polygraph_eval_mmlu_sentsar.yaml  |   3 +
 .../polygraph_eval_triviaqa_sentsar.yaml      |   3 +
 .../polygraph_eval_wmt14_enfr_sentsar.yaml    | 153 ------------------
 .../polygraph_eval_wmt14_fren_sentsar.yaml    |   3 +
 .../polygraph_eval_wmt19_deen_sentsar.yaml    |   3 +
 .../polygraph_eval_wmt19_ende_sentsar.yaml    | 152 -----------------
 8 files changed, 18 insertions(+), 305 deletions(-)
 delete mode 100644 examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
 delete mode 100644 examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 828e2327a..09d8e58d4 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -31,6 +31,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - greedy_sentence_similarity
   - token_similarity
   - entropy
   - sample_tokens
@@ -43,7 +44,9 @@ save_stats:
   - sample_entropy
   - first_sample_texts
   - best_sample_texts
+  - best_sample_text_ids
   - best_normalized_sample_texts
+  - best_normalized_sample_texts_ids
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index ea91a213f..3164160b2 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -32,6 +32,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - greedy_sentence_similarity
   - token_similarity
   - entropy
   - sample_tokens
@@ -44,7 +45,9 @@ save_stats:
   - sample_entropy
   - first_sample_texts
   - best_sample_texts
+  - best_sample_text_ids
   - best_normalized_sample_texts
+  - best_normalized_sample_texts_ids
 entropy_top_k: 50
 
 target_ignore_regex: "(?s).*#### "
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 743904e1b..639fe90f6 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -33,6 +33,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - greedy_sentence_similarity
   - token_similarity
   - entropy
   - sample_tokens
@@ -45,7 +46,9 @@ save_stats:
   - sample_entropy
   - first_sample_texts
   - best_sample_texts
+  - best_sample_text_ids
   - best_normalized_sample_texts
+  - best_normalized_sample_texts_ids
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 4fd78ca81..12f322bf5 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -33,6 +33,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - greedy_sentence_similarity
   - token_similarity
   - entropy
   - sample_tokens
@@ -45,7 +46,9 @@ save_stats:
   - sample_entropy
   - first_sample_texts
   - best_sample_texts
+  - best_sample_text_ids
   - best_normalized_sample_texts
+  - best_normalized_sample_texts_ids
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
deleted file mode 100644
index 2404e8822..000000000
--- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml
+++ /dev/null
@@ -1,153 +0,0 @@
-hydra:
-  run:
-    dir: ${cache_path}/wmt14_enfr/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
-
-defaults:
-  - model: bloomz-560m
-  - _self_
-
-cache_path: ./workdir/output
-save_path: '${hydra:run.dir}'
-
-device: cpu
-
-task: nmt
-
-base_manager: null
-overwrite_base_estimations: false
-
-dataset: [wmt14, fr-en]
-text_column: en
-label_column: fr
-prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n"
-train_split: train
-eval_split: test
-max_new_tokens: 182
-load_from_disk: false
-generation_params:
-  generate_until:
-    - "\n"
-save_stats:
-  - greedy_tokens
-  - greedy_log_likelihoods
-  - greedy_tokens_alternatives
-  - token_similarity
-  - entropy
-  - sample_tokens
-  - sample_tokens_alternatives
-  - sample_texts
-  - sample_log_probs
-  - sample_log_likelihoods
-  - sample_sentence_similarity
-  - sample_token_similarity
-  - sample_entropy
-  - first_sample_texts
-  - best_sample_texts
-  - best_normalized_sample_texts
-entropy_top_k: 50
-
-source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
-
-train_dataset: null
-train_test_split: false
-test_split_size: 1
-
-background_train_dataset: allenai/c4
-background_train_dataset_text_column: text
-background_train_dataset_label_column: url
-background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
-background_load_from_disk: false
-
-subsample_background_train_dataset: 1000
-subsample_train_dataset: 1000
-subsample_eval_dataset: -1
-
-use_density_based_ue: false
-use_ens_ue: false
-use_seq_ue: false
-use_tok_ue: false
-generation_metrics: null
-
-additional_estimators:
-  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
-    class_name: MonteCarloSequenceEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
-    class_name: MonteCarloNormalizedSequenceEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.semantic_entropy
-    class_name: SemanticEntropy
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.max_probability
-    class_name: MaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.max_probability
-    class_name: SampledMaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: SentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.token_sar
-    class_name: TokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_sar
-    class_name: SampledTokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.perplexity
-    class_name: Perplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.perplexity
-    class_name: SampledPerplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: MeanTokenEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: SampledMeanTokenEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AveMaxprob
-    kwargs: {}
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AvePPL
-    kwargs: {}
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AveTokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AveMTE
-    kwargs: {}
-
-ignore_exceptions: false
-
-batch_size: 1
-deberta_batch_size: 1
-
-seed:
-    - 1
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 6040bd6e7..d5b5932b4 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -31,6 +31,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - greedy_sentence_similarity
   - token_similarity
   - entropy
   - sample_tokens
@@ -43,7 +44,9 @@ save_stats:
   - sample_entropy
   - first_sample_texts
   - best_sample_texts
+  - best_sample_text_ids
   - best_normalized_sample_texts
+  - best_normalized_sample_texts_ids
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 58e5cee10..0dbe3de3e 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -31,6 +31,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - greedy_sentence_similarity
   - token_similarity
   - entropy
   - sample_tokens
@@ -43,7 +44,9 @@ save_stats:
   - sample_entropy
   - first_sample_texts
   - best_sample_texts
+  - best_sample_text_ids
   - best_normalized_sample_texts
+  - best_normalized_sample_texts_ids
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
deleted file mode 100644
index 33bae1849..000000000
--- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml
+++ /dev/null
@@ -1,152 +0,0 @@
-hydra:
-  run:
-    dir: ${cache_path}/wmt19_ende/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
-
-defaults:
-  - model: bloomz-560m
-  - _self_
-
-cache_path: ./workdir/output
-save_path: '${hydra:run.dir}'
-
-device: cpu
-
-task: nmt
-
-base_manager: null
-overwrite_base_estimations: false
-
-dataset: [wmt19, de-en]
-text_column: en
-label_column: de
-prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n"
-train_split: train
-eval_split: validation
-max_new_tokens: 200
-load_from_disk: false
-generation_params:
-  generate_until:
-    - "\n"
-save_stats:
-  - greedy_tokens
-  - greedy_log_likelihoods
-  - greedy_tokens_alternatives
-  - token_similarity
-  - entropy
-  - sample_tokens
-  - sample_tokens_alternatives
-  - sample_texts
-  - sample_log_probs
-  - sample_log_likelihoods
-  - sample_sentence_similarity
-  - sample_token_similarity
-  - sample_entropy
-  - first_sample_texts
-  - best_sample_texts
-  - best_normalized_sample_texts
-entropy_top_k: 50
-
-source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
-
-train_dataset: null
-train_test_split: false
-test_split_size: 1
-
-background_train_dataset: allenai/c4
-background_train_dataset_text_column: text
-background_train_dataset_label_column: url
-background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz
-background_load_from_disk: false
-
-subsample_background_train_dataset: 1000
-subsample_train_dataset: 1000
-subsample_eval_dataset: -1
-
-use_density_based_ue: false
-use_ens_ue: false
-use_seq_ue: false
-use_tok_ue: false
-
-additional_estimators:
-  - module: lm_polygraph.estimators.monte_carlo_sequence_entropy
-    class_name: MonteCarloSequenceEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy
-    class_name: MonteCarloNormalizedSequenceEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.semantic_entropy
-    class_name: SemanticEntropy
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.max_probability
-    class_name: MaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.max_probability
-    class_name: SampledMaximumSequenceProbability
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: SentenceSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MaxprobGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.token_sar
-    class_name: TokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_sar
-    class_name: SampledTokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.sar
-    class_name: SAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: TokenSARGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.perplexity
-    class_name: Perplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.perplexity
-    class_name: SampledPerplexity
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: PPLSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: PPLGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: MeanTokenEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.token_entropy
-    class_name: SampledMeanTokenEntropy
-    kwargs: {}
-  - module: lm_polygraph.estimators.sentence_sar
-    class_name: MTESAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.gsu
-    class_name: MTEGSU
-    kwargs: {}
-
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AveMaxprob
-    kwargs: {}
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AvePPL
-    kwargs: {}
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AveTokenSAR
-    kwargs: {}
-  - module: lm_polygraph.estimators.average_ue
-    class_name: AveMTE
-    kwargs: {}
-
-ignore_exceptions: false
-
-batch_size: 1
-deberta_batch_size: 1
-
-seed:
-    - 1

From faf3e14511ff918de5e5b423b9d8a21d5d6b53d5 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 10 Jan 2025 16:34:29 +0400
Subject: [PATCH 71/97] Make experiments work

---
 .../configs/polygraph_eval_coqa_sentsar.yaml  |  4 ++
 .../polygraph_eval_gsm8k_sentsar_cot.yaml     |  4 ++
 .../configs/polygraph_eval_mmlu_sentsar.yaml  |  4 ++
 .../polygraph_eval_triviaqa_sentsar.yaml      |  4 ++
 .../polygraph_eval_wmt14_fren_sentsar.yaml    |  4 ++
 .../polygraph_eval_wmt19_deen_sentsar.yaml    |  4 ++
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  7 ++++
 scripts/polygraph_eval                        |  2 +-
 src/lm_polygraph/estimators/common.py         |  3 ++
 .../semantic_average_ue_average_similarity.py | 41 ++++++++++++++-----
 src/lm_polygraph/stat_calculators/sample.py   |  2 +
 11 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 09d8e58d4..9963feca3 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -145,6 +145,10 @@ additional_estimators:
     class_name: AveMTE
     kwargs: {}
 
+  - module: lm_polygraph.estimators.semantic_average_ue_average_similarity
+    class_name: SemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index 3164160b2..f282769b4 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -149,6 +149,10 @@ additional_estimators:
     class_name: AveMTE
     kwargs: {}
 
+  - module: lm_polygraph.estimators.semantic_average_ue_average_similarity
+    class_name: SemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 639fe90f6..5c88ba80e 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -147,6 +147,10 @@ additional_estimators:
     class_name: AveMTE
     kwargs: {}
 
+  - module: lm_polygraph.estimators.semantic_average_ue_average_similarity
+    class_name: SemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 12f322bf5..b639b9cfa 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -148,6 +148,10 @@ additional_estimators:
     class_name: AveMTE
     kwargs: {}
 
+  - module: lm_polygraph.estimators.semantic_average_ue_average_similarity
+    class_name: SemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index d5b5932b4..f2def77bf 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -147,6 +147,10 @@ additional_estimators:
     class_name: AveMTE
     kwargs: {}
 
+  - module: lm_polygraph.estimators.semantic_average_ue_average_similarity
+    class_name: SemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 0dbe3de3e..fbe97acfe 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -146,6 +146,10 @@ additional_estimators:
     class_name: AveMTE
     kwargs: {}
 
+  - module: lm_polygraph.estimators.semantic_average_ue_average_similarity
+    class_name: SemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index 1a4d971c6..cce8d376d 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -32,6 +32,7 @@ save_stats:
   - greedy_tokens
   - greedy_log_likelihoods
   - greedy_tokens_alternatives
+  - greedy_sentence_similarity
   - token_similarity
   - entropy
   - sample_tokens
@@ -44,7 +45,9 @@ save_stats:
   - sample_entropy
   - first_sample_texts
   - best_sample_texts
+  - best_sample_text_ids
   - best_normalized_sample_texts
+  - best_normalized_sample_texts_ids
 entropy_top_k: 50
 
 train_dataset: null
@@ -142,6 +145,10 @@ additional_estimators:
     class_name: AveMTE
     kwargs: {}
 
+  - module: lm_polygraph.estimators.semantic_average_ue_average_similarity
+    class_name: SemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 0eda6666c..2440332b0 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -426,7 +426,7 @@ def get_ue_methods(args, model):
         estimator = estimator_class(**estimator_args.kwargs)
         # Additional estimator filtering only works correctly for sequence-level estimators
         if overwrite or ('sequence', str(estimator)) not in existing_estimators:
-            estimators.append(estimator_class(**estimator_args.kwargs))
+            estimators.append(estimator)
 
     return estimators
 
diff --git a/src/lm_polygraph/estimators/common.py b/src/lm_polygraph/estimators/common.py
index 72e2142e7..7942e2c41 100644
--- a/src/lm_polygraph/estimators/common.py
+++ b/src/lm_polygraph/estimators/common.py
@@ -1,5 +1,6 @@
 import numpy as np
 
+SAMPLE_SELECTION_STAT_KEYS = ["best_sample_text_ids", "best_normalized_sample_text_ids"]
 
 def _get_pairs(lst):
     pairs = []
@@ -30,6 +31,7 @@ def _compute_Jaccard_score(lst):
 def compute_sim_score(answers, affinity, similarity_score):
     return _compute_Jaccard_score(answers)
 
+
 def sample_strategy_to_prefix(sample_strategy):
     if sample_strategy == "first":
         return ""
@@ -38,6 +40,7 @@ def sample_strategy_to_prefix(sample_strategy):
     else:
         raise ValueError(f"Unknown sample strategy: {sample_strategy}")
 
+
 def best_sample_ids(sample_strategy, stats):
     batch_size = len(stats["sample_log_probs"])
     if sample_strategy == "first":
diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
index 2a0f0b617..f33849e98 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
@@ -4,7 +4,7 @@
 from copy import deepcopy
 
 from .estimator import Estimator
-from .common import sample_strategy_to_prefix, best_sample_ids
+from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS
 
 
 class SemanticAveMaxprobAveSimilarity(Estimator):
@@ -14,7 +14,10 @@ def __init__(
         exp: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        super().__init__(
+            ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
         self.verbose = verbose
         self.exp = exp
         self.sample_strategy = sample_strategy
@@ -73,7 +76,10 @@ def __init__(
         exp: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence")
+        super().__init__(
+            ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
         self.verbose = verbose
         self.exp = exp
         self.sample_strategy = sample_strategy
@@ -136,7 +142,10 @@ def __init__(
         exp: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        super().__init(
+            ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
         self.verbose = verbose
         self.exp = exp
         self.sample_strategy = sample_strategy
@@ -194,7 +203,10 @@ def __init__(
         exp: bool = False,  
         sample_strategy: str = "first"
     ):
-        super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence")
+        super().__init(
+            ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
         self.verbose = verbose
         self.exp = exp
         self.sample_strategy = sample_strategy
@@ -259,7 +271,7 @@ def __init__(
                 "sample_sentence_similarity",
                 "sample_log_likelihoods",
                 "sample_token_similarity",
-            ],
+            ] + SAMPLE_SELECTION_STAT_KEYS,
             "sequence",
         )
         self.verbose = verbose
@@ -340,7 +352,7 @@ def __init__(
                 "sample_sentence_similarity",
                 "sample_log_likelihoods",
                 "sample_token_similarity",
-            ],
+            ] + SAMPLE_SELECTION_STAT_KEYS,
             "sequence",
         )
         self.verbose = verbose
@@ -415,7 +427,10 @@ def __init__(
         verbose: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        super().__init(
+            ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
         self.verbose = verbose
         self.sample_strategy = sample_strategy
 
@@ -464,7 +479,10 @@ def __init__(
         verbose: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        super().__init(
+            ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
         self.verbose = verbose
         self.sample_strategy = sample_strategy
 
@@ -513,7 +531,10 @@ def __init__(
         verbose: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence")
+        super().__init(
+            ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
         self.verbose = verbose
         self.sample_strategy = sample_strategy
 
diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py
index 4217000c6..f05d9901e 100644
--- a/src/lm_polygraph/stat_calculators/sample.py
+++ b/src/lm_polygraph/stat_calculators/sample.py
@@ -237,7 +237,9 @@ def __init__(self):
         super().__init__(
             [
                 "best_sample_texts",
+                "best_sample_text_ids",
                 "best_normalized_sample_texts",
+                "best_normalized_sample_text_ids",
             ],
             [
                 "sample_texts",

From 3074b6a2331317d6d110be420598f0924322e563 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 10 Jan 2025 16:48:16 +0400
Subject: [PATCH 72/97] Add falcon model

---
 examples/configs/model/falcon3.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 examples/configs/model/falcon3.yaml

diff --git a/examples/configs/model/falcon3.yaml b/examples/configs/model/falcon3.yaml
new file mode 100644
index 000000000..3a8243339
--- /dev/null
+++ b/examples/configs/model/falcon3.yaml
@@ -0,0 +1,11 @@
+defaults:
+  - default
+
+path: tiiuae/Falcon3-7B-Base
+type: CausalLM
+path_to_load_script: model/default_causal.py
+
+load_model_args:
+  device_map: balanced_low_0
+  dtype: bfloat16
+load_tokenizer_args: {}

From edf9ee1701679a21ee36981c0a03ed5ce9423408 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 10 Jan 2025 19:06:23 +0400
Subject: [PATCH 73/97] Prevent tokenizer outputting token type ids

---
 src/lm_polygraph/utils/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py
index f22901919..db381107f 100644
--- a/src/lm_polygraph/utils/model.py
+++ b/src/lm_polygraph/utils/model.py
@@ -564,7 +564,7 @@ def tokenize(
                 formatted_texts.append(formatted_chat)
             texts = formatted_texts
 
-        return self.tokenizer(texts, padding=True, return_tensors="pt")
+        return self.tokenizer(texts, padding=True, return_tensors="pt", return_token_type_ids=False)
 
 
 def create_ensemble(

From e07c62ac30cbe219b548f5f203e1db06d770b046 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 13 Jan 2025 12:42:29 +0400
Subject: [PATCH 74/97] Save manager state between evaluation steps

---
 scripts/polygraph_eval            |  4 +---
 src/lm_polygraph/utils/manager.py | 20 ++++++++++++++++++--
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 2440332b0..a748e8d10 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -228,13 +228,11 @@ def main(args):
                 language=getattr(args, 'language', 'en'),
                 save_stats=getattr(args, 'save_stats', []),
                 entropy_top_k=getattr(args, 'entropy_top_k', None),
+                save_path=save_path + f"/ue_manager_seed{seed}",
             )
 
         man()
 
-        man.save(save_path + f"/ue_manager_seed{seed}")
-
-
 def get_ue_metrics(args):
     ue_metrics = [
         #ReversedPairsProportion(),
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index db3c4a0ec..905ff3818 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -260,6 +260,8 @@ def __init__(
         cache_path=os.path.expanduser("~") + "/.cache",
         save_stats: List[str] = [],
         entropy_top_k: Optional[int] = None,
+        state: str = 'init',
+        save_path: Optional[str] = None,
     ):
         """
         Parameters:
@@ -315,6 +317,8 @@ def __init__(
         self.deberta_batch_size = deberta_batch_size
         self.deberta_device = deberta_device
         self.language = language
+        self.state = state
+        self.save_path = save_path
 
 
     def prepare_calculators(self):
@@ -508,11 +512,17 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]:
             torch.cuda.empty_cache()
             gc.collect()
 
+        self.state = 'post_inference'
+        self.save()
+
         self.eval_ue()
+        self.state = 'post_eval'
 
         for processor in self.processors:
             processor.on_eval(self.metrics, self.total_bad_estimators)
 
+        self.save()
+
         return self.metrics
 
     def eval_ue(self):
@@ -680,7 +690,7 @@ def _extract_train_embeddings(
 
         return result_train_stat
 
-    def save(self, save_path: str):
+    def save(self):
         """
         Saves the run results in the provided path. Will raise exception, if no results are calculated yet.
         To load the saved manager, see UEManager.load().
@@ -690,14 +700,18 @@ def save(self, save_path: str):
         """
         if len(self.metrics) == 0:
             raise Exception("Nothing to save. Consider calling manager() first.")
+        if self.save_path is None:
+            raise Exception("No save path provided.")
+
         torch.save(
             {
                 "metrics": self.metrics,
                 "gen_metrics": self.gen_metrics,
                 "estimations": self.estimations,
                 "stats": self.stats,
+                "state": self.state,
             },
-            save_path,
+            self.save_path,
         )
 
     @staticmethod
@@ -723,4 +737,6 @@ def load(load_path: str, **kwargs) -> "UEManager":
         man.gen_metrics = res_dict.get("gen_metrics", None)
         man.estimations = res_dict.get("estimations", None)
         man.stats = res_dict.get("stats", None)
+        man.state = res_dict.get("state", 'init')
+
         return man

From c3c63ad8528b2d3c60082a3d50d55d5c572ff796 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 13 Jan 2025 14:26:26 +0400
Subject: [PATCH 75/97] Fix saving

---
 scripts/polygraph_eval            | 2 +-
 src/lm_polygraph/utils/manager.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index a748e8d10..8421863a5 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -207,6 +207,7 @@ def main(args):
                 language=getattr(args, 'language', 'en'),
                 save_stats=getattr(args, 'save_stats', []),
                 entropy_top_k=getattr(args, 'entropy_top_k', None),
+                save_path=save_path + f"/ue_manager_seed{seed}",
             )
         else:
             man = UEManager.load(
@@ -228,7 +229,6 @@ def main(args):
                 language=getattr(args, 'language', 'en'),
                 save_stats=getattr(args, 'save_stats', []),
                 entropy_top_k=getattr(args, 'entropy_top_k', None),
-                save_path=save_path + f"/ue_manager_seed{seed}",
             )
 
         man()
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 905ff3818..e44a260df 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -698,8 +698,6 @@ def save(self):
         Parameters:
             save_path (str): Path to file to save benchmark results to.
         """
-        if len(self.metrics) == 0:
-            raise Exception("Nothing to save. Consider calling manager() first.")
         if self.save_path is None:
             raise Exception("No save path provided.")
 

From 6f9fee1c3aa83cfcddfa00d66c2695386314d817 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 14 Jan 2025 13:53:26 +0400
Subject: [PATCH 76/97] Fix stat name issue

---
 examples/configs/polygraph_eval_coqa_sentsar.yaml       | 2 +-
 examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml  | 2 +-
 examples/configs/polygraph_eval_mmlu_sentsar.yaml       | 2 +-
 examples/configs/polygraph_eval_triviaqa_sentsar.yaml   | 2 +-
 examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 +-
 examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 +-
 examples/configs/polygraph_eval_xsum_sentsar.yaml       | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 9963feca3..2d2a80727 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -46,7 +46,7 @@ save_stats:
   - best_sample_texts
   - best_sample_text_ids
   - best_normalized_sample_texts
-  - best_normalized_sample_texts_ids
+  - best_normalized_sample_text_ids
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index f282769b4..7357f6153 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -47,7 +47,7 @@ save_stats:
   - best_sample_texts
   - best_sample_text_ids
   - best_normalized_sample_texts
-  - best_normalized_sample_texts_ids
+  - best_normalized_sample_text_ids
 entropy_top_k: 50
 
 target_ignore_regex: "(?s).*#### "
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 5c88ba80e..211dbd4a0 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -48,7 +48,7 @@ save_stats:
   - best_sample_texts
   - best_sample_text_ids
   - best_normalized_sample_texts
-  - best_normalized_sample_texts_ids
+  - best_normalized_sample_text_ids
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index b639b9cfa..532a29a5e 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -48,7 +48,7 @@ save_stats:
   - best_sample_texts
   - best_sample_text_ids
   - best_normalized_sample_texts
-  - best_normalized_sample_texts_ids
+  - best_normalized_sample_text_ids
 entropy_top_k: 50
 
 train_dataset: null
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index f2def77bf..8ab1e65bb 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -46,7 +46,7 @@ save_stats:
   - best_sample_texts
   - best_sample_text_ids
   - best_normalized_sample_texts
-  - best_normalized_sample_texts_ids
+  - best_normalized_sample_text_ids
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index fbe97acfe..8fca6ce1c 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -46,7 +46,7 @@ save_stats:
   - best_sample_texts
   - best_sample_text_ids
   - best_normalized_sample_texts
-  - best_normalized_sample_texts_ids
+  - best_normalized_sample_text_ids
 entropy_top_k: 50
 
 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n"
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index cce8d376d..f3efb2356 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -47,7 +47,7 @@ save_stats:
   - best_sample_texts
   - best_sample_text_ids
   - best_normalized_sample_texts
-  - best_normalized_sample_texts_ids
+  - best_normalized_sample_text_ids
 entropy_top_k: 50
 
 train_dataset: null

From 3e7b953e7e20d94f8eb9e8c6ffc28bccb0197c86 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 15 Jan 2025 15:32:40 +0400
Subject: [PATCH 77/97] Fix tokensars, add greedy-based method to save stat

---
 .../configs/polygraph_eval_coqa_sentsar.yaml  |  4 ++++
 .../polygraph_eval_gsm8k_sentsar_cot.yaml     |  4 ++++
 .../configs/polygraph_eval_mmlu_sentsar.yaml  |  4 ++++
 .../polygraph_eval_triviaqa_sentsar.yaml      |  4 ++++
 .../polygraph_eval_wmt14_fren_sentsar.yaml    |  4 ++++
 .../polygraph_eval_wmt19_deen_sentsar.yaml    |  4 ++++
 .../configs/polygraph_eval_xsum_sentsar.yaml  |  4 ++++
 src/lm_polygraph/estimators/average_ue.py     | 11 +++++-----
 src/lm_polygraph/estimators/gsu.py            |  5 ++++-
 src/lm_polygraph/estimators/sar.py            |  5 ++++-
 .../estimators/semantic_average_ue.py         |  5 ++++-
 .../semantic_average_ue_average_similarity.py | 20 ++++++++++++-------
 .../estimators/semantic_median_ue.py          |  5 ++++-
 src/lm_polygraph/estimators/token_sar.py      |  5 ++++-
 src/lm_polygraph/utils/manager.py             |  1 +
 15 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml
index 2d2a80727..9c710a207 100644
--- a/examples/configs/polygraph_eval_coqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml
@@ -149,6 +149,10 @@ additional_estimators:
     class_name: SemanticAveMaxprobAveSimilarity
     kwargs: {}
 
+  - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity
+    class_name: GreedySemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
index 7357f6153..dead8a64e 100644
--- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
+++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml
@@ -153,6 +153,10 @@ additional_estimators:
     class_name: SemanticAveMaxprobAveSimilarity
     kwargs: {}
 
+  - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity
+    class_name: GreedySemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
index 211dbd4a0..4be9fc43c 100644
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml
+++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml
@@ -151,6 +151,10 @@ additional_estimators:
     class_name: SemanticAveMaxprobAveSimilarity
     kwargs: {}
 
+  - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity
+    class_name: GreedySemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
index 532a29a5e..81e594904 100644
--- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
+++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml
@@ -152,6 +152,10 @@ additional_estimators:
     class_name: SemanticAveMaxprobAveSimilarity
     kwargs: {}
 
+  - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity
+    class_name: GreedySemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
index 8ab1e65bb..67449b720 100644
--- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml
@@ -151,6 +151,10 @@ additional_estimators:
     class_name: SemanticAveMaxprobAveSimilarity
     kwargs: {}
 
+  - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity
+    class_name: GreedySemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
index 8fca6ce1c..cf9b58fe0 100644
--- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
+++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml
@@ -150,6 +150,10 @@ additional_estimators:
     class_name: SemanticAveMaxprobAveSimilarity
     kwargs: {}
 
+  - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity
+    class_name: GreedySemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml
index f3efb2356..c9dd41a86 100644
--- a/examples/configs/polygraph_eval_xsum_sentsar.yaml
+++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml
@@ -149,6 +149,10 @@ additional_estimators:
     class_name: SemanticAveMaxprobAveSimilarity
     kwargs: {}
 
+  - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity
+    class_name: GreedySemanticAveMaxprobAveSimilarity
+    kwargs: {}
+
 ignore_exceptions: false
 
 batch_size: 1
diff --git a/src/lm_polygraph/estimators/average_ue.py b/src/lm_polygraph/estimators/average_ue.py
index a7748e9e6..4ecf9e541 100644
--- a/src/lm_polygraph/estimators/average_ue.py
+++ b/src/lm_polygraph/estimators/average_ue.py
@@ -5,7 +5,6 @@
 
 from .estimator import Estimator
 
-
 class AveMaxprob(Estimator):
     def __init__(
         self,
@@ -80,11 +79,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
 
         ave = []
-        for batch_data in zip(
+        for i, batch_data in enumerate(zip(
             batch_sample_log_likelihoods,
             batch_sample_token_similarity,
             batch_sample_sentence_similarity,
-        ):
+        )):
             sample_log_likelihoods = batch_data[0]
             sample_token_similarity = batch_data[1]
             sample_sentence_similarity = batch_data[2]
@@ -95,10 +94,12 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
-
             ave.append(np.mean(tokenSAR))
 
         return np.array(ave)
diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py
index 2969dca95..bdb8e5de4 100644
--- a/src/lm_polygraph/estimators/gsu.py
+++ b/src/lm_polygraph/estimators/gsu.py
@@ -167,7 +167,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
diff --git a/src/lm_polygraph/estimators/sar.py b/src/lm_polygraph/estimators/sar.py
index 57e9c2902..2d7559db1 100644
--- a/src/lm_polygraph/estimators/sar.py
+++ b/src/lm_polygraph/estimators/sar.py
@@ -63,7 +63,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py
index d58489d99..161ccec37 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue.py
@@ -133,7 +133,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
             
diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
index f33849e98..40fc0a004 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
@@ -142,7 +142,7 @@ def __init__(
         exp: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init(
+        super().__init__(
             ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS,
             "sequence"
         )
@@ -203,7 +203,7 @@ def __init__(
         exp: bool = False,  
         sample_strategy: str = "first"
     ):
-        super().__init(
+        super().__init__(
             ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS,
             "sequence"
         )
@@ -308,7 +308,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
@@ -389,7 +392,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
@@ -427,7 +433,7 @@ def __init__(
         verbose: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init(
+        super().__init__(
             ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
             "sequence"
         )
@@ -479,7 +485,7 @@ def __init__(
         verbose: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init(
+        super().__init__(
             ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
             "sequence"
         )
@@ -531,7 +537,7 @@ def __init__(
         verbose: bool = False,
         sample_strategy: str = "first"
     ):
-        super().__init(
+        super().__init__(
             ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
             "sequence"
         )
diff --git a/src/lm_polygraph/estimators/semantic_median_ue.py b/src/lm_polygraph/estimators/semantic_median_ue.py
index 0ec7d3274..5c6687608 100644
--- a/src/lm_polygraph/estimators/semantic_median_ue.py
+++ b/src/lm_polygraph/estimators/semantic_median_ue.py
@@ -134,7 +134,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
 
diff --git a/src/lm_polygraph/estimators/token_sar.py b/src/lm_polygraph/estimators/token_sar.py
index c2695970d..d31c7651b 100644
--- a/src/lm_polygraph/estimators/token_sar.py
+++ b/src/lm_polygraph/estimators/token_sar.py
@@ -99,7 +99,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             ):
                 log_likelihoods = np.array(log_likelihoods)
                 R_t = 1 - token_similarity
-                R_t_norm = R_t / R_t.sum()
+                if R_t.sum() == 0:
+                    R_t_norm = np.zeros_like(R_t)
+                else:
+                    R_t_norm = R_t / R_t.sum()
                 E_t = -log_likelihoods * R_t_norm
                 tokenSAR.append(E_t.sum())
             result.append(tokenSAR[best_id])
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index e44a260df..987177e34 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -540,6 +540,7 @@ def eval_ue(self):
                             f"Got different number of metrics for {e_name} and {gen_name}: "
                             f"{len(estimator_values)} and {len(generation_metric)}"
                         )
+
                     # TODO: Report how many nans!
                     # This is important to know for a user
                     ue, metric = _delete_nans(estimator_values, generation_metric)

From 0bba562922497216ac1d814444c727de9b80aa81 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Mon, 20 Jan 2025 14:55:15 +0400
Subject: [PATCH 78/97] Add rouge sim matrix calculator, some fixes

---
 src/lm_polygraph/estimators/__init__.py       |  2 +
 ..._semantic_average_ue_average_similarity.py | 10 ++-
 .../greedy_sum_semantic_entropies.py          | 70 +++++++++++++++
 .../estimators/sum_semantic_entropies.py      | 79 +++++++++++++++++
 src/lm_polygraph/estimators/token_sar.py      |  5 +-
 src/lm_polygraph/stat_calculators/__init__.py |  3 +
 .../stat_calculators/greedy_rouge_matrix.py   | 58 ++++++++++++
 .../greedy_semantic_matrix.py                 | 88 +++++++++++++++++++
 .../stat_calculators/greedy_similarity.py     |  5 +-
 .../stat_calculators/rouge_matrix.py          | 67 ++++++++++++++
 .../utils/register_stat_calculators.py        |  2 +
 11 files changed, 382 insertions(+), 7 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py
 create mode 100644 src/lm_polygraph/estimators/sum_semantic_entropies.py
 create mode 100644 src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py
 create mode 100644 src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
 create mode 100644 src/lm_polygraph/stat_calculators/rouge_matrix.py

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 357816c71..648d5a1ab 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -106,3 +106,5 @@
     GreedySemanticEnrichedMTEAveDissimilarity,
 )
 from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE
+from .sum_semantic_entropies import SumSemanticMaxprob, SumSemanticPPL
+from .greedy_sum_semantic_entropies import GreedySumSemanticMaxprob, GreedySumSemanticPPL
diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
index 9cf501ffe..d2b182741 100644
--- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
@@ -211,7 +211,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             log_likelihoods = np.array(log_likelihoods)
             R_t = 1 - token_similarity
-            R_t_norm = R_t / R_t.sum()
+            if R_t.sum() == 0:
+                R_t_norm = np.zeros_like(R_t)
+            else:
+                R_t_norm = R_t / R_t.sum()
             E_t = -log_likelihoods * R_t_norm
             tokenSAR = E_t.sum()
 
@@ -271,7 +274,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             log_likelihoods = np.array(log_likelihoods)
             R_t = 1 - token_similarity
-            R_t_norm = R_t / R_t.sum()
+            if R_t.sum() == 0:
+                R_t_norm = np.zeros_like(R_t)
+            else:
+                R_t_norm = R_t / R_t.sum()
             E_t = -log_likelihoods * R_t_norm
             tokenSAR = E_t.sum()
 
diff --git a/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py b/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py
new file mode 100644
index 000000000..1280dadc0
--- /dev/null
+++ b/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py
@@ -0,0 +1,70 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids
+
+
+class GreedySumSemanticMaxprob(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySumSemanticMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+        batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]])
+
+        enriched_metrics = []  # To store enriched metrics for each sample
+        for greedy_ll, greedy_sentence_similarity in zip(
+            batch_lls, batch_greedy_sentence_similarity
+        ):
+            # Negative log-likelihood of the greedy sequence (summed over tokens)
+            prob = -greedy_ll
+
+            # Mean over all entries of this input's greedy_sentence_similarity stat.
+            # Note: np.mean covers every entry; nothing is excluded here.
+            avg_similarity = np.mean(greedy_sentence_similarity)
+
+            enriched_metrics.append(prob - np.log(avg_similarity))
+
+        return np.array(enriched_metrics)
+
+
+class GreedySumSemanticPPL(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySumSemanticPPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_ppl = []  # To store enriched PPL for each sample
+
+        for greedy_log_likelihoods, greedy_sentence_similarity in zip(
+            batch_greedy_log_likelihoods, batch_greedy_sentence_similarity
+        ):
+            # Mean negative token log-likelihood of the greedy output (log-perplexity)
+            ppl = -np.mean(greedy_log_likelihoods)
+
+            # Mean over all entries of greedy_sentence_similarity (nothing is excluded)
+            avg_similarity = np.mean(greedy_sentence_similarity)
+
+            enriched_ppl.append(ppl - np.log(avg_similarity))
+
+
+        return np.array(enriched_ppl)
diff --git a/src/lm_polygraph/estimators/sum_semantic_entropies.py b/src/lm_polygraph/estimators/sum_semantic_entropies.py
new file mode 100644
index 000000000..10b19a03b
--- /dev/null
+++ b/src/lm_polygraph/estimators/sum_semantic_entropies.py
@@ -0,0 +1,79 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS
+
+
+class SumSemanticMaxprob(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        sample_strategy: str = "first"
+    ):
+        super().__init__(
+            ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        base = "SumSemanticMaxprob"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Per input: -sum(log-probs of the selected sample) - log(mean of its similarity row)."""
+        batch_sample_log_probs = stats["sample_log_probs"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
+        enriched_metrics = []  # one enriched score per input text
+
+        for best_id, sample_log_probs, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity
+        ):
+            # Copy the row first: assigning into the slice itself would write into stats.
+            sim = sample_sentence_similarity[best_id, :].copy()
+            sim[best_id] = 1  # count self-similarity as exactly 1
+            avg_similarity = np.mean(sim)
+            res = -np.sum(sample_log_probs[best_id]) - np.log(avg_similarity)
+            enriched_metrics.append(res)
+        return np.array(enriched_metrics)
+
+
+class SumSemanticPPL(Estimator):
+    def __init__(
+        self,
+        verbose: bool = False,
+        sample_strategy: str = "first"
+    ):
+        super().__init__(
+            ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        base = "SumSemanticPPL"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Per input: -mean(log-likelihoods of the selected sample) - log(mean of its similarity row)."""
+        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
+        enriched_ppl = []  # one enriched score per input text
+
+        for best_id, sample_log_likelihoods, sample_sentence_similarity in zip(
+            sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity
+        ):
+            # Copy the row first: assigning into the slice itself would write into stats.
+            sim = sample_sentence_similarity[best_id, :].copy()
+            sim[best_id] = 1  # count self-similarity as exactly 1
+            avg_similarity = np.mean(sim)
+            res = -np.mean(sample_log_likelihoods[best_id]) - np.log(avg_similarity)
+            enriched_ppl.append(res)
+        return np.array(enriched_ppl)
diff --git a/src/lm_polygraph/estimators/token_sar.py b/src/lm_polygraph/estimators/token_sar.py
index d31c7651b..0997c110b 100644
--- a/src/lm_polygraph/estimators/token_sar.py
+++ b/src/lm_polygraph/estimators/token_sar.py
@@ -43,7 +43,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         ):
             log_likelihoods = np.array(log_likelihoods)
             R_t = 1 - token_similarity
-            R_t_norm = R_t / R_t.sum()
+            if R_t.sum() == 0:
+                R_t_norm = np.zeros_like(R_t)
+            else:
+                R_t_norm = R_t / R_t.sum()
             E_t = -log_likelihoods * R_t_norm
             tokenSAR.append(E_t.sum())
 
diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index f8c3fa554..b01f9c541 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -25,3 +25,6 @@
 from .extract_claims import ClaimsExtractor
 from .semantic_classes import SemanticClassesCalculator
 from .greedy_similarity import GreedySimilarityCalculator
+from .greedy_semantic_matrix import GreedySemanticMatrixCalculator
+from .rouge_matrix import RougeLSemanticMatrixCalculator
+from .greedy_rouge_matrix import GreedyRougeLSemanticMatrixCalculator
diff --git a/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py
new file mode 100644
index 000000000..ebcc14373
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py
@@ -0,0 +1,58 @@
+import numpy as np
+
+import itertools
+from typing import Dict, List
+
+from .stat_calculator import StatCalculator
+from lm_polygraph.utils.model import WhiteboxModel
+import torch.nn as nn
+import torch
+from rouge_score import rouge_scorer
+
+class GreedyRougeLSemanticMatrixCalculator(StatCalculator):
+    def __init__(self):
+        super().__init__(
+            [
+                "greedy_semantic_matrix",
+            ],
+            ["greedy_texts", "sample_texts"],
+        )
+        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        batch_texts = dependencies["sample_texts"]
+        batch_greedy_texts = dependencies["greedy_texts"]
+
+        batch_pairs = []
+        batch_invs = []
+        for texts, greedy_text in zip(batch_texts, batch_greedy_texts):
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unique outputs
+            unique_texts, inv = np.unique(texts, return_inverse=True)
+            batch_pairs.append(list(itertools.product([greedy_text], unique_texts)))
+            batch_invs.append(inv)
+
+
+        E = []
+
+        for i, pairs in enumerate(batch_pairs):
+            sim_mat = []
+            for first_texts, second_texts in pairs:
+                sim_mat.append(self.scorer.score(first_texts, second_texts)['rougeL'].fmeasure)
+
+            sim_mat = np.array(sim_mat)
+
+            inv = batch_invs[i]
+            E.append(sim_mat[inv])
+
+        E = np.stack(E)
+
+        return {
+            "greedy_rouge_semantic_matrix": E,
+        }
diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
new file mode 100644
index 000000000..6185f2d9f
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
@@ -0,0 +1,88 @@
+import numpy as np
+
+import itertools
+from typing import Dict, List
+
+from .stat_calculator import StatCalculator
+from lm_polygraph.utils.model import WhiteboxModel
+import torch.nn as nn
+import torch
+
+softmax = nn.Softmax(dim=1)
+
+
+class GreedySemanticMatrixCalculator(StatCalculator):
+    """
+    Calculates the NLI semantic matrix for generation samples using DeBERTa model.
+    """
+
+    def __init__(self, nli_model):
+        super().__init__(
+            [
+                "greedy_semantic_matrix_entail",
+                "greedy_semantic_matrix_contra",
+            ],
+            ["greedy_texts", "sample_texts"],
+        )
+        self.is_deberta_setup = False
+        self.nli_model = nli_model
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        deberta = self.nli_model
+        deberta_batch_size = deberta.batch_size
+
+        batch_texts = dependencies["sample_texts"]
+        batch_greedy_texts = dependencies["greedy_texts"]
+
+        batch_pairs = []
+        batch_invs = []
+        for texts, greedy_text in zip(batch_texts, batch_greedy_texts):
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unique outputs
+            unique_texts, inv = np.unique(texts, return_inverse=True)
+            batch_pairs.append(list(itertools.product([greedy_text], unique_texts)))
+            batch_invs.append(inv)
+
+        device = deberta.device
+        ent_id = deberta.deberta.config.label2id["ENTAILMENT"]
+        contra_id = deberta.deberta.config.label2id["CONTRADICTION"]
+
+        softmax = nn.Softmax(dim=1)
+        tokenizer = deberta.deberta_tokenizer
+
+        E = []
+        C = []
+
+        for i, pairs in enumerate(batch_pairs):
+            dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size)
+            probs = []
+            for first_texts, second_texts in dl:
+                batch = list(zip(first_texts, second_texts))
+                encoded = tokenizer.batch_encode_plus(
+                    batch, padding=True, return_tensors="pt"
+                ).to(device)
+                logits = deberta.deberta(**encoded).logits.detach().to(device)
+                probs.append(softmax(logits).cpu().detach())
+            probs = torch.cat(probs, dim=0)
+
+            inv = batch_invs[i]
+
+            entail_probs = probs[:, ent_id]
+            contra_probs = probs[:, contra_id]
+
+            E.append(entail_probs[inv].numpy())
+            C.append(contra_probs[inv].numpy())
+
+        E = np.stack(E)
+        C = np.stack(C)
+
+        return {
+            "greedy_semantic_matrix_entail": E,
+            "greedy_semantic_matrix_contra": C,
+        }
diff --git a/src/lm_polygraph/stat_calculators/greedy_similarity.py b/src/lm_polygraph/stat_calculators/greedy_similarity.py
index 04eae17bf..c3da31778 100644
--- a/src/lm_polygraph/stat_calculators/greedy_similarity.py
+++ b/src/lm_polygraph/stat_calculators/greedy_similarity.py
@@ -19,7 +19,7 @@ def __init__(self, nli_model):
             [
                 "greedy_sentence_similarity",
             ],
-            ["input_texts", "sample_tokens", "sample_texts", "greedy_tokens", "greedy_texts"],
+            ["input_texts", "sample_texts", "greedy_texts"],
         )
 
         self.crossencoder_setup = False
@@ -44,16 +44,13 @@ def __call__(
             self._setup(device=device)
             self.crossencoder_setup = True
 
-        batch_sample_tokens = dependencies["sample_tokens"]
         batch_texts = dependencies["sample_texts"]
         deberta_batch_size = (
             self.nli_model.batch_size
         )
         batch_input_texts = dependencies["input_texts"]
-        batch_greedy_tokens = dependencies["greedy_tokens"]
         batch_greedy_texts = dependencies["greedy_texts"]
 
-        special_tokens = list(model.tokenizer.added_tokens_decoder.keys())
 
         batch_pairs = []
         batch_invs = []
diff --git a/src/lm_polygraph/stat_calculators/rouge_matrix.py b/src/lm_polygraph/stat_calculators/rouge_matrix.py
new file mode 100644
index 000000000..f99c819ae
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/rouge_matrix.py
@@ -0,0 +1,67 @@
+import numpy as np
+
+import itertools
+from typing import Dict, List
+
+from .stat_calculator import StatCalculator
+from lm_polygraph.utils.model import WhiteboxModel
+import torch.nn as nn
+import torch
+from rouge_score import rouge_scorer
+
+
+class RougeLSemanticMatrixCalculator(StatCalculator):
+    """
+    Calculates the pairwise ROUGE-L F-measure similarity matrix for generation samples.
+    """
+
+    def __init__(self):
+        super().__init__(
+            [
+                "rouge_semantic_matrix",
+            ],
+            ["sample_texts"],
+        )
+        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+
+        batch_texts = dependencies["sample_texts"]
+
+        batch_pairs = []
+        batch_invs = []
+        batch_counts = []
+        for texts in batch_texts:
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unique outputs
+            unique_texts, inv = np.unique(texts, return_inverse=True)
+            batch_pairs.append(list(itertools.product(unique_texts, unique_texts)))
+            batch_invs.append(inv)
+            batch_counts.append(len(unique_texts))
+
+        E = []
+
+        for i, pairs in enumerate(batch_pairs):
+            sim_mat = []
+            for first_texts, second_texts in pairs:
+                sim_mat.append(self.scorer.score(first_texts, second_texts)['rougeL'].fmeasure)
+
+            sim_mat = np.array(sim_mat)
+            unique_mat_shape = (batch_counts[i], batch_counts[i])
+            sim_mat = sim_mat.reshape(unique_mat_shape)
+
+            inv = batch_invs[i]
+
+            E.append(sim_mat[inv, :][:, inv])
+
+        E = np.stack(E)
+
+        return {
+            "rouge_semantic_matrix": E,
+        }
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index 926c6b164..fa96605e1 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -75,6 +75,8 @@ def _register(calculator_class: StatCalculator):
         _register(EnsembleTokenLevelDataCalculator())
         _register(CrossEncoderSimilarityMatrixCalculator(nli_model=nli_model))
         _register(GreedySimilarityCalculator(nli_model=nli_model))
+        _register(RougeLSemanticMatrixCalculator())
+        _register(GreedyRougeLSemanticMatrixCalculator())
         _register(GreedyAlternativesNLICalculator(nli_model=nli_model))
         _register(SampleAlternativesNLICalculator(nli_model=nli_model))
         _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model))

From 5701a477a4d9c96149d57b74fea8e2490be990c0 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 22 Jan 2025 01:24:01 +0400
Subject: [PATCH 79/97] Add align matrix

---
 src/lm_polygraph/stat_calculators/__init__.py |  2 +
 .../stat_calculators/align_matrix.py          | 83 +++++++++++++++++++
 .../stat_calculators/greedy_align_matrix.py   | 73 ++++++++++++++++
 .../stat_calculators/greedy_rouge_matrix.py   |  2 +-
 .../greedy_semantic_matrix.py                 | 43 ++++++----
 .../stat_calculators/greedy_similarity.py     | 16 +++-
 6 files changed, 202 insertions(+), 17 deletions(-)
 create mode 100644 src/lm_polygraph/stat_calculators/align_matrix.py
 create mode 100644 src/lm_polygraph/stat_calculators/greedy_align_matrix.py

diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index b01f9c541..7f7e37aef 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -28,3 +28,5 @@
 from .greedy_semantic_matrix import GreedySemanticMatrixCalculator
 from .rouge_matrix import RougeLSemanticMatrixCalculator
 from .greedy_rouge_matrix import GreedyRougeLSemanticMatrixCalculator
+from .align_matrix import AlignMatrixCalculator
+from .greedy_align_matrix import GreedyAlignMatrixCalculator
diff --git a/src/lm_polygraph/stat_calculators/align_matrix.py b/src/lm_polygraph/stat_calculators/align_matrix.py
new file mode 100644
index 000000000..9be56ddd3
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/align_matrix.py
@@ -0,0 +1,83 @@
+import numpy as np
+
+import itertools
+from typing import Dict, List
+
+from .stat_calculator import StatCalculator
+from lm_polygraph.utils.model import WhiteboxModel
+import torch.nn as nn
+import torch
+
+
+class AlignMatrixCalculator(StatCalculator):
+    """
+    Builds a pairwise similarity matrix over generation samples using the supplied scorer.
+    """
+
+    def __init__(self, scorer):
+        super().__init__(
+            [
+                "align_semantic_matrix",
+            ],
+            ["sample_texts"],
+        )
+        self.scorer = scorer
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        """
+        Scores every pair of sampled generations with the scorer supplied at construction.
+
+        Parameters:
+            dependencies (Dict[str, np.ndarray]): input statistics, containing:
+                - 'sample_texts' (List[List[str]]): several sampling generations
+                    for each input text in the batch.
+            texts (List[str]): Input texts batch used for model generation.
+            model (Model): Model used for generation.
+            max_new_tokens (int): Maximum number of new tokens at model generation. Default: 100.
+        Returns:
+            Dict[str, np.ndarray]: dictionary with the following items:
+                - 'semantic_matrix_entail' (List[np.array]): for each input text: quadratic matrix of size
+                    n_samples x n_samples, with probabilities of 'ENTAILMENT' output of DeBERTa.
+                - 'semantic_matrix_contra' (List[np.array]): for each input text: quadratic matrix of size
+                    n_samples x n_samples, with probabilities of 'CONTRADICTION' output of DeBERTa.
+                - 'semantic_matrix_classes' (List[np.array]): for each input text: quadratic matrix of size
+                    n_samples x n_samples, with the NLI label id corresponding to the DeBERTa prediction.
+        """
+        batch_texts = dependencies["sample_texts"]
+
+        batch_pairs = []
+        batch_invs = []
+        batch_counts = []
+        for texts in batch_texts:
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unique outputs
+            unique_texts, inv = np.unique(texts, return_inverse=True)
+            batch_pairs.append(list(itertools.product(unique_texts, unique_texts)))
+            batch_invs.append(inv)
+            batch_counts.append(len(unique_texts))
+
+        E = []
+
+        for i, pairs in enumerate(batch_pairs):
+            first_texts, second_texts = zip(*pairs)
+            sim_mat = np.array(self.scorer.score(claims=first_texts, contexts=second_texts))
+
+            unique_mat_shape = (batch_counts[i], batch_counts[i])
+
+            sim_mat = sim_mat.reshape(unique_mat_shape)
+
+            inv = batch_invs[i]
+
+            E.append(sim_mat[inv, :][:, inv])
+
+        E = np.stack(E)
+
+        return {
+            "align_semantic_matrix": E,
+        }
diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
new file mode 100644
index 000000000..c7a6be6ee
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
@@ -0,0 +1,73 @@
+import numpy as np
+
+import itertools
+from typing import Dict, List
+
+from .stat_calculator import StatCalculator
+from lm_polygraph.utils.model import WhiteboxModel
+import torch.nn as nn
+import torch
+
+
+class GreedyAlignMatrixCalculator(StatCalculator):
+    """
+    Scores the greedy generation against each sampled generation using the supplied scorer.
+    """
+
+    def __init__(self, scorer):
+        super().__init__(
+            [
+                "greedy_align_semantic_matrix_forward",
+                "greedy_align_semantic_matrix_backward",
+                "greedy_align_semantic_matrix",
+            ],
+            ["greedy_texts", "sample_texts"],
+        )
+        self.scorer = scorer
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        batch_texts = dependencies["sample_texts"]
+        batch_greedy_texts = dependencies["greedy_texts"]
+
+        batch_pairs = []
+        batch_invs = []
+        for texts, greedy_text in zip(batch_texts, batch_greedy_texts):
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unique outputs
+            unique_texts, inv = np.unique(texts, return_inverse=True)
+            batch_pairs.append(list(itertools.product([greedy_text], unique_texts)))
+            batch_invs.append(inv)
+
+        E_f = []
+        E_b = []
+        E = []
+
+        for i, pairs in enumerate(batch_pairs):
+            sim_mat_f = []
+            sim_mat_b = []
+            first_texts, second_texts = zip(*pairs)
+            sim_mat_f = np.array(self.scorer.score(claims=first_texts, contexts=second_texts))
+            sim_mat_b = np.array(self.scorer.score(claims=second_texts, contexts=first_texts))
+
+            inv = batch_invs[i]
+
+            E_f.append(sim_mat_f[inv])
+            E_b.append(sim_mat_b[inv])
+            E.append((sim_mat_f[inv] + sim_mat_b[inv]) / 2)
+
+
+        E_f = np.stack(E_f)
+        E_b = np.stack(E_b)
+        E = np.stack(E)
+
+        return {
+            "greedy_align_semantic_matrix_forward": E_f,
+            "greedy_align_semantic_matrix_backward": E_b,
+            "greedy_align_semantic_matrix": E,
+        }
diff --git a/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py
index ebcc14373..c863b3a43 100644
--- a/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py
@@ -13,7 +13,7 @@ class GreedyRougeLSemanticMatrixCalculator(StatCalculator):
     def __init__(self):
         super().__init__(
             [
-                "greedy_semantic_matrix",
+                "greedy_rouge_semantic_matrix",
             ],
             ["greedy_texts", "sample_texts"],
         )
diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
index 6185f2d9f..d4ae280ef 100644
--- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
@@ -19,8 +19,9 @@ class GreedySemanticMatrixCalculator(StatCalculator):
     def __init__(self, nli_model):
         super().__init__(
             [
-                "greedy_semantic_matrix_entail",
-                "greedy_semantic_matrix_contra",
+                "greedy_semantic_matrix_forward",
+                "greedy_semantic_matrix_backward",
+                "greedy_semantic_matrix",
             ],
             ["greedy_texts", "sample_texts"],
         )
@@ -51,38 +52,52 @@ def __call__(
 
         device = deberta.device
         ent_id = deberta.deberta.config.label2id["ENTAILMENT"]
-        contra_id = deberta.deberta.config.label2id["CONTRADICTION"]
 
         softmax = nn.Softmax(dim=1)
         tokenizer = deberta.deberta_tokenizer
 
+        E_f = []
+        E_b = []
         E = []
-        C = []
 
         for i, pairs in enumerate(batch_pairs):
             dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size)
-            probs = []
+            probs_f = []
+            probs_b = []
+
             for first_texts, second_texts in dl:
                 batch = list(zip(first_texts, second_texts))
                 encoded = tokenizer.batch_encode_plus(
                     batch, padding=True, return_tensors="pt"
                 ).to(device)
                 logits = deberta.deberta(**encoded).logits.detach().to(device)
-                probs.append(softmax(logits).cpu().detach())
-            probs = torch.cat(probs, dim=0)
+                probs_f.append(softmax(logits).cpu().detach())
+
+                batch = list(zip(second_texts, first_texts))
+                encoded = tokenizer.batch_encode_plus(
+                    batch, padding=True, return_tensors="pt"
+                ).to(device)
+                logits = deberta.deberta(**encoded).logits.detach().to(device)
+                probs_b.append(softmax(logits).cpu().detach())
+
+            probs_f = torch.cat(probs_f, dim=0)
+            probs_b = torch.cat(probs_b, dim=0)
 
             inv = batch_invs[i]
 
-            entail_probs = probs[:, ent_id]
-            contra_probs = probs[:, contra_id]
+            entail_probs_f = probs_f[:, ent_id]
+            entail_probs_b = probs_b[:, ent_id]
 
-            E.append(entail_probs[inv].numpy())
-            C.append(contra_probs[inv].numpy())
+            E_f.append(entail_probs_f[inv].numpy())
+            E_b.append(entail_probs_b[inv].numpy())
+            E.append((entail_probs_f[inv].numpy() + entail_probs_b[inv].numpy()) / 2)
 
+        E_f = np.stack(E_f)
+        E_b = np.stack(E_b)
         E = np.stack(E)
-        C = np.stack(C)
 
         return {
-            "greedy_semantic_matrix_entail": E,
-            "greedy_semantic_matrix_contra": C,
+            "greedy_semantic_matrix_forward": E_f,
+            "greedy_semantic_matrix_backward": E_b,
+            "greedy_semantic_matrix": E,
         }
diff --git a/src/lm_polygraph/stat_calculators/greedy_similarity.py b/src/lm_polygraph/stat_calculators/greedy_similarity.py
index c3da31778..cf2435985 100644
--- a/src/lm_polygraph/stat_calculators/greedy_similarity.py
+++ b/src/lm_polygraph/stat_calculators/greedy_similarity.py
@@ -17,6 +17,8 @@ class GreedySimilarityCalculator(StatCalculator):
     def __init__(self, nli_model):
         super().__init__(
             [
+                "greedy_sentence_similarity_forward",
+                "greedy_sentence_similarity_backward",
                 "greedy_sentence_similarity",
             ],
             ["input_texts", "sample_texts", "greedy_texts"],
@@ -62,16 +64,26 @@ def __call__(
             batch_pairs.append(list(itertools.product([greedy_text], unique_texts)))
             batch_invs.append(inv)
 
+        sim_arrays_f = []
+        sim_arrays_b = []
         sim_arrays = []
         for i, pairs in tqdm(enumerate(batch_pairs)):
-            sim_scores = self.crossencoder.predict(pairs, batch_size=deberta_batch_size)
+            pairs_b = [(b, a) for a, b in pairs]
+            sim_scores_f = self.crossencoder.predict(pairs, batch_size=deberta_batch_size)
+            sim_scores_b = self.crossencoder.predict(pairs_b, batch_size=deberta_batch_size)
 
             inv = batch_invs[i]
 
-            sim_arrays.append(sim_scores[inv])
+            sim_arrays_f.append(sim_scores_f[inv])
+            sim_arrays_b.append(sim_scores_b[inv])
+            sim_arrays.append((sim_scores_f[inv] + sim_scores_b[inv]) / 2)
 
+        sim_arrays_f = np.stack(sim_arrays_f)
+        sim_arrays_b = np.stack(sim_arrays_b)
         sim_arrays = np.stack(sim_arrays)
 
         return {
+            "greedy_sentence_similarity_forward": sim_arrays_f,
+            "greedy_sentence_similarity_backward": sim_arrays_b,
             "greedy_sentence_similarity": sim_arrays,
         }

From 7ab3691e2dd2951c4ea1a8282870683ea5fde3f4 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 22 Jan 2025 01:34:09 +0400
Subject: [PATCH 80/97] add tqdm

---
 src/lm_polygraph/stat_calculators/align_matrix.py           | 3 ++-
 src/lm_polygraph/stat_calculators/greedy_align_matrix.py    | 3 ++-
 src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py | 3 ++-
 src/lm_polygraph/stat_calculators/semantic_matrix.py        | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/align_matrix.py b/src/lm_polygraph/stat_calculators/align_matrix.py
index 9be56ddd3..4df7fa1f8 100644
--- a/src/lm_polygraph/stat_calculators/align_matrix.py
+++ b/src/lm_polygraph/stat_calculators/align_matrix.py
@@ -7,6 +7,7 @@
 from lm_polygraph.utils.model import WhiteboxModel
 import torch.nn as nn
 import torch
+from tqdm import tqdm
 
 
 class AlignMatrixCalculator(StatCalculator):
@@ -64,7 +65,7 @@ def __call__(
 
         E = []
 
-        for i, pairs in enumerate(batch_pairs):
+        for i, pairs in tqdm(enumerate(batch_pairs)):
             first_texts, second_texts = zip(*pairs)
             sim_mat = np.array(self.scorer.score(claims=first_texts, contexts=second_texts))
 
diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
index c7a6be6ee..eb767f3ac 100644
--- a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
@@ -7,6 +7,7 @@
 from lm_polygraph.utils.model import WhiteboxModel
 import torch.nn as nn
 import torch
+from tqdm import tqdm
 
 
 class GreedyAlignMatrixCalculator(StatCalculator):
@@ -48,7 +49,7 @@ def __call__(
         E_b = []
         E = []
 
-        for i, pairs in enumerate(batch_pairs):
+        for i, pairs in tqdm(enumerate(batch_pairs)):
             sim_mat_f = []
             sim_mat_b = []
             first_texts, second_texts = zip(*pairs)
diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
index d4ae280ef..a5e5cc9df 100644
--- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
@@ -7,6 +7,7 @@
 from lm_polygraph.utils.model import WhiteboxModel
 import torch.nn as nn
 import torch
+from tqdm import tqdm
 
 softmax = nn.Softmax(dim=1)
 
@@ -65,7 +66,7 @@ def __call__(
             probs_f = []
             probs_b = []
 
-            for first_texts, second_texts in dl:
+            for first_texts, second_texts in tqdm(dl):
                 batch = list(zip(first_texts, second_texts))
                 encoded = tokenizer.batch_encode_plus(
                     batch, padding=True, return_tensors="pt"
diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py
index 8a6862f9d..c8ede60a6 100644
--- a/src/lm_polygraph/stat_calculators/semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py
@@ -7,6 +7,7 @@
 from lm_polygraph.utils.model import WhiteboxModel
 import torch.nn as nn
 import torch
+from tqdm import tqdm
 
 softmax = nn.Softmax(dim=1)
 
@@ -82,7 +83,7 @@ def __call__(
         C = []
         P = []
 
-        for i, pairs in enumerate(batch_pairs):
+        for i, pairs in tqdm(enumerate(batch_pairs)):
             dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size)
             probs = []
             for first_texts, second_texts in dl:

From 90eacd1d80be916d279f5e9837a4bb8ebda533ef Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 22 Jan 2025 01:57:51 +0400
Subject: [PATCH 81/97] Fix issue with empty samples

---
 src/lm_polygraph/stat_calculators/align_matrix.py        | 1 +
 src/lm_polygraph/stat_calculators/greedy_align_matrix.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/lm_polygraph/stat_calculators/align_matrix.py b/src/lm_polygraph/stat_calculators/align_matrix.py
index 4df7fa1f8..a65bfad76 100644
--- a/src/lm_polygraph/stat_calculators/align_matrix.py
+++ b/src/lm_polygraph/stat_calculators/align_matrix.py
@@ -58,6 +58,7 @@ def __call__(
         for texts in batch_texts:
             # Sampling from LLM often produces significant number of identical
             # outputs. We only need to score pairs of unqiue outputs
+            texts = [text if text.strip() != "" else "<empty>" for text in texts]
             unique_texts, inv = np.unique(texts, return_inverse=True)
             batch_pairs.append(list(itertools.product(unique_texts, unique_texts)))
             batch_invs.append(inv)
diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
index eb767f3ac..a467e92e6 100644
--- a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
@@ -41,6 +41,7 @@ def __call__(
         for texts, greedy_text in zip(batch_texts, batch_greedy_texts):
             # Sampling from LLM often produces significant number of identical
             # outputs. We only need to score pairs of unqiue outputs
+            texts = [text if text.strip() != "" else "<empty>" for text in texts]
             unique_texts, inv = np.unique(texts, return_inverse=True)
             batch_pairs.append(list(itertools.product([greedy_text], unique_texts)))
             batch_invs.append(inv)

From 823ba26b9c46af4deacdb321d59363417149483b Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Wed, 29 Jan 2025 19:22:27 +0400
Subject: [PATCH 82/97] Add ablation-related methods

---
 src/lm_polygraph/estimators/__init__.py       |  10 +-
 .../estimators/adj_sum_semantic_entropies.py  | 213 ++++++++++++++++++
 ..._semantic_average_ue_average_similarity.py |  27 +++
 .../greedy_sum_semantic_entropies.py          |  70 ------
 src/lm_polygraph/estimators/prob_cocoa.py     | 144 ++++++++++++
 .../semantic_average_ue_average_similarity.py |   1 -
 .../estimators/sum_semantic_entropies.py      | 142 +++++++++++-
 .../supervised_sum_semantic_entropies.py      | 179 +++++++++++++++
 .../stat_calculators/semantic_matrix.py       |  12 +-
 .../utils/register_stat_calculators.py        |  19 +-
 10 files changed, 725 insertions(+), 92 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/adj_sum_semantic_entropies.py
 delete mode 100644 src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py
 create mode 100644 src/lm_polygraph/estimators/prob_cocoa.py
 create mode 100644 src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 648d5a1ab..ea76573dc 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -104,7 +104,13 @@
     GreedySemanticEnrichedTokenSARAveDissimilarity ,
     GreedySemanticEnrichedMaxprobAveDissimilarity,
     GreedySemanticEnrichedMTEAveDissimilarity,
+    GreedyAveDissimilarity
 )
 from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE
-from .sum_semantic_entropies import SumSemanticMaxprob, SumSemanticPPL
-from .greedy_sum_semantic_entropies import GreedySumSemanticMaxprob, GreedySumSemanticPPL
+
+from .sum_semantic_entropies import SumSemanticMaxprob, SumSemanticPPL, SumSemanticMTE, GreedySumSemanticMaxprob, GreedySumSemanticPPL, GreedySumSemanticMTE
+from .adj_sum_semantic_entropies import AdjustedSumSemanticMaxprob, AdjustedSumSemanticPPL, AdjustedSumSemanticMTE, GreedyAdjustedSumSemanticMaxprob, GreedyAdjustedSumSemanticPPL, GreedyAdjustedSumSemanticMTE
+
+from .prob_cocoa import ProbCocoaMaxprob, ProbCocoaPPL, GreedyProbCocoaMaxprob, GreedyProbCocoaPPL
+
+from .supervised_sum_semantic_entropies import SupSumSemanticMaxprob, SupSumSemanticPPL, SupSumSemanticMTE, GreedySupSumSemanticMaxprob, GreedySupSumSemanticPPL, GreedySupSumSemanticMTE
diff --git a/src/lm_polygraph/estimators/adj_sum_semantic_entropies.py b/src/lm_polygraph/estimators/adj_sum_semantic_entropies.py
new file mode 100644
index 000000000..1d780ca1c
--- /dev/null
+++ b/src/lm_polygraph/estimators/adj_sum_semantic_entropies.py
@@ -0,0 +1,213 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS
+
+
+class AdjustedSumSemanticMaxprob(Estimator):
+    """Selected sample's summed negative log-prob, scaled by ``1 + d``.
+
+    ``d`` is the mean of that sample's dissimilarity row (see ``__call__``).
+    """
+
+    def __init__(self, verbose: bool = False, sample_strategy: str = "first"):
+        dependencies = ["sample_sentence_similarity", "sample_log_probs"]
+        super().__init__(dependencies + SAMPLE_SELECTION_STAT_KEYS, "sequence")
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        prefix = sample_strategy_to_prefix(self.sample_strategy)
+        return prefix + "AdjustedSumSemanticMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u + d * u`` for every input in the batch."""
+        all_log_probs = stats["sample_log_probs"]
+        all_similarities = stats["sample_sentence_similarity"]
+        chosen_ids = best_sample_ids(self.sample_strategy, stats)
+
+        scores = []
+        for idx, log_probs, similarity in zip(
+            chosen_ids, all_log_probs, all_similarities
+        ):
+            # Dissimilarity of the selected sample to every sample of the input.
+            dissimilarity = 1 - similarity[idx, :]
+            # The selected sample's own entry is pinned to 1 before averaging.
+            dissimilarity[idx] = 1
+            # u: negated sum of the selected sample's log-probs.
+            uncertainty = -np.sum(log_probs[idx])
+            scores.append(uncertainty + np.mean(dissimilarity) * uncertainty)
+
+        return np.array(scores)
+
+
+class AdjustedSumSemanticPPL(Estimator):
+    """Selected sample's mean negative log-likelihood, scaled by ``1 + d``.
+
+    ``d`` is the mean of that sample's dissimilarity row (see ``__call__``).
+    """
+
+    def __init__(self, verbose: bool = False, sample_strategy: str = "first"):
+        dependencies = ["sample_sentence_similarity", "sample_log_likelihoods"]
+        super().__init__(dependencies + SAMPLE_SELECTION_STAT_KEYS, "sequence")
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        prefix = sample_strategy_to_prefix(self.sample_strategy)
+        return prefix + "AdjustedSumSemanticPPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u + d * u`` for every input in the batch."""
+        all_log_likelihoods = stats["sample_log_likelihoods"]
+        all_similarities = stats["sample_sentence_similarity"]
+        chosen_ids = best_sample_ids(self.sample_strategy, stats)
+
+        scores = []
+        for idx, log_likelihoods, similarity in zip(
+            chosen_ids, all_log_likelihoods, all_similarities
+        ):
+            # Dissimilarity of the selected sample to every sample of the input.
+            dissimilarity = 1 - similarity[idx, :]
+            # The selected sample's own entry is pinned to 1 before averaging.
+            dissimilarity[idx] = 1
+            # u: negated mean of the selected sample's log-likelihoods.
+            uncertainty = -np.mean(log_likelihoods[idx])
+            scores.append(uncertainty + np.mean(dissimilarity) * uncertainty)
+
+        return np.array(scores)
+
+
+class AdjustedSumSemanticMTE(Estimator):
+    """Selected sample's ``sample_entropy`` value, scaled by ``1 + d``.
+
+    ``d`` is the mean of that sample's dissimilarity row (see ``__call__``).
+    """
+
+    def __init__(self, verbose: bool = False, sample_strategy: str = "first"):
+        dependencies = ["sample_sentence_similarity", "sample_entropy"]
+        super().__init__(dependencies + SAMPLE_SELECTION_STAT_KEYS, "sequence")
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        prefix = sample_strategy_to_prefix(self.sample_strategy)
+        return prefix + "AdjustedSumSemanticMTE"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u + d * u`` for every input in the batch."""
+        all_entropies = stats["sample_entropy"]
+        all_similarities = stats["sample_sentence_similarity"]
+        chosen_ids = best_sample_ids(self.sample_strategy, stats)
+
+        scores = []
+        for idx, entropies, similarity in zip(
+            chosen_ids, all_entropies, all_similarities
+        ):
+            # Dissimilarity of the selected sample to every sample of the input.
+            dissimilarity = 1 - similarity[idx, :]
+            # The selected sample's own entry is pinned to 1 before averaging.
+            dissimilarity[idx] = 1
+            # u: the selected sample's entropy entry, taken as stored.
+            uncertainty = entropies[idx]
+            scores.append(uncertainty + np.mean(dissimilarity) * uncertainty)
+
+        return np.array(scores)
+
+
+class GreedyAdjustedSumSemanticMaxprob(Estimator):
+    """Greedy output's summed negative log-likelihood, scaled by ``1 + d``."""
+
+    def __init__(self, verbose: bool = False):
+        dependencies = ["greedy_sentence_similarity", "greedy_log_likelihoods"]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedyAdjustedSumSemanticMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u + d * u`` for every input in the batch.
+
+        ``u`` is the negated sum of the entry's ``greedy_log_likelihoods`` and
+        ``d`` is one minus the mean of its ``greedy_sentence_similarity`` values.
+        """
+        similarities = stats["greedy_sentence_similarity"]
+        # Collapse each entry's log-likelihoods into a single total up front.
+        totals = np.array([np.sum(lls) for lls in stats["greedy_log_likelihoods"]])
+
+        scores = []
+        for total_ll, similarity in zip(totals, similarities):
+            uncertainty = -total_ll
+            # No self-entry handling here: every similarity value is averaged.
+            dissimilarity = 1 - np.mean(similarity)
+            scores.append(uncertainty + dissimilarity * uncertainty)
+
+        return np.array(scores)
+
+
+class GreedyAdjustedSumSemanticPPL(Estimator):
+    """Greedy output's mean negative log-likelihood, scaled by ``1 + d``."""
+
+    def __init__(self, verbose: bool = False):
+        dependencies = ["greedy_sentence_similarity", "greedy_log_likelihoods"]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedyAdjustedSumSemanticPPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u + d * u`` for every input in the batch.
+
+        ``u`` is the negated mean of the entry's ``greedy_log_likelihoods`` and
+        ``d`` is one minus the mean of its ``greedy_sentence_similarity`` values.
+        """
+        all_log_likelihoods = stats["greedy_log_likelihoods"]
+        all_similarities = stats["greedy_sentence_similarity"]
+
+        scores = []
+        for log_likelihoods, similarity in zip(
+            all_log_likelihoods, all_similarities
+        ):
+            uncertainty = -np.mean(log_likelihoods)
+            # No self-entry handling here: every similarity value is averaged.
+            dissimilarity = 1 - np.mean(similarity)
+            scores.append(uncertainty + dissimilarity * uncertainty)
+
+        return np.array(scores)
+
+
+class GreedyAdjustedSumSemanticMTE(Estimator):
+    """Mean of the ``entropy`` stat per input, scaled by ``1 + d``."""
+
+    def __init__(self, verbose: bool = False):
+        super().__init__(["greedy_sentence_similarity", "entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedyAdjustedSumSemanticMTE"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u + d * u`` for every input in the batch.
+
+        ``u`` is the mean of the entry's ``entropy`` values and ``d`` is one
+        minus the mean of its ``greedy_sentence_similarity`` values.
+        """
+        # Read the declared "entropy" dependency; "greedy_log_likelihoods" is
+        # not requested by this estimator and holds log-likelihoods instead.
+        batch_greedy_entropies = stats["entropy"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_mte = []
+        for greedy_entropies, greedy_sentence_similarity in zip(
+            batch_greedy_entropies, batch_greedy_sentence_similarity
+        ):
+            mte = np.mean(greedy_entropies)
+            avg_dissimilarity = 1 - np.mean(greedy_sentence_similarity)
+            enriched_mte.append(mte + avg_dissimilarity * mte)
+
+        return np.array(enriched_mte)
diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
index d2b182741..69611957b 100644
--- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
@@ -355,3 +355,30 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             enriched_entropy.append(enriched_value)
 
         return np.array(enriched_entropy)
+
+
+class GreedyAveDissimilarity(Estimator):
+    """Average dissimilarity taken from ``greedy_sentence_similarity`` alone."""
+
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedyAveDissimilarity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``mean(1 - similarity)`` for every input in the batch."""
+        # Only the declared dependency is read. The earlier stats["entropy"]
+        # lookup never contributed to the score and is not listed in the
+        # dependencies, so that key is not guaranteed to be present.
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        res = [
+            np.mean(1 - greedy_sentence_similarity)
+            for greedy_sentence_similarity in batch_greedy_sentence_similarity
+        ]
+        return np.array(res)
diff --git a/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py b/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py
deleted file mode 100644
index 1280dadc0..000000000
--- a/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import numpy as np
-
-from typing import Dict
-from copy import deepcopy
-
-from .estimator import Estimator
-from .common import sample_strategy_to_prefix, best_sample_ids
-
-
-class GreedySumSemanticMaxprob(Estimator):
-    def __init__(
-        self,
-        verbose: bool = False,
-    ):
-        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
-        self.verbose = verbose
-
-    def __str__(self):
-        return "GreedySumSemanticMaxprob"
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
-        batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]])
-
-        enriched_metrics = []  # To store enriched metrics for each sample
-        for greedy_ll, greedy_sentence_similarity in zip(
-            batch_lls, batch_greedy_sentence_similarity
-        ):
-            # Compute probabilities (negative log-probs)
-            prob = -greedy_ll
-
-            # Compute row-wise average similarity, excluding self-similarity
-            # Diagonal contains self-similarities
-            avg_similarity = np.mean(greedy_sentence_similarity)
-
-            enriched_metrics.append(prob - np.log(avg_similarity))
-
-        return np.array(enriched_metrics)
-
-
-class GreedySumSemanticPPL(Estimator):
-    def __init__(
-        self,
-        verbose: bool = False,
-    ):
-        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
-        self.verbose = verbose
-
-    def __str__(self):
-        return "GreedySumSemanticPPL"
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
-        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
-
-        enriched_ppl = []  # To store enriched PPL for each sample
-
-        for greedy_log_likelihoods, greedy_sentence_similarity in zip(
-            batch_greedy_log_likelihoods, batch_greedy_sentence_similarity
-        ):
-            # get PPL for each sample
-            ppl = -np.mean(greedy_log_likelihoods)
-
-            #  Compute row-wise average similarity, excluding self-similarity
-            avg_similarity = np.mean(greedy_sentence_similarity)
-
-            enriched_ppl.append(ppl - np.log(avg_similarity))
-
-
-        return np.array(enriched_ppl)
diff --git a/src/lm_polygraph/estimators/prob_cocoa.py b/src/lm_polygraph/estimators/prob_cocoa.py
new file mode 100644
index 000000000..cf3483fac
--- /dev/null
+++ b/src/lm_polygraph/estimators/prob_cocoa.py
@@ -0,0 +1,144 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS
+
+
+class ProbCocoaMaxprob(Estimator):
+    """Product ``u * d`` for the selected sample.
+
+    ``u = 1 - exp(sum of log-probs)``; ``d`` is its mean dissimilarity row.
+    """
+
+    def __init__(self, verbose: bool = False, sample_strategy: str = "first"):
+        dependencies = ["sample_sentence_similarity", "sample_log_probs"]
+        super().__init__(dependencies + SAMPLE_SELECTION_STAT_KEYS, "sequence")
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        prefix = sample_strategy_to_prefix(self.sample_strategy)
+        return prefix + "ProbCocoaMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u * d`` for every input in the batch."""
+        all_log_probs = stats["sample_log_probs"]
+        all_similarities = stats["sample_sentence_similarity"]
+        chosen_ids = best_sample_ids(self.sample_strategy, stats)
+
+        scores = []
+        for idx, log_probs, similarity in zip(
+            chosen_ids, all_log_probs, all_similarities
+        ):
+            # Dissimilarity of the selected sample to every sample of the input.
+            dissimilarity = 1 - similarity[idx, :]
+            # The selected sample's own entry is pinned to 1 before averaging.
+            dissimilarity[idx] = 1
+            # u: one minus the exponentiated sum of the sample's log-probs.
+            uncertainty = 1 - np.exp(np.sum(log_probs[idx]))
+            scores.append(uncertainty * np.mean(dissimilarity))
+
+        return np.array(scores)
+
+
+class ProbCocoaPPL(Estimator):
+    """Product ``u * d`` for the selected sample.
+
+    ``u = 1 - exp(mean log-likelihood)``; ``d`` is its mean dissimilarity row.
+    """
+
+    def __init__(self, verbose: bool = False, sample_strategy: str = "first"):
+        dependencies = ["sample_sentence_similarity", "sample_log_likelihoods"]
+        super().__init__(dependencies + SAMPLE_SELECTION_STAT_KEYS, "sequence")
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        prefix = sample_strategy_to_prefix(self.sample_strategy)
+        return prefix + "ProbCocoaPPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u * d`` for every input in the batch."""
+        all_log_likelihoods = stats["sample_log_likelihoods"]
+        all_similarities = stats["sample_sentence_similarity"]
+        chosen_ids = best_sample_ids(self.sample_strategy, stats)
+
+        scores = []
+        for idx, log_likelihoods, similarity in zip(
+            chosen_ids, all_log_likelihoods, all_similarities
+        ):
+            # Dissimilarity of the selected sample to every sample of the input.
+            dissimilarity = 1 - similarity[idx, :]
+            # The selected sample's own entry is pinned to 1 before averaging.
+            dissimilarity[idx] = 1
+            # u: one minus the exponentiated mean of the sample's log-likelihoods.
+            uncertainty = 1 - np.exp(np.mean(log_likelihoods[idx]))
+            scores.append(uncertainty * np.mean(dissimilarity))
+
+        return np.array(scores)
+
+
+class GreedyProbCocoaMaxprob(Estimator):
+    """Product of ``1 - exp(summed log-likelihood)`` and mean dissimilarity."""
+
+    def __init__(self, verbose: bool = False):
+        dependencies = ["greedy_sentence_similarity", "greedy_log_likelihoods"]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedyProbCocoaMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u * d`` for every input in the batch.
+
+        ``u`` is ``1 - exp(sum of the entry's greedy_log_likelihoods)`` and
+        ``d`` is one minus the mean of its ``greedy_sentence_similarity`` values.
+        """
+        similarities = stats["greedy_sentence_similarity"]
+        # Collapse each entry's log-likelihoods into a single total up front.
+        totals = np.array([np.sum(lls) for lls in stats["greedy_log_likelihoods"]])
+
+        scores = []
+        for total_ll, similarity in zip(totals, similarities):
+            uncertainty = 1 - np.exp(total_ll)
+            # No self-entry handling here: every similarity value is averaged.
+            dissimilarity = 1 - np.mean(similarity)
+            scores.append(uncertainty * dissimilarity)
+
+        return np.array(scores)
+
+
+class GreedyProbCocoaPPL(Estimator):
+    """Product of ``1 - exp(mean log-likelihood)`` and mean dissimilarity."""
+
+    def __init__(self, verbose: bool = False):
+        dependencies = ["greedy_sentence_similarity", "greedy_log_likelihoods"]
+        super().__init__(dependencies, "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedyProbCocoaPPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return ``u * d`` for every input in the batch.
+
+        ``u`` is ``1 - exp(mean of the entry's greedy_log_likelihoods)`` and
+        ``d`` is one minus the mean of its ``greedy_sentence_similarity`` values.
+        """
+        all_log_likelihoods = stats["greedy_log_likelihoods"]
+        all_similarities = stats["greedy_sentence_similarity"]
+
+        scores = []
+        for log_likelihoods, similarity in zip(
+            all_log_likelihoods, all_similarities
+        ):
+            uncertainty = 1 - np.exp(np.mean(log_likelihoods))
+            # No self-entry handling here: every similarity value is averaged.
+            dissimilarity = 1 - np.mean(similarity)
+            scores.append(uncertainty * dissimilarity)
+
+        return np.array(scores)
diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
index 40fc0a004..f10f02925 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
@@ -530,7 +530,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(best_elements)
 
 
-
 class AveDissimilarity(Estimator):
     def __init__(
         self,
diff --git a/src/lm_polygraph/estimators/sum_semantic_entropies.py b/src/lm_polygraph/estimators/sum_semantic_entropies.py
index 10b19a03b..47f9aad25 100644
--- a/src/lm_polygraph/estimators/sum_semantic_entropies.py
+++ b/src/lm_polygraph/estimators/sum_semantic_entropies.py
@@ -34,10 +34,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for best_id, sample_log_probs, sample_sentence_similarity in zip(
             sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity
         ):
-            sim = sample_sentence_similarity[best_id, :]
+            sim = 1 - sample_sentence_similarity[best_id, :]
             sim[best_id] = 1
             avg_similarity = np.mean(sim)
-            res = -np.sum(sample_log_probs[best_id]) - np.log(avg_similarity)
+            mp = -np.sum(sample_log_probs[best_id])
+            res = mp + avg_similarity
             enriched_metrics.append(res)
 
         return np.array(enriched_metrics)
@@ -70,10 +71,143 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for best_id, sample_log_likelihoods, sample_sentence_similarity in zip(
             sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
-            sim = sample_sentence_similarity[best_id, :]
+            sim = 1 - sample_sentence_similarity[best_id, :]
             sim[best_id] = 1
             avg_similarity = np.mean(sim)
-            res = -np.mean(sample_log_likelihoods[best_id]) - np.log(avg_similarity)
+            ppl = -np.mean(sample_log_likelihoods[best_id])
+            res = ppl + avg_similarity
             enriched_ppl.append(res)
 
         return np.array(enriched_ppl)
+
+
+class SumSemanticMTE(Estimator):  # "sample_entropy" of the selected sample + its average dissimilarity
+    def __init__(
+        self,
+        verbose: bool = False,
+        sample_strategy: str = "first"
+    ):
+        super().__init__(
+            ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        base = "SumSemanticMTE"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_entropies = stats["sample_entropy"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        sample_ids = best_sample_ids(self.sample_strategy, stats)  # presumably one sample index per batch item - verify
+
+        enriched_mte = []  # one combined score per batch item
+
+        for best_id, sample_entropies, sample_sentence_similarity in zip(
+            sample_ids, batch_entropies, batch_sample_sentence_similarity
+        ):
+            sim = 1 - sample_sentence_similarity[best_id, :]  # dissimilarities along the selected sample's row
+            sim[best_id] = 1  # NOTE(review): the self entry is forced to 1, not 0 - confirm this is intended
+            avg_similarity = np.mean(sim)  # an average *dissimilarity*, despite the name
+            mte = sample_entropies[best_id]
+            res = mte + avg_similarity
+            enriched_mte.append(res)
+
+        return np.array(enriched_mte)
+
+
+class GreedySumSemanticMaxprob(Estimator):  # -sum(greedy log-likelihoods) + (1 - mean greedy similarity)
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySumSemanticMaxprob"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+        batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]])
+
+        enriched_metrics = []  # One combined score per batch item
+        for greedy_ll, greedy_sentence_similarity in zip(
+            batch_lls, batch_greedy_sentence_similarity
+        ):
+            # Negated total log-likelihood of the greedy output (not a probability)
+            prob = -greedy_ll
+
+            # Average dissimilarity: 1 - mean over every entry of this item's
+            # greedy_sentence_similarity stat; nothing is excluded from the mean
+            avg_similarity = 1 - np.mean(greedy_sentence_similarity)
+
+            enriched_metrics.append(prob + avg_similarity)
+
+        return np.array(enriched_metrics)
+
+
+class GreedySumSemanticPPL(Estimator):  # -mean(greedy log-likelihoods) + (1 - mean greedy similarity)
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySumSemanticPPL"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_ppl = []  # One combined score per batch item
+
+        for greedy_log_likelihoods, greedy_sentence_similarity in zip(
+            batch_greedy_log_likelihoods, batch_greedy_sentence_similarity
+        ):
+            # Negated mean of the greedy log-likelihoods (the quantity this file calls PPL)
+            ppl = -np.mean(greedy_log_likelihoods)
+
+            # Average dissimilarity over every entry of the stat; nothing is excluded from the mean
+            avg_similarity = 1 - np.mean(greedy_sentence_similarity)
+
+            enriched_ppl.append(ppl + avg_similarity)
+
+
+        return np.array(enriched_ppl)
+
+
+class GreedySumSemanticMTE(Estimator):
+    """Mean of the greedy "entropy" stat plus the average greedy dissimilarity."""
+
+    def __init__(
+        self,
+        verbose: bool = False,
+    ):
+        super().__init__(["greedy_sentence_similarity", "entropy"], "sequence")
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySumSemanticMTE"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Return one score per batch item: mean(entropy) + (1 - mean(similarity))."""
+        # Read the "entropy" stat declared as a dependency in __init__, not the
+        # log-likelihoods, whose mean is a negated PPL rather than an entropy.
+        batch_entropies = stats["entropy"]
+        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+
+        enriched_mte = []
+        for entropies, greedy_sentence_similarity in zip(
+            batch_entropies, batch_greedy_sentence_similarity
+        ):
+            mte = np.mean(entropies)
+            # Dissimilarity averaged over every entry of this item's similarity stat.
+            avg_dissimilarity = 1 - np.mean(greedy_sentence_similarity)
+            enriched_mte.append(mte + avg_dissimilarity)
+
+        return np.array(enriched_mte)
diff --git a/src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py b/src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py
new file mode 100644
index 000000000..06fcd5cd5
--- /dev/null
+++ b/src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py
@@ -0,0 +1,179 @@
+import numpy as np
+
+from typing import Dict
+from copy import deepcopy
+
+from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS
+from sklearn.preprocessing import MinMaxScaler
+
+
+def get_avg_dissim(sample_sentence_similarity, sample_ids):  # per batch item: mean of (1 - similarity) along the chosen row
+    batch_avg_similarity = []  # holds average dissimilarities, despite the name
+    for best_id, sentence_similarity in zip(sample_ids, sample_sentence_similarity):
+        batch_avg_similarity.append(np.mean(1 - sentence_similarity[best_id, :]))  # the [best_id, best_id] entry is part of the mean
+    return batch_avg_similarity
+
+def normalize_and_enrich(batch_metrics, batch_avg_dissimilarity, alpha):  # returns a list: scaled metric + alpha * scaled dissimilarity
+    batch_metrics = MinMaxScaler().fit_transform(np.array(batch_metrics).reshape(-1, 1)).flatten()  # scaler is fit on this call's values only
+    batch_avg_dissimilarity = MinMaxScaler().fit_transform(np.array(batch_avg_dissimilarity).reshape(-1, 1)).flatten()  # scaled independently of the metric
+    enriched_metrics = [metric + avg_dissimilarity * alpha for metric, avg_dissimilarity in zip(batch_metrics, batch_avg_dissimilarity)]  # NOTE(review): a score therefore depends on which other items were passed in
+    return enriched_metrics
+
+
+class SupSumSemanticMaxprob(Estimator):  # scaled -sum(log-probs) of the selected sample + alpha * scaled dissimilarity
+    def __init__(
+        self,
+        verbose: bool = False,
+        sample_strategy: str = "first",
+        alpha: int = 1  # weight on the dissimilarity term; also part of the __str__ name
+    ):
+        super().__init__(
+            ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+        self.alpha = alpha
+
+    def __str__(self):
+        base = f"SupSumSemanticMaxprob_{self.alpha}"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
+        # Negated (summed) log-prob entry of the selected sample, one value per batch item.
+        batch_mps = [-np.sum(log_probs[best_id]) for best_id, log_probs in zip(sample_ids, stats["sample_log_probs"])]
+        batch_avg_dissim = get_avg_dissim(stats["sample_sentence_similarity"], sample_ids)
+        # Both terms are min-max scaled inside normalize_and_enrich before being combined.
+        enriched_metrics = normalize_and_enrich(batch_mps, batch_avg_dissim, self.alpha)
+
+        return np.array(enriched_metrics)
+
+
+class SupSumSemanticPPL(Estimator):  # scaled -mean(log-likelihoods) of the selected sample + alpha * scaled dissimilarity
+    def __init__(
+        self,
+        verbose: bool = False,
+        sample_strategy: str = "first",
+        alpha: int = 1  # weight on the dissimilarity term; also part of the __str__ name
+    ):
+        super().__init__(
+            ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+        self.alpha = alpha
+
+    def __str__(self):
+        base = f"SupSumSemanticPPL_{self.alpha}"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
+        # Negated mean of the selected sample's log-likelihoods, one value per batch item.
+        batch_ppls = [-np.mean(sample_log_likelihoods[best_id]) for best_id, sample_log_likelihoods in zip(sample_ids, stats["sample_log_likelihoods"])]
+        batch_avg_dissim = get_avg_dissim(stats["sample_sentence_similarity"], sample_ids)
+        # Both terms are min-max scaled inside normalize_and_enrich before being combined.
+        enriched_metrics = normalize_and_enrich(batch_ppls, batch_avg_dissim, self.alpha)
+
+        return np.array(enriched_metrics)
+
+
+class SupSumSemanticMTE(Estimator):  # scaled "sample_entropy" of the selected sample + alpha * scaled dissimilarity
+    def __init__(
+        self,
+        verbose: bool = False,
+        sample_strategy: str = "first",
+        alpha: int = 1  # weight on the dissimilarity term; also part of the __str__ name
+    ):
+        super().__init__(
+            ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence"
+        )
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+        self.alpha = alpha
+
+    def __str__(self):
+        base = f"SupSumSemanticMTE_{self.alpha}"
+        return sample_strategy_to_prefix(self.sample_strategy) + base
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        sample_ids = best_sample_ids(self.sample_strategy, stats)
+        # "sample_entropy" entry of the selected sample, one value per batch item.
+        batch_mtes = [entropies[best_id] for best_id, entropies in zip(sample_ids, stats["sample_entropy"])]
+        batch_avg_dissim = get_avg_dissim(stats["sample_sentence_similarity"], sample_ids)
+        # Both terms are min-max scaled inside normalize_and_enrich before being combined.
+        enriched_metrics = normalize_and_enrich(batch_mtes, batch_avg_dissim, self.alpha)
+
+        return np.array(enriched_metrics)
+
+
+class GreedySupSumSemanticMaxprob(Estimator):
+    """Scaled negative greedy log-likelihood plus alpha times scaled dissimilarity."""
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        alpha: int = 1
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.alpha = alpha
+
+    def __str__(self):
+        return f"GreedySupSumSemanticMaxprob_{self.alpha}"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        # Negate the summed log-likelihood so that less likely outputs score higher.
+        batch_mps = [-np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]
+        batch_avg_dissim = [np.mean(1 - sentence_similarity) for sentence_similarity in stats["greedy_sentence_similarity"]]
+        return np.array(normalize_and_enrich(batch_mps, batch_avg_dissim, self.alpha))
+
+
+class GreedySupSumSemanticPPL(Estimator):  # scaled -mean(greedy log-likelihoods) + alpha * scaled greedy dissimilarity
+    def __init__(
+        self,
+        verbose: bool = False,
+        alpha: int = 1  # weight on the dissimilarity term; also part of the __str__ name
+    ):
+        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        self.verbose = verbose
+        self.alpha = alpha
+
+    def __str__(self):
+        return f"GreedySupSumSemanticPPL_{self.alpha}"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_ppls = [-np.mean(greedy_log_likelihoods) for greedy_log_likelihoods in stats["greedy_log_likelihoods"]]
+        batch_avg_dissim = [np.mean(1 - sentence_similarity) for sentence_similarity in stats["greedy_sentence_similarity"]]
+        # Both terms are min-max scaled inside normalize_and_enrich before being combined.
+        enriched_metrics = normalize_and_enrich(batch_ppls, batch_avg_dissim, self.alpha)
+
+        return np.array(enriched_metrics)
+
+
+class GreedySupSumSemanticMTE(Estimator):
+    """Scaled mean of the greedy "entropy" stat plus alpha times scaled dissimilarity."""
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        alpha: int = 1
+    ):
+        super().__init__(["greedy_sentence_similarity", "entropy"], "sequence")
+        self.verbose = verbose
+        self.alpha = alpha
+
+    def __str__(self):
+        return f"GreedySupSumSemanticMTE_{self.alpha}"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        # Read the "entropy" stat declared as a dependency in __init__, not the
+        # log-likelihoods, whose mean is a negated PPL rather than an entropy.
+        batch_mtes = [np.mean(entropies) for entropies in stats["entropy"]]
+        batch_avg_dissim = [np.mean(1 - sentence_similarity) for sentence_similarity in stats["greedy_sentence_similarity"]]
+
+        return np.array(normalize_and_enrich(batch_mtes, batch_avg_dissim, self.alpha))
diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py
index c8ede60a6..57499408a 100644
--- a/src/lm_polygraph/stat_calculators/semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py
@@ -101,17 +101,17 @@ def __call__(
 
             unique_mat_shape = (batch_counts[i], batch_counts[i])
 
-            unique_E = entail_probs.view(unique_mat_shape).numpy()
-            unique_C = contra_probs.view(unique_mat_shape).numpy()
-            unique_P = class_preds.view(unique_mat_shape).numpy()
+            unique_E = entail_probs.view(unique_mat_shape)
+            unique_C = contra_probs.view(unique_mat_shape)
+            unique_P = class_preds.view(unique_mat_shape)
 
             inv = batch_invs[i]
 
             # Recover full matrices from unques by gathering along both axes
             # using inverse index
-            E.append(unique_E[inv, :][:, inv])
-            C.append(unique_C[inv, :][:, inv])
-            P.append(unique_P[inv, :][:, inv])
+            E.append(unique_E.cpu().numpy()[inv, :][:, inv])
+            C.append(unique_C.cpu().numpy()[inv, :][:, inv])
+            P.append(unique_P.cpu().numpy()[inv, :][:, inv])
 
         E = np.stack(E)
         C = np.stack(C)
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index fa96605e1..b453c89fb 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -30,15 +30,16 @@ def register_stat_calculators(
     log.info("=" * 100)
     log.info("Loading NLI model...")
 
-    if language == "en":
-        nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device)
-    elif language in ["zh", "ar", "ru"]:
-        nli_model = MultilingualDeberta(
-            batch_size=deberta_batch_size,
-            device=deberta_device,
-        )
-    else:
-        raise Exception(f"Unsupported language: {language}")
+    #if language == "en":
+    #    nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device)
+    #elif language in ["zh", "ar", "ru"]:
+    #    nli_model = MultilingualDeberta(
+    #        batch_size=deberta_batch_size,
+    #        device=deberta_device,
+    #    )
+    #else:
+    #    raise Exception(f"Unsupported language: {language}")
+    nli_model = None
 
     log.info("=" * 100)
     log.info("Initializing stat calculators...")

From 59062663dbb8d1d2ab0c1f0af1aa9236007f1b93 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Fri, 31 Jan 2025 12:19:59 +0400
Subject: [PATCH 83/97] Final fixes before submit

---
 src/lm_polygraph/estimators/__init__.py       |  20 +-
 ..._semantic_average_ue_average_similarity.py | 216 +++-----------
 .../semantic_average_ue_average_similarity.py | 281 ++++--------------
 .../utils/register_stat_calculators.py        |  18 +-
 4 files changed, 120 insertions(+), 415 deletions(-)

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index ea76573dc..254788487 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -85,25 +85,21 @@
 from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE
 from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE
 from .semantic_average_ue_average_similarity import (
-    SemanticAveMaxprobAveSimilarity, 
-    SemanticAvePPLAveSimilarity, 
-    SemanticAveTokenSARAveSimilarity,
-    SemanticAveMTEAveSimilarity,
-    SemanticEnrichedPPLAveDissimilarity,
-    SemanticEnrichedTokenSARAveDissimilarity ,
     SemanticEnrichedMaxprobAveDissimilarity,
+    SemanticEnrichedPPLAveDissimilarity,
     SemanticEnrichedMTEAveDissimilarity,
+    SemanticEnrichedMaxprobTotalDissimilarity,
+    SemanticEnrichedPPLTotalDissimilarity,
+    SemanticEnrichedMTETotalDissimilarity,
     AveDissimilarity
 )
 from .greedy_semantic_average_ue_average_similarity import (
-    GreedySemanticAveMaxprobAveSimilarity, 
-    GreedySemanticAvePPLAveSimilarity, 
-    GreedySemanticAveTokenSARAveSimilarity,
-    GreedySemanticAveMTEAveSimilarity,
-    GreedySemanticEnrichedPPLAveDissimilarity,
-    GreedySemanticEnrichedTokenSARAveDissimilarity ,
     GreedySemanticEnrichedMaxprobAveDissimilarity,
+    GreedySemanticEnrichedPPLAveDissimilarity,
     GreedySemanticEnrichedMTEAveDissimilarity,
+    GreedySemanticEnrichedMaxprobTotalDissimilarity,
+    GreedySemanticEnrichedPPLTotalDissimilarity,
+    GreedySemanticEnrichedMTETotalDissimilarity,
     GreedyAveDissimilarity
 )
 from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE
diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
index 69611957b..18dbd8ddb 100644
--- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py
@@ -7,7 +7,7 @@
 from .common import sample_strategy_to_prefix, best_sample_ids
 
 
-class GreedySemanticAveMaxprobAveSimilarity(Estimator):
+class GreedySemanticEnrichedMaxprobAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -19,9 +19,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            return "GreedySemanticAveMaxprobAveSimilarityexp"
+            return "GreedySemanticEnrichedMaxprobAveDissimilarityexp"
         else:
-            return "GreedySemanticAveMaxprobAveSimilarity"
+            return "GreedySemanticEnrichedMaxprobAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
@@ -38,41 +38,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             # Compute row-wise average similarity, excluding self-similarity
             # Diagonal contains self-similarities
-            avg_similarity = np.mean(greedy_sentence_similarity)
-
-            # Enrich each metric by scaling it by 1/row_average
-            if avg_similarity == 0:
-                avg_similarity = 1e-10  # Avoid division by zero
+            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
 
-            enriched_metric = prob * (1 / avg_similarity)
+            enriched_metric = prob * avg_dissimilarity
             enriched_metrics.append(enriched_metric)
 
         return np.array(enriched_metrics)
 
 
-class GreedySemanticEnrichedMaxprobAveDissimilarity(Estimator):
+class GreedySemanticEnrichedMaxprobTotalDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
         exp: bool = False,
     ):
-        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        super().__init__(["sample_sentence_similarity", "greedy_log_likelihoods"], "sequence")
         self.verbose = verbose
         self.exp = exp
 
     def __str__(self):
         if self.exp:
-            return "GreedySemanticEnrichedMaxprobAveDissimilarityexp"
+            return "GreedySemanticEnrichedMaxprobTotalDissimilarityexp"
         else:
-            return "GreedySemanticEnrichedMaxprobAveDissimilarity"
+            return "GreedySemanticEnrichedMaxprobTotalDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
         batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]])
 
         enriched_metrics = []  # To store enriched metrics for each sample
-        for greedy_ll, greedy_sentence_similarity in zip(
-            batch_lls, batch_greedy_sentence_similarity
+        for greedy_ll, sample_sentence_similarity in zip(
+            batch_lls, batch_sample_sentence_similarity
         ):
             # Compute probabilities (negative log-probs)
             prob = -greedy_ll
@@ -81,7 +77,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             # Compute row-wise average similarity, excluding self-similarity
             # Diagonal contains self-similarities
-            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
+            avg_dissimilarity = np.mean(1 - np.array(sample_sentence_similarity))
 
             enriched_metric = prob * avg_dissimilarity
             enriched_metrics.append(enriched_metric)
@@ -89,7 +85,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(enriched_metrics)
 
 
-class GreedySemanticAvePPLAveSimilarity(Estimator):
+class GreedySemanticEnrichedPPLAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -101,9 +97,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            return "GreedySemanticAvePPLAveSimilarityexp"
+            return "GreedySemanticEnrichedPPLAveDissimilarityexp"
         else:
-            return "GreedySemanticAvePPLAveSimilarity"
+            return "GreedySemanticEnrichedPPLAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
@@ -119,43 +115,39 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             if self.exp:
                 ppl = -np.exp(-ppl)
 
-            #  Compute row-wise average similarity, excluding self-similarity
-            avg_similarity = np.mean(greedy_sentence_similarity)
-
-            # Enrich each PPL independently by scaling with 1/row_average
-            if avg_similarity == 0:
-                avg_similarity = 1e-10  # Avoid division by zero
+            # Compute row-wise average similarity, excluding self-similarity
+            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
 
-            enriched_value = ppl * (1 / avg_similarity)
+            enriched_value = ppl *  avg_dissimilarity
             enriched_ppl.append(enriched_value)
 
         return np.array(enriched_ppl)
 
 
-class GreedySemanticEnrichedPPLAveDissimilarity(Estimator):
+class GreedySemanticEnrichedPPLTotalDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
         exp: bool = False,
     ):
-        super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence")
+        super().__init__(["sample_sentence_similarity", "greedy_log_likelihoods"], "sequence")
         self.verbose = verbose
         self.exp = exp
 
     def __str__(self):
         if self.exp:
-            return "GreedySemanticEnrichedPPLAveDissimilarityexp"
+            return "GreedySemanticEnrichedPPLTotalDissimilarityexp"
         else:
-            return "GreedySemanticEnrichedPPLAveDissimilarity"
+            return "GreedySemanticEnrichedPPLTotalDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
         batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
-        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
 
         enriched_ppl = []  # To store enriched PPL for each sample
 
-        for greedy_log_likelihoods, greedy_sentence_similarity in zip(
-            batch_greedy_log_likelihoods, batch_greedy_sentence_similarity
+        for greedy_log_likelihoods, sample_sentence_similarity in zip(
+            batch_greedy_log_likelihoods, batch_sample_sentence_similarity
         ):
             # get PPL for each sample
             ppl = -np.mean(greedy_log_likelihoods)
@@ -163,7 +155,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 ppl = -np.exp(-ppl)
 
             # Compute row-wise average similarity, excluding self-similarity
-            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
+            avg_dissimilarity = np.mean(1 - np.array(sample_sentence_similarity))
 
             enriched_value = ppl *  avg_dissimilarity
             enriched_ppl.append(enriched_value)
@@ -171,129 +163,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(enriched_ppl)
 
 
-class GreedySemanticAveTokenSARAveSimilarity(Estimator):
-    def __init__(
-        self,
-        verbose: bool = False,
-        exp: bool = False,
-    ):
-        super().__init__(
-            [
-                "greedy_sentence_similarity",
-                "greedy_log_likelihoods",
-            ],
-            "sequence",
-        )
-        self.verbose = verbose
-        self.exp = exp
-
-    def __str__(self):
-        if self.exp:
-            return "GreedySemanticAveTokenSARAveSimilarityexp"
-        else:
-            return "GreedySemanticAveTokenSARAveSimilarity"
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
-        batch_greedy_token_similarity = stats["token_similarity"]
-        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
-
-        enriched_tokenSAR = []
-
-        for batch_data in zip(
-            batch_greedy_log_likelihoods,
-            batch_greedy_token_similarity,
-            batch_greedy_sentence_similarity,
-        ):
-            log_likelihoods = batch_data[0]
-            token_similarity = batch_data[1]
-            greedy_sentence_similarity = batch_data[2]
-
-            log_likelihoods = np.array(log_likelihoods)
-            R_t = 1 - token_similarity
-            if R_t.sum() == 0:
-                R_t_norm = np.zeros_like(R_t)
-            else:
-                R_t_norm = R_t / R_t.sum()
-            E_t = -log_likelihoods * R_t_norm
-            tokenSAR = E_t.sum()
-
-            if self.exp:
-                tokenSAR = -np.exp(-np.array(tokenSAR))
-
-            #  Compute row-wise average similarity, excluding self-similarity
-            avg_similarity = np.mean(greedy_sentence_similarity)
-
-            # Enrich each PPL independently by scaling with 1/row_average
-            if avg_similarity == 0:
-                avg_similarity = 1e-10  # Avoid division by zero
-
-            enriched_value = tokenSAR * (1 / avg_similarity)
-            enriched_tokenSAR.append(enriched_value)
-
-        return np.array(enriched_tokenSAR)
-
-
-class GreedySemanticEnrichedTokenSARAveDissimilarity(Estimator):
-    def __init__(
-        self,
-        verbose: bool = False,
-        exp: bool = False,
-    ):
-        super().__init__(
-            [
-                "greedy_sentence_similarity",
-                "greedy_log_likelihoods",
-            ],
-            "sequence",
-        )
-        self.verbose = verbose
-        self.exp = exp
-
-    def __str__(self):
-        if self.exp:
-            return "GreedySemanticEnrichedTokenSARAveDissimilarityexp"
-        else:
-            return "GreedySemanticEnrichedTokenSARAveDissimilarity"
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
-        batch_greedy_token_similarity = stats["token_similarity"]
-        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
-
-        enriched_tokenSAR = []
-
-        for batch_data in zip(
-            batch_greedy_log_likelihoods,
-            batch_greedy_token_similarity,
-            batch_greedy_sentence_similarity,
-        ):
-            log_likelihoods = batch_data[0]
-            token_similarity = batch_data[1]
-            greedy_sentence_similarity = batch_data[2]
-
-            log_likelihoods = np.array(log_likelihoods)
-            R_t = 1 - token_similarity
-            if R_t.sum() == 0:
-                R_t_norm = np.zeros_like(R_t)
-            else:
-                R_t_norm = R_t / R_t.sum()
-            E_t = -log_likelihoods * R_t_norm
-            tokenSAR = E_t.sum()
-
-            if self.exp:
-                tokenSAR = -np.exp(-np.array(tokenSAR))
-
-            #  Compute row-wise average similarity, excluding self-similarity
-            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
-
-            enriched_value = tokenSAR * avg_dissimilarity
-            enriched_tokenSAR.append(enriched_value)
-
-        return np.array(enriched_tokenSAR)
-
-
-class GreedySemanticAveMTEAveSimilarity(Estimator):
+class GreedySemanticEnrichedMTEAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -302,7 +172,7 @@ def __init__(
         self.verbose = verbose
 
     def __str__(self):
-        return "GreedySemanticAveMTEAveSimilarity"
+        return "GreedySemanticEnrichedMTEAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_greedy_entropy = stats["entropy"]
@@ -314,43 +184,39 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
             batch_greedy_entropy, batch_greedy_sentence_similarity
         ):
             #  Compute row-wise average similarity, excluding self-similarity
-            avg_similarity = np.mean(greedy_sentence_similarity)
-
-            # Enrich each PPL independently by scaling with 1/row_average
-            if avg_similarity == 0:
-                avg_similarity = 1e-10  # Avoid division by zero
+            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
             
             entropy = np.mean(greedy_entropy)
-            enriched_value = entropy * (1 / avg_similarity)
+            enriched_value = entropy * avg_dissimilarity
             enriched_entropy.append(enriched_value)
 
         return np.array(enriched_entropy)
 
 
-class GreedySemanticEnrichedMTEAveDissimilarity(Estimator):
+class GreedySemanticEnrichedMTETotalDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
     ):
-        super().__init__(["greedy_sentence_similarity", "entropy"], "sequence")
+        super().__init__(["sample_sentence_similarity", "entropy"], "sequence")
         self.verbose = verbose
 
     def __str__(self):
-        return "GreedySemanticEnrichedMTEAveDissimilarity"
+        return "GreedySemanticEnrichedMTETotalDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_greedy_entropy = stats["entropy"]
-        batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"]
+        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
+        batch_entropy = stats["entropy"]
 
         enriched_entropy = []
 
-        for greedy_entropy, greedy_sentence_similarity in zip(
-            batch_greedy_entropy, batch_greedy_sentence_similarity
+        for entropy, sample_sentence_similarity in zip(
+            batch_entropy, batch_sample_sentence_similarity
         ):
             #  Compute row-wise average similarity, excluding self-similarity
-            avg_dissimilarity = np.mean(1 - greedy_sentence_similarity)
-            
-            entropy = np.mean(greedy_entropy)
+            avg_dissimilarity = np.mean(1 - np.array(sample_sentence_similarity))
+
+            entropy = np.mean(entropy)
             enriched_value = entropy * avg_dissimilarity
             enriched_entropy.append(enriched_value)
 
diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
index f10f02925..03956cacb 100644
--- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
+++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py
@@ -7,7 +7,7 @@
 from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS
 
 
-class SemanticAveMaxprobAveSimilarity(Estimator):
+class SemanticEnrichedMaxprobAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -24,9 +24,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            base = "SemanticAveMaxprobAveSimilarityexp"
+            base = "SemanticEnrichedMaxprobAveDissimilarityexp"
         else:
-            base = "SemanticAveMaxprobAveSimilarity"
+            base = "SemanticEnrichedMaxprobAveDissimilarity"
         return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
@@ -39,37 +39,41 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_probs, sample_sentence_similarity in zip(
             batch_sample_log_probs, batch_sample_sentence_similarity
         ):
-            # Compute probabilities (negative log-probs)
+            # Step 1: Compute probabilities (negative log-probs)
             sample_probs = -np.array(sample_log_probs)
             if self.exp:
                 sample_probs = -np.exp(-sample_probs)
 
-            # Compute row-wise average similarity, excluding self-similarity
-            # Diagonal contains self-similarities
-            row_averages = []
+            # Step 2: Compute row-wise sum of dissimilarities (1 - g)
+            row_dissimilarities = []
             for i in range(sample_sentence_similarity.shape[0]):
                 row = sample_sentence_similarity[i]
-                average_similarity = (np.sum(row) - row[i]) / (len(row) - 1)
-                row_averages.append(average_similarity)
+                sum_dissimilarities = np.sum(1 - row) - (1 - row[i])  # Exclude self-similarity
+                row_dissimilarities.append(sum_dissimilarities)
 
-            # Enrich each metric by scaling it by 1/row_average
+            # Step 3: Normalize by (M - 1)
+            normalized_dissimilarities = [
+                dissim / (len(sample_sentence_similarity) - 1)
+                for dissim in row_dissimilarities
+            ]
+
+            # Step 4: Enrich each metric
             enriched_sample_metrics = []
-            for i, (prob, avg_similarity) in enumerate(zip(sample_probs, row_averages)):
-                if avg_similarity == 0:
-                    avg_similarity = 1e-10  # Avoid division by zero
-                enriched_metric = prob * (1 / avg_similarity)
+            for prob, dissim in zip(sample_probs, normalized_dissimilarities):
+                enriched_metric = prob * dissim
                 enriched_sample_metrics.append(enriched_metric)
 
             enriched_metrics.append(np.array(enriched_sample_metrics))
 
-        # Return only metric for the best sample for prr calculation
+        # Return only metric for the best sample for PRR calculation
         best_elements = []
         for best_id, metrics in zip(sample_ids, enriched_metrics):
             best_elements.append(metrics[best_id])
 
         return np.array(best_elements)
 
-class SemanticEnrichedMaxprobAveDissimilarity(Estimator):
+
+class SemanticEnrichedMaxprobTotalDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -86,9 +90,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            base = "SemanticEnrichedMaxprobAveDissimilarityexp"
+            base = "SemanticEnrichedMaxprobTotalDissimilarityexp"
         else:
-            base = "SemanticEnrichedMaxprobAveDissimilarity"
+            base = "SemanticEnrichedMaxprobTotalDissimilarity"
         return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
@@ -119,9 +123,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 for dissim in row_dissimilarities
             ]
 
+            dissim = np.mean(normalized_dissimilarities)
+
             # Step 4: Enrich each metric
             enriched_sample_metrics = []
-            for prob, dissim in zip(sample_probs, normalized_dissimilarities):
+            for prob in sample_probs:
                 enriched_metric = prob * dissim
                 enriched_sample_metrics.append(enriched_metric)
 
@@ -135,11 +141,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(best_elements)
 
 
-class SemanticAvePPLAveSimilarity(Estimator):
+class SemanticEnrichedPPLAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
-        exp: bool = False,
+        exp: bool = False,  
         sample_strategy: str = "first"
     ):
         super().__init__(
@@ -152,9 +158,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            base = "SemanticAvePPLAveSimilarityexp"
+            base = "SemanticEnrichedPPLAveDissimilarityexp"
         else:
-            base = "SemanticAvePPLAveSimilarity"
+            base = "SemanticEnrichedPPLAveDissimilarity"
         return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
@@ -167,36 +173,38 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_log_likelihoods, sample_sentence_similarity in zip(
             batch_sample_log_likelihoods, batch_sample_sentence_similarity
         ):
-            # get PPL for each sample
+            # Step 1: Compute PPL for each sample
             ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods])
             if self.exp:
                 ppl = -np.exp(-ppl)
 
-            #  Compute row-wise average similarity, excluding self-similarity
+            # Step 2: Compute row-wise average dissimilarity (1 - g)
             row_averages = []
             for i in range(sample_sentence_similarity.shape[0]):
                 row = sample_sentence_similarity[i]
-                average_similarity = (np.sum(row) - row[i]) / (len(row) - 1)  # Exclude g_ii
-                row_averages.append(average_similarity)
+                # Compute average dissimilarity, excluding self-similarity
+                average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
+                row_averages.append(average_dissimilarity)
 
-            # Enrich each PPL independently by scaling with 1/row_average
+            # Step 3: Enrich each PPL independently by scaling with the average dissimilarity
             enriched_sample_ppl = []
-            for i, (ppl_value, avg_similarity) in enumerate(zip(ppl, row_averages)):
-                if avg_similarity == 0:
-                    avg_similarity = 1e-10  # Avoid division by zero
-                enriched_value = ppl_value * (1 / avg_similarity)
+            for i, (ppl_value, avg_dissimilarity) in enumerate(zip(ppl, row_averages)):
+                if avg_dissimilarity == 0:
+                    avg_dissimilarity = 1e-10  # Avoid division by zero
+                enriched_value = ppl_value * avg_dissimilarity
                 enriched_sample_ppl.append(enriched_value)
 
             enriched_ppl.append(np.array(enriched_sample_ppl))  # Collect enriched PPL values
 
-        # Return only metric for the best sample for prr calculation
+        # Return only metric for the best sample for PRR calculation
         best_elements = []
         for best_id, metrics in zip(sample_ids, enriched_ppl):
             best_elements.append(metrics[best_id])
 
         return np.array(best_elements)
 
-class SemanticEnrichedPPLAveDissimilarity(Estimator):
+
+class SemanticEnrichedPPLTotalDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -213,9 +221,9 @@ def __init__(
 
     def __str__(self):
         if self.exp:
-            base = "SemanticEnrichedPPLAveDissimilarityexp"
+            base = "SemanticEnrichedPPLTotalDissimilarityexp"
         else:
-            base = "SemanticEnrichedPPLAveDissimilarity"
+            base = "SemanticEnrichedPPLTotalDissimilarity"
         return sample_strategy_to_prefix(self.sample_strategy) + base
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
@@ -241,9 +249,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
                 row_averages.append(average_dissimilarity)
 
+            avg_dissimilarity = np.mean(row_averages)
+
             # Step 3: Enrich each PPL independently by scaling with the average dissimilarity
             enriched_sample_ppl = []
-            for i, (ppl_value, avg_dissimilarity) in enumerate(zip(ppl, row_averages)):
+            for ppl_value in ppl:
                 if avg_dissimilarity == 0:
                     avg_dissimilarity = 1e-10  # Avoid division by zero
                 enriched_value = ppl_value * avg_dissimilarity
@@ -259,175 +269,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(best_elements)
 
 
-class SemanticAveTokenSARAveSimilarity(Estimator):
-    def __init__(
-        self,
-        verbose: bool = False,
-        exp: bool = False,
-        sample_strategy: str = "first"
-    ):
-        super().__init__(
-            [
-                "sample_sentence_similarity",
-                "sample_log_likelihoods",
-                "sample_token_similarity",
-            ] + SAMPLE_SELECTION_STAT_KEYS,
-            "sequence",
-        )
-        self.verbose = verbose
-        self.exp = exp
-        self.sample_strategy = sample_strategy
-
-    def __str__(self):
-        if self.exp:
-            base = "SemanticAveTokenSARAveSimilarityexp"
-        else:
-            base = "SemanticAveTokenSARAveSimilarity"
-        return sample_strategy_to_prefix(self.sample_strategy) + base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
-        batch_sample_token_similarity = stats["sample_token_similarity"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-        sample_ids = best_sample_ids(self.sample_strategy, stats)
-
-        enriched_tokenSAR = []
-
-        for batch_data in zip(
-            batch_sample_log_likelihoods,
-            batch_sample_token_similarity,
-            batch_sample_sentence_similarity,
-        ):
-            sample_log_likelihoods = batch_data[0]
-            sample_token_similarity = batch_data[1]
-            sample_sentence_similarity = batch_data[2]
-
-            tokenSAR = []
-            for log_likelihoods, token_similarity in zip(
-                sample_log_likelihoods, sample_token_similarity
-            ):
-                log_likelihoods = np.array(log_likelihoods)
-                R_t = 1 - token_similarity
-                if R_t.sum() == 0:
-                    R_t_norm = np.zeros_like(R_t)
-                else:
-                    R_t_norm = R_t / R_t.sum()
-                E_t = -log_likelihoods * R_t_norm
-                tokenSAR.append(E_t.sum())
-
-            if self.exp:
-                tokenSAR = -np.exp(-np.array(tokenSAR))
-
-            # Compute row-wise average similarity excluding self-similarity
-            row_averages = []
-            for i in range(sample_sentence_similarity.shape[0]):
-                row = sample_sentence_similarity[i]
-                average_similarity = (np.sum(row) - row[i]) / (len(row) - 1)  # Exclude g_ii
-                row_averages.append(average_similarity)
-
-            # Enrich each tokenSAR value
-            enriched_sample_tokenSAR = []
-            for i, (sar_value, avg_similarity) in enumerate(zip(tokenSAR, row_averages)):
-                if avg_similarity == 0:
-                    avg_similarity = 1e-10  # Avoid division by zero
-                enriched_value = sar_value * (1 / avg_similarity)
-                enriched_sample_tokenSAR.append(enriched_value)
-
-            enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR))
-
-        # Return only metric for the best sample for prr calculation
-        best_elements = []
-        for best_id, metrics in zip(sample_ids, enriched_tokenSAR):
-            best_elements.append(metrics[best_id])
-
-        return np.array(best_elements)
-
-
-class SemanticEnrichedTokenSARAveDissimilarity(Estimator):
-    def __init__(
-        self,
-        verbose: bool = False,
-        exp: bool = False,
-        sample_strategy: str = "first"
-    ):
-        super().__init__(
-            [
-                "sample_sentence_similarity",
-                "sample_log_likelihoods",
-                "sample_token_similarity",
-            ] + SAMPLE_SELECTION_STAT_KEYS,
-            "sequence",
-        )
-        self.verbose = verbose
-        self.exp = exp
-        self.sample_strategy = sample_strategy
-
-    def __str__(self):
-        if self.exp:
-            base = "SemanticEnrichedTokenSARAveDissimilarityexp"
-        else:
-            base = "SemanticEnrichedTokenSARAveDissimilarity"
-        return sample_strategy_to_prefix(self.sample_strategy) + base
-
-    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
-        batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
-        batch_sample_token_similarity = stats["sample_token_similarity"]
-        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
-        sample_ids = best_sample_ids(self.sample_strategy, stats)
-
-        enriched_tokenSAR = []
-
-        for batch_data in zip(
-            batch_sample_log_likelihoods,
-            batch_sample_token_similarity,
-            batch_sample_sentence_similarity,
-        ):
-            sample_log_likelihoods = batch_data[0]
-            sample_token_similarity = batch_data[1]
-            sample_sentence_similarity = batch_data[2]
-
-            tokenSAR = []
-            for log_likelihoods, token_similarity in zip(
-                sample_log_likelihoods, sample_token_similarity
-            ):
-                log_likelihoods = np.array(log_likelihoods)
-                R_t = 1 - token_similarity
-                if R_t.sum() == 0:
-                    R_t_norm = np.zeros_like(R_t)
-                else:
-                    R_t_norm = R_t / R_t.sum()
-                E_t = -log_likelihoods * R_t_norm
-                tokenSAR.append(E_t.sum())
-
-            if self.exp:
-                tokenSAR = -np.exp(-np.array(tokenSAR))
-
-            # Compute row-wise average dissimilarity (1 - g), excluding self-similarity
-            row_averages = []
-            for i in range(sample_sentence_similarity.shape[0]):
-                row = sample_sentence_similarity[i]
-                average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
-                row_averages.append(average_dissimilarity)
-
-            # Enrich each tokenSAR value
-            enriched_sample_tokenSAR = []
-            for i, (sar_value, avg_dissimilarity) in enumerate(zip(tokenSAR, row_averages)):
-                if avg_dissimilarity == 0:
-                    avg_dissimilarity = 1e-10  # Avoid division by zero
-                enriched_value = sar_value * avg_dissimilarity
-                enriched_sample_tokenSAR.append(enriched_value)
-
-            enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR))
-
-        # Return only metric for the best sample for PRR calculation
-        best_elements = []
-        for best_id, metrics in zip(sample_ids, enriched_tokenSAR):
-            best_elements.append(metrics[best_id])
-
-        return np.array(best_elements)
-
-
-class SemanticAveMTEAveSimilarity(Estimator):
+class SemanticEnrichedMTEAveDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -441,7 +283,7 @@ def __init__(
         self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticAveMTEAveSimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTEAveDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_entropy = stats["sample_entropy"]
@@ -453,24 +295,24 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         for sample_entropy, sample_sentence_similarity in zip(
             batch_sample_entropy, batch_sample_sentence_similarity
         ):
-            # Compute row-wise average similarity, excluding self-similarity
+            # Compute row-wise average dissimilarity (1 - g), excluding self-similarity
             row_averages = []
             for i in range(sample_sentence_similarity.shape[0]):
                 row = sample_sentence_similarity[i]
-                average_similarity = (np.sum(row) - row[i]) / (len(row) - 1)  # Exclude g_ii
-                row_averages.append(average_similarity)
+                average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
+                row_averages.append(average_dissimilarity)
 
             # Enrich each sample's entropy value
             enriched_sample_entropy = []
-            for i, (entropy, avg_similarity) in enumerate(zip(sample_entropy, row_averages)):
-                if avg_similarity == 0:
-                    avg_similarity = 1e-10  # Avoid division by zero
-                enriched_value = entropy * (1 / avg_similarity)
+            for i, (entropy, avg_dissimilarity) in enumerate(zip(sample_entropy, row_averages)):
+                if avg_dissimilarity == 0:
+                    avg_dissimilarity = 1e-10  # Avoid division by zero
+                enriched_value = entropy * avg_dissimilarity
                 enriched_sample_entropy.append(enriched_value)
 
             enriched_entropy.append(np.array(enriched_sample_entropy))
 
-        # Return only metric for the best sample for prr calculation
+        # Return only metric for the best sample for PRR calculation
         best_elements = []
         for best_id, metrics in zip(sample_ids, enriched_entropy):
             best_elements.append(metrics[best_id])
@@ -478,8 +320,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         return np.array(best_elements)
 
 
-
-class SemanticEnrichedMTEAveDissimilarity(Estimator):
+class SemanticEnrichedMTETotalDissimilarity(Estimator):
     def __init__(
         self,
         verbose: bool = False,
@@ -493,7 +334,7 @@ def __init__(
         self.sample_strategy = sample_strategy
 
     def __str__(self):
-        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTEAveDissimilarity"
+        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTETotalDissimilarity"
 
     def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_entropy = stats["sample_entropy"]
@@ -512,9 +353,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
                 average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1)
                 row_averages.append(average_dissimilarity)
 
+            avg_dissimilarity = np.mean(row_averages)
+
             # Enrich each sample's entropy value
             enriched_sample_entropy = []
-            for i, (entropy, avg_dissimilarity) in enumerate(zip(sample_entropy, row_averages)):
+            for entropy in sample_entropy:
                 if avg_dissimilarity == 0:
                     avg_dissimilarity = 1e-10  # Avoid division by zero
                 enriched_value = entropy * avg_dissimilarity
diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py
index b453c89fb..cf01905be 100644
--- a/src/lm_polygraph/utils/register_stat_calculators.py
+++ b/src/lm_polygraph/utils/register_stat_calculators.py
@@ -30,15 +30,15 @@ def register_stat_calculators(
     log.info("=" * 100)
     log.info("Loading NLI model...")
 
-    #if language == "en":
-    #    nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device)
-    #elif language in ["zh", "ar", "ru"]:
-    #    nli_model = MultilingualDeberta(
-    #        batch_size=deberta_batch_size,
-    #        device=deberta_device,
-    #    )
-    #else:
-    #    raise Exception(f"Unsupported language: {language}")
+    if language == "en":
+        nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device)
+    elif language in ["zh", "ar", "ru"]:
+        nli_model = MultilingualDeberta(
+            batch_size=deberta_batch_size,
+            device=deberta_device,
+        )
+    else:
+        raise Exception(f"Unsupported language: {language}")
     nli_model = None
 
     log.info("=" * 100)

From 0ab5eaf3546e5451fb649180c5238660e50dad2a Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Tue, 18 Feb 2025 15:55:37 +0400
Subject: [PATCH 84/97] Uncommitted changes from cluster

---
 src/lm_polygraph/stat_calculators/greedy_align_matrix.py | 1 +
 src/lm_polygraph/stat_calculators/semantic_matrix.py     | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
index a467e92e6..497118726 100644
--- a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py
@@ -42,6 +42,7 @@ def __call__(
             # Sampling from LLM often produces significant number of identical
             # outputs. We only need to score pairs of unqiue outputs
             texts = [text if text.strip() != "" else "<empty>" for text in texts]
+            greedy_text = greedy_text if greedy_text.strip() != "" else "<empty>"
             unique_texts, inv = np.unique(texts, return_inverse=True)
             batch_pairs.append(list(itertools.product([greedy_text], unique_texts)))
             batch_invs.append(inv)
diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py
index 57499408a..8fe738056 100644
--- a/src/lm_polygraph/stat_calculators/semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py
@@ -83,7 +83,7 @@ def __call__(
         C = []
         P = []
 
-        for i, pairs in tqdm(enumerate(batch_pairs)):
+        for i, pairs in enumerate(tqdm(batch_pairs)):
             dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size)
             probs = []
             for first_texts, second_texts in dl:
@@ -92,7 +92,7 @@ def __call__(
                     batch, padding=True, return_tensors="pt"
                 ).to(device)
                 logits = deberta.deberta(**encoded).logits.detach().to(device)
-                probs.append(softmax(logits).cpu().detach())
+                probs.append(softmax(logits).detach())
             probs = torch.cat(probs, dim=0)
 
             entail_probs = probs[:, ent_id]

From d03d080087609a3a0f1a139a3f605fe2f65e2bf9 Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Thu, 27 Mar 2025 17:24:58 +0400
Subject: [PATCH 85/97] Fixed x metric for samples

---
 scripts/polygraph_eval                        |  17 ++
 .../generation_metrics/__init__.py            |   1 +
 .../generation_metrics/x_metric.py            | 145 ++++++++++++++
 .../generation_metrics/x_metric_utils.py      | 182 ++++++++++++++++++
 4 files changed, 345 insertions(+)
 create mode 100644 src/lm_polygraph/generation_metrics/x_metric.py
 create mode 100644 src/lm_polygraph/generation_metrics/x_metric_utils.py

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 8421863a5..8435cb2cd 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -28,6 +28,8 @@ from lm_polygraph.estimators.ensemble_token_measures import *
 from lm_polygraph.ue_metrics import *
 from lm_polygraph.utils.common import load_external_module
 from lm_polygraph.utils.generation_parameters import GenerationParameters
+from lm_polygraph.generation_metrics.x_metric_utils import MT5ForRegression
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 hydra_config = Path(os.environ["HYDRA_CONFIG"])
 
@@ -513,6 +515,21 @@ def get_generation_metrics(args):
                        Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True),
                        Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"),
                        Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")]
+            model_name_or_path="google/metricx-24-hybrid-large-v2p6"
+            tokenizer_name="google/mt5-large"
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            model_xmetric = MT5ForRegression.from_pretrained(model_name_or_path)
+            model_xmetric.to(device)
+            model_xmetric.eval()
+
+            tokenizer_xmetric = AutoTokenizer.from_pretrained(
+                tokenizer_name if tokenizer_name else model_name_or_path
+            )
+
+            result += [XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex),
+                       XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex, sample=True),
+                       XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"),
+                       XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")]
     else:
         result = []
         for metric in generation_metrics:
diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py
index d9d66c958..f5e702401 100644
--- a/src/lm_polygraph/generation_metrics/__init__.py
+++ b/src/lm_polygraph/generation_metrics/__init__.py
@@ -10,3 +10,4 @@
 from .sbert import SbertMetric
 from .aggregated_metric import AggregatedMetric
 from .preprocess_output_target import PreprocessOutputTarget
+from .x_metric import XMetric
diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py
new file mode 100644
index 000000000..bc7c7e483
--- /dev/null
+++ b/src/lm_polygraph/generation_metrics/x_metric.py
@@ -0,0 +1,145 @@
+import re
+import numpy as np
+
+from typing import List, Dict
+from .generation_metric import GenerationMetric
+from transformers import AutoTokenizer
+from .x_metric_utils import MT5ForRegression
+import torch 
+import datasets 
+from transformers import TrainingArguments, Trainer
+
+class XMetric(GenerationMetric):
+    """
+    Calculates MetricX (https://aclanthology.org/2023.wmt-1.63/)
+    between model-generated texts and ground truth texts.
+    """
+
+    def __init__(self, model ,tokenizer,
+                 source_ignore_regex=None, translation_ignore_regex=None, sample: bool = False, sample_strategy: str = "First"):
+        if sample:
+            super().__init__([
+                "first_sample_texts",
+                "best_sample_texts",
+                "best_normalized_sample_texts",
+                "input_texts"],
+            "sequence")
+        else:
+            super().__init__(["greedy_texts", "input_texts"], "sequence")
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = model
+
+        self.tokenizer = tokenizer
+        self.source_ignore_regex = (
+            re.compile(source_ignore_regex) if source_ignore_regex else None
+        )
+        self.translation_ignore_regex = (
+            re.compile(translation_ignore_regex) if translation_ignore_regex else None
+        )
+        self.training_args = TrainingArguments(
+            output_dir=".",
+            per_device_eval_batch_size=1,
+            dataloader_pin_memory=False,
+        )
+
+        self.trainer = Trainer(
+            model=self.model,
+            args=self.training_args,
+        )
+        self.sample = sample
+        self.sample_strategy=sample_strategy
+
+
+    def __str__(self):
+        if self.sample:
+            if self.sample_strategy == "First":
+                return f"Samplexmetric"
+            else:
+                return f"{self.sample_strategy}Samplexmetric"
+        return "xmetric"
+
+    def _filter_source(self, text: str, ignore_regex: re.Pattern) -> str:
+        if ignore_regex is not None:
+            try:
+                return ignore_regex.findall(text)[-1]
+            except IndexError:
+                raise ValueError(
+                    f"Source text '{text}' does not match the ignore regex '{ignore_regex}'"
+                )
+        return text
+
+    def _filter_translation(self, text: str, ignore_regex: re.Pattern) -> str:
+        return ignore_regex.sub("", text).strip() if ignore_regex else text.strip()
+
+    
+    def _prepare_inputs(self, translations: List[str], references: List[str]):
+        """Prepares the input data for MetricX scoring."""
+        inputs = [
+            f"candidate: {hyp} reference: {ref}" 
+            for hyp, ref in zip(translations, references)
+        ]
+        tokenized = self.tokenizer(
+            inputs, 
+            max_length=512, 
+            truncation=True, 
+            padding=False
+        )
+        
+        # Convert to Hugging Face Dataset
+        dataset = datasets.Dataset.from_dict({
+            "input_ids": tokenized["input_ids"],
+            "attention_mask": tokenized["attention_mask"],
+            "input":inputs
+        }).with_format("torch")  
+        
+        def remove_eos(example):
+            example["input_ids"] = example["input_ids"][:-1]
+            example["attention_mask"] = example["attention_mask"][:-1]
+            return example
+
+        dataset = dataset.map(remove_eos)
+        return dataset
+
+    def __call__(
+        self,
+        stats: Dict[str, np.ndarray],
+        target_texts: List[str],
+    ) -> np.ndarray:
+        """
+        Calculates MetricX between stats['greedy_texts'] and target_texts.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): input statistics, including:
+                * model-generated texts in 'greedy_texts'
+            target_texts (List[str]): ground-truth texts
+            input_texts (List[str]): input texts before translation
+
+        Returns:
+            np.ndarray: list of MetricX scores for each sample.
+        """
+        references = [
+            src
+            for src in stats["target_texts"]
+        ]
+        if self.sample:
+            if self.sample_strategy == "First":
+                gen_texts = stats["first_sample_texts"]
+            elif self.sample_strategy == "Best":
+                gen_texts = stats["best_sample_texts"]
+            elif self.sample_strategy == "BestNormalized":
+                gen_texts = stats["best_normalized_sample_texts"]
+            else:
+                raise ValueError(f"Invalid sample strategy: {self.sample_strategy}")
+        else:
+            gen_texts = stats["greedy_texts"]
+
+        translations = [
+            self._filter_translation(tr, self.source_ignore_regex)
+            for tr in gen_texts
+        ]
+
+        inputs = self._prepare_inputs(translations, references)
+        scores, _, _ = self.trainer.predict(test_dataset=inputs)
+        for i, score in enumerate(scores):
+            scores[i] = (25 - score) / 25
+        return scores
\ No newline at end of file
diff --git a/src/lm_polygraph/generation_metrics/x_metric_utils.py b/src/lm_polygraph/generation_metrics/x_metric_utils.py
new file mode 100644
index 000000000..ce1314cd1
--- /dev/null
+++ b/src/lm_polygraph/generation_metrics/x_metric_utils.py
@@ -0,0 +1,182 @@
+import copy
+import dataclasses
+from typing import Optional, Tuple, Union
+import warnings
+
+import torch
+from torch import nn
+import transformers
+import transformers.modeling_outputs
+
+BaseModelOutput = transformers.modeling_outputs.BaseModelOutput
+ModelOutput = transformers.modeling_outputs.ModelOutput
+
+MT5Config = transformers.models.mt5.modeling_mt5.MT5Config
+MT5PreTrainedModel = transformers.models.mt5.modeling_mt5.MT5PreTrainedModel
+MT5Stack = transformers.models.mt5.modeling_mt5.MT5Stack
+
+__HEAD_MASK_WARNING_MSG = (
+    transformers.models.mt5.modeling_mt5.__HEAD_MASK_WARNING_MSG  # pylint: disable=protected-access
+)
+
+
+@dataclasses.dataclass
+class MT5ForRegressionOutput(ModelOutput):
+  loss: Optional[torch.FloatTensor] = None  # MSE loss; stays None when no labels are passed
+  predictions: torch.FloatTensor = None  # regression score per batch row, clamped to [0, 25]
+
+
+class MT5ForRegression(MT5PreTrainedModel):
+  """MT5 model for regression."""
+
+  def __init__(self, config: MT5Config):
+    super().__init__(config)
+    self.model_dim = config.d_model
+
+    self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+    encoder_config = copy.deepcopy(config)
+    encoder_config.is_decoder = False
+    encoder_config.use_cache = False
+    encoder_config.is_encoder_decoder = False
+    self.encoder = MT5Stack(encoder_config, self.shared)
+
+    decoder_config = copy.deepcopy(config)
+    decoder_config.is_decoder = True
+    decoder_config.is_encoder_decoder = False
+    decoder_config.num_layers = config.num_decoder_layers
+    self.decoder = MT5Stack(decoder_config, self.shared)
+
+    self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+    # Initialize weights and apply final processing
+    self.post_init()
+
+    # Model parallel
+    self.model_parallel = False
+    self.device_map = None
+
+  def forward(
+      self,
+      input_ids: Optional[torch.LongTensor] = None,
+      attention_mask: Optional[torch.FloatTensor] = None,
+      decoder_attention_mask: Optional[torch.BoolTensor] = None,
+      head_mask: Optional[torch.FloatTensor] = None,
+      decoder_head_mask: Optional[torch.FloatTensor] = None,
+      cross_attn_head_mask: Optional[torch.Tensor] = None,
+      encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+      past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+      inputs_embeds: Optional[torch.FloatTensor] = None,
+      decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+      labels: Optional[torch.FloatTensor] = None,
+      use_cache: Optional[bool] = None,
+      output_attentions: Optional[bool] = None,
+      output_hidden_states: Optional[bool] = None,
+      return_dict: Optional[bool] = None,
+  ) -> Union[Tuple[torch.FloatTensor], MT5ForRegressionOutput]:
+    """Scores each input row with a single decoder step.
+
+    Runs the encoder (unless `encoder_outputs` is given), feeds the decoder one
+    dummy token with id 0 per row and reads the logit of vocabulary id 250089
+    at that step, clamped to [0, 25], as the regression prediction.
+
+    Returns:
+      MT5ForRegressionOutput with `predictions` and, when `labels` is passed,
+      the MSE `loss`. This type is returned regardless of `return_dict`.
+    """
+    use_cache = use_cache if use_cache is not None else self.config.use_cache
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # FutureWarning: head_mask was separated into two input args - head_mask,
+    # decoder_head_mask
+    if head_mask is not None and decoder_head_mask is None:
+      if self.config.num_layers == self.config.num_decoder_layers:
+        # Look the constant up by string: a bare `__NAME` inside a class body is
+        # name-mangled to `_MT5ForRegression__NAME` and would raise NameError.
+        warnings.warn(globals()["__HEAD_MASK_WARNING_MSG"], FutureWarning)
+        decoder_head_mask = head_mask
+
+    # Encode if needed (training, first prediction pass)
+    if encoder_outputs is None:
+      # Convert encoder inputs in embeddings if needed
+      encoder_outputs = self.encoder(
+          input_ids=input_ids,
+          attention_mask=attention_mask,
+          inputs_embeds=inputs_embeds,
+          head_mask=head_mask,
+          output_attentions=output_attentions,
+          output_hidden_states=output_hidden_states,
+          return_dict=return_dict,
+      )
+    elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+      encoder_outputs = BaseModelOutput(
+          last_hidden_state=encoder_outputs[0],
+          hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+          attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+      )
+
+    hidden_states = encoder_outputs[0]
+
+    # Create 1 step of dummy input for the decoder, sized and placed like the
+    # encoder output (no `input_ids` needed; correct device on CPU and GPU).
+    decoder_input_ids = torch.zeros(
+        (hidden_states.size(0), 1), dtype=torch.long, device=hidden_states.device
+    )
+
+    # Set device for model parallelism
+    if self.model_parallel:
+      torch.cuda.set_device(self.decoder.first_device)
+      hidden_states = hidden_states.to(self.decoder.first_device)
+      if decoder_input_ids is not None:
+        decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
+      if attention_mask is not None:
+        attention_mask = attention_mask.to(self.decoder.first_device)
+      if decoder_attention_mask is not None:
+        decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
+
+    # Decode
+    decoder_outputs = self.decoder(
+        input_ids=decoder_input_ids,
+        attention_mask=decoder_attention_mask,
+        inputs_embeds=decoder_inputs_embeds,
+        past_key_values=past_key_values,
+        encoder_hidden_states=hidden_states,
+        encoder_attention_mask=attention_mask,
+        head_mask=decoder_head_mask,
+        cross_attn_head_mask=cross_attn_head_mask,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+
+    sequence_output = decoder_outputs[0]
+
+    # Set device for model parallelism
+    if self.model_parallel:
+      torch.cuda.set_device(self.encoder.first_device)
+      self.lm_head = self.lm_head.to(self.encoder.first_device)
+      sequence_output = sequence_output.to(self.lm_head.weight.device)
+
+    if self.config.tie_word_embeddings:
+      # Rescale output before projecting on vocab
+      # See
+      # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+      sequence_output = sequence_output * (self.model_dim**-0.5)
+
+    lm_logits = self.lm_head(sequence_output)
+
+    # 250089 = <extra_id_10>
+    predictions = lm_logits[:, 0, 250089]
+
+    # Clip to 0 to 25
+    predictions = torch.clamp(predictions, 0, 25)
+
+    loss = None
+    if labels is not None:
+      loss_fct = nn.MSELoss()
+      # move labels to correct device to enable PP
+      labels = labels.to(predictions.device)
+      loss = loss_fct(predictions.view(-1), labels.view(-1))
+
+    return MT5ForRegressionOutput(loss=loss, predictions=predictions)
\ No newline at end of file

From fc90a1ea13bb9d7ffe130c39254d9daae604dca8 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 27 Mar 2025 17:47:14 +0400
Subject: [PATCH 86/97] Add semantic density

---
 src/lm_polygraph/estimators/__init__.py       |   2 +
 .../estimators/semantic_density.py            | 137 ++++++++++++++++
 src/lm_polygraph/stat_calculators/__init__.py |   4 +-
 .../greedy_semantic_matrix.py                 | 155 ++++++++++++++++++
 .../stat_calculators/semantic_matrix.py       | 130 +++++++++++++++
 5 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 src/lm_polygraph/estimators/semantic_density.py

diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 254788487..26c3d0739 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -110,3 +110,5 @@
 from .prob_cocoa import ProbCocoaMaxprob, ProbCocoaPPL, GreedyProbCocoaMaxprob, GreedyProbCocoaPPL
 
 from .supervised_sum_semantic_entropies import SupSumSemanticMaxprob, SupSumSemanticPPL, SupSumSemanticMTE, GreedySupSumSemanticMaxprob, GreedySupSumSemanticPPL, GreedySupSumSemanticMTE
+
+from .semantic_density import SemanticDensity, GreedySemanticDensity
diff --git a/src/lm_polygraph/estimators/semantic_density.py b/src/lm_polygraph/estimators/semantic_density.py
new file mode 100644
index 000000000..1c09250b2
--- /dev/null
+++ b/src/lm_polygraph/estimators/semantic_density.py
@@ -0,0 +1,137 @@
+import numpy as np
+
+from typing import Dict
+
+from .estimator import Estimator
+from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS
+
+
+class SemanticDensity(Estimator):
+    """Semantic density of the sample selected by `sample_strategy`.
+
+    Each unique sampled text contributes its length-normalized probability,
+    weighted by a kernel of its NLI distance to the selected sample; the score
+    is the weighted sum divided by the unweighted sum.
+    """
+
+    def __init__(self, verbose: bool = False, sample_strategy: str = "first"):
+        super().__init__(
+            [
+                "sample_log_probs",
+                "sample_tokens",
+                "sample_texts",
+                "concat_semantic_matrix_contra",
+                "concat_semantic_matrix_neutral",
+            ] + SAMPLE_SELECTION_STAT_KEYS,
+            "sequence",
+        )
+        self.verbose = verbose
+        self.sample_strategy = sample_strategy
+
+    def __str__(self):
+        return sample_strategy_to_prefix(self.sample_strategy) + "SemanticDensity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Compute the semantic density for every input in the batch.
+
+        Parameters:
+            stats (Dict[str, np.ndarray]): the statistics requested in `__init__`;
+                the NLI matrices are indexed as [selected sample, sample].
+
+        Returns:
+            np.ndarray: one density value per input.
+        """
+        semantic_density = []
+        for sample_id, log_probs, tokens, texts, contra, neutral in zip(
+            best_sample_ids(self.sample_strategy, stats),
+            stats["sample_log_probs"],
+            stats["sample_tokens"],
+            stats["sample_texts"],
+            stats["concat_semantic_matrix_contra"],
+            stats["concat_semantic_matrix_neutral"],
+        ):
+            # Identical texts are counted once, at their first occurrence.
+            _, unique_sample_indices = np.unique(texts, return_index=True)
+
+            numerator, denominator = [], []
+            for _id in unique_sample_indices:
+                # exp(log_prob / n) equals exp(log_prob) ** (1 / n) but does not
+                # underflow to 0 for long sequences; max() guards empty samples.
+                normed_prob = np.exp(log_probs[_id] / max(len(tokens[_id]), 1))
+                distance = contra[sample_id, _id] + (neutral[sample_id, _id] / 2)
+                # Linear kernel that vanishes for distances above 1.
+                kernel_value = 1 - distance if distance <= 1 else 0
+
+                numerator.append(normed_prob * kernel_value)
+                denominator.append(normed_prob)
+
+            semantic_density.append(np.sum(numerator) / np.sum(denominator))
+
+        return np.array(semantic_density)
+
+
+class GreedySemanticDensity(Estimator):
+    """Semantic density of the greedy generation with respect to the samples.
+
+    Each unique sampled text contributes its length-normalized probability,
+    weighted by a kernel of its NLI distance to the greedy text; the greedy
+    text itself is added with kernel value 1.
+    """
+
+    def __init__(self, verbose: bool = False):
+        super().__init__(
+            [
+                "greedy_log_probs",
+                # Read by __call__, so it has to be requested as well.
+                "concat_greedy_log_likelihoods",
+                "sample_log_probs",
+                "sample_tokens",
+                "sample_texts",
+                "concat_greedy_semantic_matrix_contra_forward",
+                "concat_greedy_semantic_matrix_neutral_forward",
+            ],
+            "sequence",
+        )
+        self.verbose = verbose
+
+    def __str__(self):
+        return "GreedySemanticDensity"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        """Compute the greedy semantic density for every input in the batch.
+
+        Reads the statistics requested in `__init__` from `stats`.
+        """
+        semantic_density = []
+        for greedy_log_probs, log_probs, tokens, texts, contra, neutral in zip(
+            stats["concat_greedy_log_likelihoods"],
+            stats["sample_log_probs"],
+            stats["sample_tokens"],
+            stats["sample_texts"],
+            stats["concat_greedy_semantic_matrix_contra_forward"],
+            stats["concat_greedy_semantic_matrix_neutral_forward"],
+        ):
+            # Identical texts are counted once, at their first occurrence.
+            _, unique_sample_indices = np.unique(texts, return_index=True)
+
+            numerator, denominator = [], []
+            for _id in unique_sample_indices:
+                # exp(log_prob / n) equals exp(log_prob) ** (1 / n) but does not
+                # underflow to 0 for long sequences; max() guards empty samples.
+                normed_prob = np.exp(log_probs[_id] / max(len(tokens[_id]), 1))
+                distance = contra[_id] + (neutral[_id] / 2)
+                # Linear kernel that vanishes for distances above 1.
+                kernel_value = 1 - distance if distance <= 1 else 0
+
+                numerator.append(normed_prob * kernel_value)
+                denominator.append(normed_prob)
+
+            # The greedy text enters both sums unweighted, i.e. with kernel value 1.
+            n_greedy = max(len(greedy_log_probs), 1)
+            greedy_normed_prob = np.exp(np.sum(greedy_log_probs) / n_greedy)
+            numerator.append(greedy_normed_prob)
+            denominator.append(greedy_normed_prob)
+
+            semantic_density.append(np.sum(numerator) / np.sum(denominator))
+
+        return np.array(semantic_density)
diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index 7f7e37aef..f88615976 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -20,12 +20,12 @@
 from .model_score import ModelScoreCalculator
 from .embeddings import EmbeddingsCalculator
 from .ensemble_token_data import EnsembleTokenLevelDataCalculator
-from .semantic_matrix import SemanticMatrixCalculator
+from .semantic_matrix import SemanticMatrixCalculator, ConcatSemanticMatrixCalculator
 from .cross_encoder_similarity import CrossEncoderSimilarityMatrixCalculator
 from .extract_claims import ClaimsExtractor
 from .semantic_classes import SemanticClassesCalculator
 from .greedy_similarity import GreedySimilarityCalculator
-from .greedy_semantic_matrix import GreedySemanticMatrixCalculator
+from .greedy_semantic_matrix import GreedySemanticMatrixCalculator, ConcatGreedySemanticMatrixCalculator
 from .rouge_matrix import RougeLSemanticMatrixCalculator
 from .greedy_rouge_matrix import GreedyRougeLSemanticMatrixCalculator
 from .align_matrix import AlignMatrixCalculator
diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
index a5e5cc9df..07c14e805 100644
--- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py
@@ -53,6 +53,8 @@ def __call__(
 
         device = deberta.device
         ent_id = deberta.deberta.config.label2id["ENTAILMENT"]
+        contra_id = deberta.deberta.config.label2id["CONTRADICTION"]
+        neutral_id = deberta.deberta.config.label2id["NEUTRAL"]
 
         softmax = nn.Softmax(dim=1)
         tokenizer = deberta.deberta_tokenizer
@@ -60,6 +62,12 @@ def __call__(
         E_f = []
         E_b = []
         E = []
+        N_f = []
+        N_b = []
+        N = []
+        C_f = []
+        C_b = []
+        C = []
 
         for i, pairs in enumerate(batch_pairs):
             dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size)
@@ -88,17 +96,164 @@ def __call__(
 
             entail_probs_f = probs_f[:, ent_id]
             entail_probs_b = probs_b[:, ent_id]
+            contra_probs_f = probs_f[:, contra_id]
+            contra_probs_b = probs_b[:, contra_id]
+            neutral_probs_f = probs_f[:, neutral_id]
+            neutral_probs_b = probs_b[:, neutral_id]
 
             E_f.append(entail_probs_f[inv].numpy())
             E_b.append(entail_probs_b[inv].numpy())
             E.append((entail_probs_f[inv].numpy() + entail_probs_b[inv].numpy()) / 2)
+            N_f.append(neutral_probs_f[inv].numpy())
+            N_b.append(neutral_probs_b[inv].numpy())
+            N.append((neutral_probs_f[inv].numpy() + neutral_probs_b[inv].numpy()) / 2)
+            C_f.append(contra_probs_f[inv].numpy())
+            C_b.append(contra_probs_b[inv].numpy())
+            C.append((contra_probs_f[inv].numpy() + contra_probs_b[inv].numpy()) / 2)
 
         E_f = np.stack(E_f)
         E_b = np.stack(E_b)
         E = np.stack(E)
+        N_f = np.stack(N_f)
+        N_b = np.stack(N_b)
+        N = np.stack(N)
+        C_f = np.stack(C_f)
+        C_b = np.stack(C_b)
+        C = np.stack(C)
 
         return {
             "greedy_semantic_matrix_forward": E_f,
             "greedy_semantic_matrix_backward": E_b,
             "greedy_semantic_matrix": E,
+            "greedy_semantic_matrix_neutral_forward": N_f,
+            "greedy_semantic_matrix_neutral_backward": N_b,
+            "greedy_semantic_matrix_neutral": N,
+            "greedy_semantic_matrix_contra_forward": C_f,
+            "greedy_semantic_matrix_contra_backward": C_b,
+            "greedy_semantic_matrix_contra": C,
+        }
+
+
+class ConcatGreedySemanticMatrixCalculator(StatCalculator):
+    """
+    Calculates NLI class probabilities between the greedy generation and every
+    sampled generation using DeBERTa model. Each text is prefixed with its
+    'no_fewshot_input_texts' entry before it is scored.
+    """
+
+    # Every statistic returned by __call__. All of them are declared so that
+    # estimators can request the neutral and contradiction matrices as well.
+    _STATS = [
+        "concat_greedy_semantic_matrix_forward",
+        "concat_greedy_semantic_matrix_backward",
+        "concat_greedy_semantic_matrix",
+        "concat_greedy_semantic_matrix_neutral_forward",
+        "concat_greedy_semantic_matrix_neutral_backward",
+        "concat_greedy_semantic_matrix_neutral",
+        "concat_greedy_semantic_matrix_contra_forward",
+        "concat_greedy_semantic_matrix_contra_backward",
+        "concat_greedy_semantic_matrix_contra",
+    ]
+
+    def __init__(self, nli_model):
+        super().__init__(
+            list(self._STATS),
+            ["greedy_texts", "no_fewshot_input_texts", "sample_texts"],
+        )
+        self.is_deberta_setup = False
+        self.nli_model = nli_model
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        """
+        Calculates the NLI matrices between greedy and sampled generations.
+
+        Parameters:
+            dependencies (Dict[str, np.ndarray]): input statistics, containing
+                'sample_texts', 'greedy_texts' and 'no_fewshot_input_texts'.
+            texts (List[str]): Input texts batch used for model generation.
+            model (Model): Model used for generation.
+            max_new_tokens (int): Maximum number of new tokens at model generation. Default: 100.
+        Returns:
+            Dict[str, np.ndarray]: one entry per name in `_STATS`; for each input
+                a row with one value per sample. '_forward' scores the pair
+                (greedy, sample), '_backward' the pair (sample, greedy), and
+                the name without suffix is their average.
+        """
+        deberta = self.nli_model
+        deberta_batch_size = deberta.batch_size
+
+        batch_pairs = []
+        batch_invs = []
+        for sample_texts, greedy_text, input_text in zip(
+            dependencies["sample_texts"],
+            dependencies["greedy_texts"],
+            dependencies["no_fewshot_input_texts"],
+        ):
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unique outputs
+            unique_texts, inv = np.unique(
+                [input_text + text for text in sample_texts], return_inverse=True
+            )
+            batch_pairs.append(
+                list(itertools.product([input_text + greedy_text], unique_texts))
+            )
+            batch_invs.append(inv)
+
+        device = deberta.device
+        label2id = deberta.deberta.config.label2id
+        class_ids = {
+            "concat_greedy_semantic_matrix": label2id["ENTAILMENT"],
+            "concat_greedy_semantic_matrix_neutral": label2id["NEUTRAL"],
+            "concat_greedy_semantic_matrix_contra": label2id["CONTRADICTION"],
+        }
+
+        softmax = nn.Softmax(dim=1)
+        tokenizer = deberta.deberta_tokenizer
+
+        matrices = {name: [] for name in self._STATS}
+
+        for i, pairs in enumerate(batch_pairs):
+            dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size)
+            probs_f = []
+            probs_b = []
+
+            for first_texts, second_texts in tqdm(dl):
+                # Inference only: no autograd graph is needed.
+                with torch.no_grad():
+                    batch = list(zip(first_texts, second_texts))
+                    encoded = tokenizer.batch_encode_plus(
+                        batch, padding=True, return_tensors="pt"
+                    ).to(device)
+                    logits = deberta.deberta(**encoded).logits.detach().to(device)
+                    probs_f.append(softmax(logits).cpu().detach())
+
+                    batch = list(zip(second_texts, first_texts))
+                    encoded = tokenizer.batch_encode_plus(
+                        batch, padding=True, return_tensors="pt"
+                    ).to(device)
+                    logits = deberta.deberta(**encoded).logits.detach().to(device)
+                    probs_b.append(softmax(logits).cpu().detach())
+
+            probs_f = torch.cat(probs_f, dim=0)
+            probs_b = torch.cat(probs_b, dim=0)
+
+            inv = batch_invs[i]
+
+            # Expand the per-unique-text scores back to one value per sample via
+            # the inverse index returned by np.unique.
+            for name, class_id in class_ids.items():
+                forward = probs_f[:, class_id][inv].numpy()
+                backward = probs_b[:, class_id][inv].numpy()
+                matrices[name + "_forward"].append(forward)
+                matrices[name + "_backward"].append(backward)
+                matrices[name].append((forward + backward) / 2)
+
+        return {
+            name: np.stack(rows) for name, rows in matrices.items()
         }
diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py
index 8fe738056..cb9dd4f9d 100644
--- a/src/lm_polygraph/stat_calculators/semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py
@@ -75,12 +75,14 @@ def __call__(
         device = deberta.device
         ent_id = deberta.deberta.config.label2id["ENTAILMENT"]
         contra_id = deberta.deberta.config.label2id["CONTRADICTION"]
+        neutral_id = deberta.deberta.config.label2id["NEUTRAL"]
 
         softmax = nn.Softmax(dim=1)
         tokenizer = deberta.deberta_tokenizer
 
         E = []
         C = []
+        N = []
         P = []
 
         for i, pairs in enumerate(tqdm(batch_pairs)):
@@ -97,12 +99,14 @@ def __call__(
 
             entail_probs = probs[:, ent_id]
             contra_probs = probs[:, contra_id]
+            neutral_probs = probs[:, neutral_id]
             class_preds = probs.argmax(-1)
 
             unique_mat_shape = (batch_counts[i], batch_counts[i])
 
             unique_E = entail_probs.view(unique_mat_shape)
             unique_C = contra_probs.view(unique_mat_shape)
+            unique_N = neutral_probs.view(unique_mat_shape)
             unique_P = class_preds.view(unique_mat_shape)
 
             inv = batch_invs[i]
@@ -111,15 +115,141 @@ def __call__(
             # using inverse index
             E.append(unique_E.cpu().numpy()[inv, :][:, inv])
             C.append(unique_C.cpu().numpy()[inv, :][:, inv])
+            N.append(unique_N.cpu().numpy()[inv, :][:, inv])
             P.append(unique_P.cpu().numpy()[inv, :][:, inv])
 
         E = np.stack(E)
         C = np.stack(C)
+        N = np.stack(N)
         P = np.stack(P)
 
         return {
             "semantic_matrix_entail": E,
             "semantic_matrix_contra": C,
+            "semantic_matrix_neutral": N,
             "semantic_matrix_classes": P,
             "entailment_id": deberta.deberta.config.label2id["ENTAILMENT"],
         }
+
+
+class ConcatSemanticMatrixCalculator(StatCalculator):
+    """Calculates the NLI semantic matrices for generation samples, each prefixed
+    with its input text, using DeBERTa model."""
+
+    def __init__(self, nli_model):
+        super().__init__(
+            [
+                "concat_semantic_matrix_entail",
+                "concat_semantic_matrix_contra",
+                "concat_semantic_matrix_neutral",
+                "concat_semantic_matrix_classes",
+                "entailment_id",
+            ],
+            ["no_fewshot_input_texts", "sample_texts"],
+        )
+        self.is_deberta_setup = False
+        self.nli_model = nli_model
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        """
+        Calculates the NLI semantic matrix for generation samples using DeBERTa model.
+
+        Parameters:
+            dependencies (Dict[str, np.ndarray]): input statistics, containing:
+                - 'sample_texts' (List[List[str]]): several sampling generations
+                    for each input text in the batch.
+                - 'no_fewshot_input_texts': input text prepended to every sample.
+            texts (List[str]): Input texts batch used for model generation.
+            model (Model): Model used for generation.
+            max_new_tokens (int): Maximum number of new tokens at model generation. Default: 100.
+        Returns:
+            Dict[str, np.ndarray]: dictionary with the following items; the matrices are,
+                for each input text, quadratic of size n_samples x n_samples:
+                - 'concat_semantic_matrix_entail': probabilities of 'ENTAILMENT' output of DeBERTa.
+                - 'concat_semantic_matrix_contra': probabilities of 'CONTRADICTION' output of DeBERTa.
+                - 'concat_semantic_matrix_neutral': probabilities of 'NEUTRAL' output of DeBERTa.
+                - 'concat_semantic_matrix_classes': NLI label id corresponding to the DeBERTa prediction.
+                - 'entailment_id': label id of 'ENTAILMENT' in the DeBERTa config (not a matrix).
+        """
+        deberta = self.nli_model
+        deberta_batch_size = deberta.batch_size
+        batch_texts = dependencies["sample_texts"]
+        input_texts = dependencies["no_fewshot_input_texts"]
+
+        batch_pairs = []
+        batch_invs = []
+        batch_counts = []
+        for input_text, texts in zip(input_texts, batch_texts):
+            texts = [input_text + text for text in texts]
+            breakpoint()
+            # Sampling from LLM often produces significant number of identical
+            # outputs. We only need to score pairs of unqiue outputs
+            unique_texts, inv = np.unique(texts, return_inverse=True)
+            batch_pairs.append(list(itertools.product(unique_texts, unique_texts)))
+            batch_invs.append(inv)
+            batch_counts.append(len(unique_texts))
+
+        device = deberta.device
+        ent_id = deberta.deberta.config.label2id["ENTAILMENT"]
+        contra_id = deberta.deberta.config.label2id["CONTRADICTION"]
+        neutral_id = deberta.deberta.config.label2id["NEUTRAL"]
+
+        softmax = nn.Softmax(dim=1)
+        tokenizer = deberta.deberta_tokenizer
+
+        E = []
+        C = []
+        N = []
+        P = []
+
+        for i, pairs in enumerate(tqdm(batch_pairs)):
+            dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size)
+            probs = []
+            for first_texts, second_texts in dl:
+                batch = list(zip(first_texts, second_texts))
+                encoded = tokenizer.batch_encode_plus(
+                    batch, padding=True, return_tensors="pt"
+                ).to(device)
+                logits = deberta.deberta(**encoded).logits.detach().to(device)
+                probs.append(softmax(logits).detach())
+            probs = torch.cat(probs, dim=0)
+
+            entail_probs = probs[:, ent_id]
+            contra_probs = probs[:, contra_id]
+            neutral_probs = probs[:, neutral_id]
+            class_preds = probs.argmax(-1)
+
+            unique_mat_shape = (batch_counts[i], batch_counts[i])
+
+            unique_E = entail_probs.view(unique_mat_shape)
+            unique_C = contra_probs.view(unique_mat_shape)
+            unique_N = neutral_probs.view(unique_mat_shape)
+            unique_P = class_preds.view(unique_mat_shape)
+
+            inv = batch_invs[i]
+
+            # Recover full matrices from uniques by gathering along both axes
+            # using inverse index
+            E.append(unique_E.cpu().numpy()[inv, :][:, inv])
+            C.append(unique_C.cpu().numpy()[inv, :][:, inv])
+            N.append(unique_N.cpu().numpy()[inv, :][:, inv])
+            P.append(unique_P.cpu().numpy()[inv, :][:, inv])
+
+        E = np.stack(E)
+        C = np.stack(C)
+        N = np.stack(N)
+        P = np.stack(P)
+
+        return {
+            "concat_semantic_matrix_entail": E,
+            "concat_semantic_matrix_contra": C,
+            "concat_semantic_matrix_neutral": N,
+            "concat_semantic_matrix_classes": P,
+            "entailment_id": deberta.deberta.config.label2id["ENTAILMENT"],
+        }

From 3b1bdcb6305df8ca9f98fdcea07d217e0c6965e8 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 27 Mar 2025 17:51:21 +0400
Subject: [PATCH 87/97] Remove breakpoint

---
 src/lm_polygraph/stat_calculators/semantic_matrix.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py
index cb9dd4f9d..036046ce3 100644
--- a/src/lm_polygraph/stat_calculators/semantic_matrix.py
+++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py
@@ -187,7 +187,6 @@ def __call__(
         batch_counts = []
         for input_text, texts in zip(input_texts, batch_texts):
             texts = [input_text + text for text in texts]
-            breakpoint()
             # Sampling from LLM often produces significant number of identical
             # outputs. We only need to score pairs of unqiue outputs
             unique_texts, inv = np.unique(texts, return_inverse=True)

From 25103254a6a20dd0e921f5fbc15c689a4d374f78 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Thu, 27 Mar 2025 17:56:06 +0400
Subject: [PATCH 88/97] Fix some typos

---
 src/lm_polygraph/generation_metrics/x_metric.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py
index bc7c7e483..a5bd84c92 100644
--- a/src/lm_polygraph/generation_metrics/x_metric.py
+++ b/src/lm_polygraph/generation_metrics/x_metric.py
@@ -11,7 +11,7 @@
 
 class XMetric(GenerationMetric):
     """
-    Calculates X-MERTIC (https://aclanthology.org/2023.wmt-1.63/)
+    Calculates X-METRIC (https://aclanthology.org/2023.wmt-1.63/)
     between model-generated texts and ground truth texts.
     """
 
@@ -73,7 +73,7 @@ def _filter_translation(self, text: str, ignore_regex: re.Pattern) -> str:
 
     
     def _prepare_inputs(self, translations: List[str], references: List[str]):
-        """Prepares the input data for X-MERTIC scoring."""
+        """Prepares the input data for X-METRIC scoring."""
         inputs = [
             f"candidate: {hyp} reference: {ref}" 
             for hyp, ref in zip(translations, references)
@@ -106,7 +106,7 @@ def __call__(
         target_texts: List[str],
     ) -> np.ndarray:
         """
-        Calculates X-MERTIC between stats['greedy_texts'] and target_texts.
+        Calculates X-METRIC between stats['greedy_texts'] and target_texts.
 
         Parameters:
             stats (Dict[str, np.ndarray]): input statistics, including:
@@ -142,4 +142,4 @@ def __call__(
         scores, _, _ = self.trainer.predict(test_dataset=inputs)
         for i, score in enumerate(scores):
             scores[i] = (25 - score) / 25
-        return scores
\ No newline at end of file
+        return scores

From a7bc19c50a6368a3de921e60f7a1f5af8cba4868 Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Sat, 29 Mar 2025 08:34:35 +0400
Subject: [PATCH 89/97] Gpt as a judge + Fixes to X Metric 24

---
 .../generation_metrics/__init__.py            |  1 +
 .../generation_metrics/gpt_judge_accuracy.py  | 90 +++++++++++++++++++
 .../generation_metrics/x_metric.py            | 31 +++++--
 3 files changed, 116 insertions(+), 6 deletions(-)
 create mode 100644 src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py

diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py
index f5e702401..83f7b58f6 100644
--- a/src/lm_polygraph/generation_metrics/__init__.py
+++ b/src/lm_polygraph/generation_metrics/__init__.py
@@ -11,3 +11,4 @@
 from .aggregated_metric import AggregatedMetric
 from .preprocess_output_target import PreprocessOutputTarget
 from .x_metric import XMetric
+from .gpt_judge_accuracy import GptAccuracyMetric
\ No newline at end of file
diff --git a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
new file mode 100644
index 000000000..d0cb36ed8
--- /dev/null
+++ b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
@@ -0,0 +1,90 @@
+import openai
+from .generation_metric import GenerationMetric
+import numpy as np
+import logging
+from typing import Dict, List
+import re 
+log = logging.getLogger("lm_polygraph")
+import os 
+
+class GptAccuracyMetric(GenerationMetric):
+    """
+    Uses GPT to compare generated text with target and return 1 if semantically equivalent, else 0.
+    """
+
+    def __init__(self, model="gpt-4o-mini", sample=False, sample_strategy="First", api_key=None):
+        if sample:
+            super().__init__([
+                "first_sample_texts",
+                "best_sample_texts",
+                "best_normalized_sample_texts",
+                "input_texts"],
+                "sequence")
+        else:
+            super().__init__(["greedy_texts", "input_texts"], "sequence")
+        self.sample = sample
+        self.sample_strategy = sample_strategy
+        self.model = model
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        openai.api_key = self.api_key
+
+    def __str__(self):
+        if self.sample == True:
+            return f"GptAccuracy_{self.model}_{self.sample_strategy}"
+        return f"GptAccuracy_{self.model}"
+    
+    def _filter_input(self, input):
+        matches = re.findall(r"Question:\s*(.*?)\nAnswer:", input, re.DOTALL)
+        if matches:
+            return matches[-1].strip()
+        return input
+    def _gpt_compare(self, output: str, target: str, question: str) -> int:
+        prompt = (
+            f"You are a text evaluator. The model was asked the following question: {question.strip()}.\n"
+            "The 'Generated' text is a model's response. The 'Target' is the correct answer.\n"
+            "If the generated answer correctly answers the question based on the target, return 1.\n"
+            "If it is wrong, return 0.\n"
+            "Respond ONLY with a single digit: 1 or 0.\n\n"
+            f"Generated: {output.strip()}\n"
+            f"Target: {target.strip()}"
+        )
+
+        try:
+            response = openai.ChatCompletion.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": "You are a strict evaluator of text similarity."},
+                    {"role": "user", "content": prompt}
+                ],
+                temperature=0,
+                max_tokens=1,
+                n=1
+            )
+
+            raw_reply = response['choices'][0]['message']['content'].strip()
+            return int(raw_reply) if raw_reply in ['0', '1'] else 0
+
+        except Exception as e:
+            log.error(f"GPT comparison failed: {e}")
+            return 0  # Safe default
+
+    def __call__(self, stats: Dict[str, np.ndarray], target_texts: List[str]) -> np.ndarray:
+        if self.sample:
+            if self.sample_strategy == "First":
+                gen_texts = stats["first_sample_texts"]
+            elif self.sample_strategy == "Best":
+                gen_texts = stats["best_sample_texts"]
+            elif self.sample_strategy == "BestNormalized":
+                gen_texts = stats["best_normalized_sample_texts"]
+            else:
+                raise ValueError(f"Invalid sample strategy: {self.sample_strategy}")
+        else:
+            gen_texts = stats["greedy_texts"]
+
+        results = []
+        input_texts = [self._filter_input(text) for text in stats["input_texts"]]
+        for output, target, input in zip(gen_texts, target_texts, input_texts):
+            score = self._gpt_compare(output, target,input)
+            results.append(score)
+
+        return np.array(results)
diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py
index bc7c7e483..4298fd126 100644
--- a/src/lm_polygraph/generation_metrics/x_metric.py
+++ b/src/lm_polygraph/generation_metrics/x_metric.py
@@ -7,7 +7,7 @@
 from .x_metric_utils import MT5ForRegression
 import torch 
 import datasets 
-from transformers import TrainingArguments, Trainer
+from transformers import TrainingArguments, DataCollatorWithPadding, Trainer
 
 class XMetric(GenerationMetric):
     """
@@ -36,15 +36,19 @@ def __init__(self, model ,tokenizer,
         self.translation_ignore_regex = (
             re.compile(translation_ignore_regex) if translation_ignore_regex else None
         )
+
         self.training_args = TrainingArguments(
             output_dir=".",
             per_device_eval_batch_size=1,
             dataloader_pin_memory=False,
         )
 
+        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
+
         self.trainer = Trainer(
             model=self.model,
             args=self.training_args,
+            data_collator=data_collator
         )
         self.sample = sample
         self.sample_strategy=sample_strategy
@@ -71,12 +75,22 @@ def _filter_source(self, text: str, ignore_regex: re.Pattern) -> str:
     def _filter_translation(self, text: str, ignore_regex: re.Pattern) -> str:
         return ignore_regex.sub("", text).strip() if ignore_regex else text.strip()
 
-    
-    def _prepare_inputs(self, translations: List[str], references: List[str]):
+    def _filter_text(self, text: str, ignore_regex: re.Pattern) -> str:
+        if ignore_regex is not None:
+            processed_text = ignore_regex.search(text)
+            if processed_text:
+                return processed_text.group(1)
+            else:
+                raise ValueError(
+                    f"Source text {text} does not match the ignore regex {ignore_regex}"
+                )
+        return text
+
+    def _prepare_inputs(self, translations: List[str], references: List[str], sources: List[str],):
         """Prepares the input data for X-MERTIC scoring."""
         inputs = [
-            f"candidate: {hyp} reference: {ref}" 
-            for hyp, ref in zip(translations, references)
+            f"source: {source} candidate: {hyp} reference: {ref}" 
+            for hyp, ref, source in zip(translations, references, sources)
         ]
         tokenized = self.tokenizer(
             inputs, 
@@ -138,7 +152,12 @@ def __call__(
             for tr in gen_texts
         ]
 
-        inputs = self._prepare_inputs(translations, references)
+        sources = [
+            self._filter_text(src, self.source_ignore_regex)
+            for src in stats["input_texts"]
+        ]
+
+        inputs = self._prepare_inputs(translations, references, sources)
         scores, _, _ = self.trainer.predict(test_dataset=inputs)
         for i, score in enumerate(scores):
             scores[i] = (25 - score) / 25

From 75ad725e7ca2d607c43a2d125a7ee88066aca21a Mon Sep 17 00:00:00 2001
From: silvimica <mayagoloburda@gmail.com>
Date: Sat, 29 Mar 2025 08:36:58 +0400
Subject: [PATCH 90/97] Polygraph eval code + remove redundant function from x
 metric

---
 scripts/polygraph_eval                          |  5 ++++-
 src/lm_polygraph/generation_metrics/x_metric.py | 10 ----------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 8435cb2cd..bcd4e976a 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -453,7 +453,7 @@ def get_generation_metrics(args):
             ckpt_path=ckpt_path,
             evaluation_mode="nli_sp",
         )
-
+        api_key =getattr(args, "openai_api_key", '') 
         result = [
             RougeMetric("rougeL"),
             BLEUMetric(),
@@ -503,6 +503,9 @@ def get_generation_metrics(args):
             AlignScore(align_scorer, sample=True, sample_strategy="BestNormalized"),
             AlignScore(align_scorer, target_is_claims=False, sample=True, sample_strategy="BestNormalized"),
             AlignScore(align_scorer, ignore_target=True, sample=True, sample_strategy="BestNormalized"),
+            GptAccuracyMetric( api_key=api_key),
+            GptAccuracyMetric( api_key=api_key,sample=True, sample_strategy="Best"),
+            GptAccuracyMetric( api_key=api_key, sample=True, sample_strategy="First"),
         ]
 
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py
index 4298fd126..4835e2428 100644
--- a/src/lm_polygraph/generation_metrics/x_metric.py
+++ b/src/lm_polygraph/generation_metrics/x_metric.py
@@ -62,16 +62,6 @@ def __str__(self):
                 return f"{self.sample_strategy}Samplexmetric"
         return "xmetric"
 
-    def _filter_source(self, text: str, ignore_regex: re.Pattern) -> str:
-        if ignore_regex is not None:
-            try:
-                return ignore_regex.findall(text)[-1]
-            except IndexError:
-                raise ValueError(
-                    f"Source text '{text}' does not match the ignore regex '{ignore_regex}'"
-                )
-        return text
-
     def _filter_translation(self, text: str, ignore_regex: re.Pattern) -> str:
         return ignore_regex.sub("", text).strip() if ignore_regex else text.strip()
 

From aa577fcd9e6c11b787248efd87fb8a1f2cf24579 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sat, 29 Mar 2025 19:46:41 +0400
Subject: [PATCH 91/97] Add multiref support without aggregation, some other
 tweaks

---
 scripts/polygraph_eval                        |  2 +-
 .../generation_metrics/gpt_judge_accuracy.py  | 46 +++++++++++++------
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index bcd4e976a..7b6bde8f4 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -572,7 +572,7 @@ def get_generation_metrics(args):
 
     if getattr(args, "multiref", False):
         # Wrap each metric in AggregatedMetric
-        result = [AggregatedMetric(base_metric=metric) for metric in result]
+        result = [AggregatedMetric(base_metric=metric)  if type(metric) != GptAccuracyMetric else metric for metric in result]
 
     log.info("Done with initializing generation metrics.")
 
diff --git a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
index d0cb36ed8..7e3a170df 100644
--- a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
+++ b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
@@ -6,6 +6,7 @@
 import re 
 log = logging.getLogger("lm_polygraph")
 import os 
+from tqdm import tqdm
 
 class GptAccuracyMetric(GenerationMetric):
     """
@@ -15,13 +16,15 @@ class GptAccuracyMetric(GenerationMetric):
     def __init__(self, model="gpt-4o-mini", sample=False, sample_strategy="First", api_key=None):
         if sample:
             super().__init__([
+                "no_fewshot_input_texts",
                 "first_sample_texts",
                 "best_sample_texts",
                 "best_normalized_sample_texts",
                 "input_texts"],
                 "sequence")
         else:
-            super().__init__(["greedy_texts", "input_texts"], "sequence")
+            super().__init__(["no_fewshot_input_texts", "greedy_texts", "input_texts"], "sequence")
+
         self.sample = sample
         self.sample_strategy = sample_strategy
         self.model = model
@@ -32,28 +35,41 @@ def __str__(self):
         if self.sample == True:
             return f"GptAccuracy_{self.model}_{self.sample_strategy}"
         return f"GptAccuracy_{self.model}"
-    
+
     def _filter_input(self, input):
         matches = re.findall(r"Question:\s*(.*?)\nAnswer:", input, re.DOTALL)
         if matches:
             return matches[-1].strip()
         return input
+
     def _gpt_compare(self, output: str, target: str, question: str) -> int:
-        prompt = (
-            f"You are a text evaluator. The model was asked the following question: {question.strip()}.\n"
-            "The 'Generated' text is a model's response. The 'Target' is the correct answer.\n"
-            "If the generated answer correctly answers the question based on the target, return 1.\n"
-            "If it is wrong, return 0.\n"
-            "Respond ONLY with a single digit: 1 or 0.\n\n"
-            f"Generated: {output.strip()}\n"
-            f"Target: {target.strip()}"
-        )
+        if type(target) == list:
+            str_target = ", ".join(target)
+            prompt = (
+                f"You are a text evaluator. The model was asked the following question:\n{question}\n"
+                "The 'Generated' text is a model's response. The 'Target' is the list of possible correct answers.\n"
+                "If the generated answer correctly answers the question (matches one of the target responses), return 1.\n"
+                "If it is wrong, return 0.\n"
+                "Respond ONLY with a single digit: 1 or 0.\n\n"
+                f"Generated: {output.strip()}\n"
+                f"Target list: {str_target.strip()}"
+            )
+        else:
+            prompt = (
+                f"You are a text evaluator. The model was asked the following question:\n{question}\n"
+                "The 'Generated' text is a model's response. The 'Target' is the correct answer.\n"
+                "If the generated answer correctly answers the question based on the target, return 1.\n"
+                "If it is wrong, return 0.\n"
+                "Respond ONLY with a single digit: 1 or 0.\n\n"
+                f"Generated: {output.strip()}\n"
+                f"Target: {target.strip()}"
+            )
 
         try:
             response = openai.ChatCompletion.create(
                 model=self.model,
                 messages=[
-                    {"role": "system", "content": "You are a strict evaluator of text similarity."},
+                    {"role": "system", "content": "You are a strict evaluator of correctness of the model's response."},
                     {"role": "user", "content": prompt}
                 ],
                 temperature=0,
@@ -62,6 +78,7 @@ def _gpt_compare(self, output: str, target: str, question: str) -> int:
             )
 
             raw_reply = response['choices'][0]['message']['content'].strip()
+
             return int(raw_reply) if raw_reply in ['0', '1'] else 0
 
         except Exception as e:
@@ -82,8 +99,9 @@ def __call__(self, stats: Dict[str, np.ndarray], target_texts: List[str]) -> np.
             gen_texts = stats["greedy_texts"]
 
         results = []
-        input_texts = [self._filter_input(text) for text in stats["input_texts"]]
-        for output, target, input in zip(gen_texts, target_texts, input_texts):
+        input_texts = stats["no_fewshot_input_texts"]
+
+        for output, target, input in tqdm(zip(gen_texts, target_texts, input_texts)):
             score = self._gpt_compare(output, target,input)
             results.append(score)
 

From 66fc7371b065d8b22f169ff312bb7c0908d88084 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sat, 29 Mar 2025 21:03:32 +0400
Subject: [PATCH 92/97] Show metricx progress

---
 src/lm_polygraph/generation_metrics/x_metric.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py
index 37b69de35..3e1444354 100644
--- a/src/lm_polygraph/generation_metrics/x_metric.py
+++ b/src/lm_polygraph/generation_metrics/x_metric.py
@@ -48,7 +48,8 @@ def __init__(self, model ,tokenizer,
         self.trainer = Trainer(
             model=self.model,
             args=self.training_args,
-            data_collator=data_collator
+            data_collator=data_collator,
+            disable_tqdm=False
         )
         self.sample = sample
         self.sample_strategy=sample_strategy

From dae4f26e8b90f30ddb107d018d55895f6f7daa01 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sat, 29 Mar 2025 21:05:04 +0400
Subject: [PATCH 93/97] Fix tqdm

---
 src/lm_polygraph/generation_metrics/x_metric.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py
index 3e1444354..e8f7fe6af 100644
--- a/src/lm_polygraph/generation_metrics/x_metric.py
+++ b/src/lm_polygraph/generation_metrics/x_metric.py
@@ -40,6 +40,7 @@ def __init__(self, model ,tokenizer,
         self.training_args = TrainingArguments(
             output_dir=".",
             per_device_eval_batch_size=1,
+            disable_tqdm=False,
             dataloader_pin_memory=False,
         )
 
@@ -49,7 +50,6 @@ def __init__(self, model ,tokenizer,
             model=self.model,
             args=self.training_args,
             data_collator=data_collator,
-            disable_tqdm=False
         )
         self.sample = sample
         self.sample_strategy=sample_strategy

From dd6bac43d24075aebab440303b9edfadc63a0dc3 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sun, 30 Mar 2025 12:58:13 +0400
Subject: [PATCH 94/97] Fix loading manager with torch 2.6+

---
 src/lm_polygraph/utils/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index 987177e34..99449b4bb 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -721,7 +721,7 @@ def load(load_path: str, **kwargs) -> "UEManager":
         Parameters:
             load_path (str): Path to file with saved benchmark results to load.
         """
-        res_dict = torch.load(load_path)
+        res_dict = torch.load(load_path, weights_only=False)
         default_kwargs = {
             "data": None,
             "model": None,

From af8ce4c805c18a348833db6e2c52361301877f14 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sun, 30 Mar 2025 14:58:34 +0400
Subject: [PATCH 95/97] Fix greedy semantic density

---
 src/lm_polygraph/estimators/semantic_density.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_polygraph/estimators/semantic_density.py b/src/lm_polygraph/estimators/semantic_density.py
index 1c09250b2..8e5c76427 100644
--- a/src/lm_polygraph/estimators/semantic_density.py
+++ b/src/lm_polygraph/estimators/semantic_density.py
@@ -94,7 +94,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         batch_sample_texts = stats["sample_texts"]
         batch_semantic_matrix_contra = stats["concat_greedy_semantic_matrix_contra_forward"]
         batch_semantic_matrix_neutral = stats["concat_greedy_semantic_matrix_neutral_forward"]
-        batch_greedy_log_likelihoods = stats["concat_greedy_log_likelihoods"]
+        batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"]
 
         semantic_density = []
         for batch_data in zip(

From ff96ac9da2d1092b76bc4e61572df8e9846db3c7 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sun, 30 Mar 2025 21:18:18 +0400
Subject: [PATCH 96/97] Turn semantic density around

---
 src/lm_polygraph/estimators/semantic_density.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lm_polygraph/estimators/semantic_density.py b/src/lm_polygraph/estimators/semantic_density.py
index 8e5c76427..693215a81 100644
--- a/src/lm_polygraph/estimators/semantic_density.py
+++ b/src/lm_polygraph/estimators/semantic_density.py
@@ -67,7 +67,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             semantic_density.append(np.sum(numerator) / np.sum(denominator))
 
-        return np.array(semantic_density)
+        return -np.array(semantic_density)
 
 
 class GreedySemanticDensity(Estimator):
@@ -134,4 +134,4 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
 
             semantic_density.append(np.sum(numerator) / np.sum(denominator))
 
-        return np.array(semantic_density)
+        return -np.array(semantic_density)

From c652a519bf43a305490478bf81ee5a82f8a78970 Mon Sep 17 00:00:00 2001
From: Roman Vashurin <rvashurin@gmail.com>
Date: Sun, 30 Mar 2025 21:23:37 +0400
Subject: [PATCH 97/97] Fix gpt naming

---
 src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
index 7e3a170df..ac95bfb58 100644
--- a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
+++ b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py
@@ -33,7 +33,10 @@ def __init__(self, model="gpt-4o-mini", sample=False, sample_strategy="First", a
 
     def __str__(self):
         if self.sample == True:
-            return f"GptAccuracy_{self.model}_{self.sample_strategy}"
+            if self.sample_strategy == "First":
+                return f"SampleGptAccuracy_{self.model}"
+            else:
+                return f"{self.sample_strategy}GptAccuracy_{self.model}"
         return f"GptAccuracy_{self.model}"
 
     def _filter_input(self, input):