From 3b438fd82cd22ee7521c0834acca05d116acb9f2 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 14 Oct 2024 13:03:04 +0300 Subject: [PATCH 01/97] Add templates for new methods --- .../polygraph_eval_triviaqa_sentsar.yaml | 86 +++++++++++++++++++ .../polygraph_eval_wmt14_fren_sentsar.yaml | 85 ++++++++++++++++++ .../polygraph_eval_wmt19_deen_sentsar.yaml | 85 ++++++++++++++++++ src/lm_polygraph/estimators/__init__.py | 7 +- src/lm_polygraph/estimators/sentence_sar.py | 67 +++++++++++++++ 5 files changed, 329 insertions(+), 1 deletion(-) create mode 100644 examples/configs/polygraph_eval_triviaqa_sentsar.yaml create mode 100644 examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml create mode 100644 examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml new file mode 100644 index 000000000..eaf197a0a --- /dev/null +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -0,0 +1,86 @@ +hydra: + run: + dir: ${cache_path}/${task}/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +task: qa + +dataset: [trivia_qa, rc.nocontext] +text_column: question +label_column: answer +prompt: "Question: {question}\nAnswer:{answer}" +few_shot_split: train +train_split: train +eval_split: validation +max_new_tokens: 20 +load_from_disk: false +n_shot: 5 +multiref: true +normalize: true +generation_params: + generate_until: + - "\n" + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + 
+use_density_based_ue: false +use_seq_ue: false +use_tok_ue: false +use_ens_ue: false +generation_metrics: null +ens_type: + +additional_estimators: + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.semantic_entropy + class_name: SemanticEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: ReweightedSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml new file mode 100644 index 000000000..4dcbf11a5 --- /dev/null +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -0,0 +1,85 @@ +hydra: + run: + dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +device: cpu + +task: nmt + +dataset: [wmt14, fr-en] +text_column: fr +label_column: en +prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n" +train_split: train +eval_split: test +max_new_tokens: 107 +load_from_disk: false +generation_params: + generate_until: + - "\n" + +source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" + +train_dataset: null +train_test_split: false 
+test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_ens_ue: false +use_seq_ue: false +use_tok_ue: false +generation_metrics: null + +additional_estimators: + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + - module: lm_polygraph.estimatorts.semantic_entropy + class_name: SemanticEntropy + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: ReweightedSentenceSAR + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml new file mode 100644 index 000000000..8cb7432ce --- /dev/null +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -0,0 +1,85 @@ +hydra: + run: + dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +device: cpu + +task: nmt + +dataset: [wmt19, de-en] +text_column: de +label_column: en +prompt: "Here is a sentence in {source_lang} language and its 
translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n" +train_split: train +eval_split: validation +max_new_tokens: 107 +load_from_disk: false +generation_params: + generate_until: + - "\n" + +source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_ens_ue: false +use_seq_ue: false +use_tok_ue: false + +additional_estimators: + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + - module: lm_polygraph.estimatorts.semantic_entropy + class_name: SemanticEntropy + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: ReweightedSentenceSAR + kwargs: + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 + diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index c9c16afa2..5287644f5 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -62,7 +62,12 @@ PESrmiabs, ) from .token_sar import TokenSAR -from .sentence_sar import SentenceSAR +from 
.sentence_sar import ( + SentenceSAR, + OtherSentenceSAR, + ReweightedSentenceSAR, + PPLSentenceSAR +) from .sar import SAR from .renyi_neg import RenyiNeg from .fisher_rao import FisherRao diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 44f762afe..f911496be 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -52,3 +52,70 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) + + +class OtherSentenceSAR(Estimator): + """ + Like SAR, but only looks at other samples for each sample in the output. + """ + + def __init__(self, verbose: bool = False): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.t = 0.001 + + def __str__(self): + return "OtherSentenceSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. 
+ """ + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = np.exp(np.array(sample_log_probs)) + R_s = ( + sample_probs + * sample_sentence_similarity + * (1 - np.eye(sample_sentence_similarity.shape[0])) + ) + sent_relevance = R_s.sum(-1) / self.t + E_s = -np.log(sent_relevance) + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) + + +class ReweightedSentenceSAR(Estimator): + """ + Like SAR, but normalizes similarity-based scores at each iteration + alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k)) + K - number of samples in output minus one + """ + + def __str__(self): + return "ReweightedSentenceSAR" + + +class PPLSentenceSAR(Estimator): + """ + Like SAR, but uses log probs normalized on sample length in tokens + Look at perplexity.py for an example + Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'] i think + """ + + def __str__(self): + return "PPLSentenceSAR" From e8c11cd955f29b9434d4843251a9982760343a6c Mon Sep 17 00:00:00 2001 From: SDUgitrep Date: Mon, 14 Oct 2024 17:33:42 +0400 Subject: [PATCH 02/97] PPL + Reweighted --- src/lm_polygraph/estimators/sentence_sar.py | 83 ++++++++++++++++++++- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index f911496be..c4516ab72 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -105,17 +105,94 @@ class ReweightedSentenceSAR(Estimator): alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k)) K - number of samples in output minus one """ + def __init__(self, verbose: bool = False): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + 
self.t = 0.001 def __str__(self): return "ReweightedSentenceSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + # Compute probabilities from log probabilities + sample_probs = np.exp(np.array(sample_log_probs)) + + # Initialize alpha_ij (reweighted sentence similarities) + alpha_ij = np.zeros_like(sample_sentence_similarity) + + # Normalize similarity-based scores at each iteration + for i in range(sample_sentence_similarity.shape[0]): + similarity_row = sample_sentence_similarity[i] + # Exclude self-similarity g(s_i, s_i) + similarity_row_without_self = similarity_row * (1 - np.eye(len(similarity_row)))[i] + sum_similarity = np.sum(similarity_row_without_self) + + if sum_similarity > 0: + alpha_ij[i] = similarity_row_without_self / sum_similarity + else: + alpha_ij[i] = similarity_row_without_self # If the normalization factor is 0, leave the row unchanged + + # Compute sentence relevance using normalized alpha_ij + R_s = sample_probs * alpha_ij + sent_relevance = R_s.sum(-1) / self.t + + # Compute SentenceSAR (Uncertainty Estimation) + E_s = -np.log(sent_relevance + sample_probs) + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) + class PPLSentenceSAR(Estimator): """ - Like SAR, but uses log probs normalized on sample length in tokens - Look at perplexity.py for an example - Tokenwise log-likelihoods are available in stats['sample_log_likelihoods'] i think + Like SAR, but uses log probs normalized by sample length in tokens to calculate PPL (Perplexity). + Tokenwise log-likelihoods are available in stats['sample_log_likelihoods']. 
""" + def __init__(self, verbose: bool = False): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.t = 0.001 def __str__(self): return "PPLSentenceSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the PPL-based sentence-level uncertainty using token-wise log-likelihoods. + + Parameters: + stats (Dict[str, np.ndarray]): Input statistics, including: + * 'sample_log_likelihoods': token-wise log-likelihoods for each sample. + + Returns: + np.ndarray: float PPL values for each sample. + Lower values indicate less uncertainty (better predictions), higher values indicate more uncertainty. + """ + # Extract token-wise log-likelihoods from the stats + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + + perplexities = [] + + # Loop over each sample's token-wise log-likelihoods + for sample_log_likelihoods in batch_sample_log_likelihoods: + # Calculate the number of tokens (length of the sample in tokens) + num_tokens = len(sample_log_likelihoods) + + # Calculate average log-likelihood for the sample + avg_log_likelihood = np.mean(sample_log_likelihoods) + + # Perplexity is exp(-avg_log_likelihood) + ppl = np.exp(-avg_log_likelihood) + + perplexities.append(ppl) + + return np.array(perplexities) From aa1f199f31ce97a67e3db5269efe5856e92f784c Mon Sep 17 00:00:00 2001 From: SDUgitrep Date: Mon, 14 Oct 2024 17:47:21 +0400 Subject: [PATCH 03/97] PPL upd --- src/lm_polygraph/estimators/sentence_sar.py | 25 ++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index c4516ab72..4bbbc67e9 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -179,11 +179,14 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ # Extract token-wise log-likelihoods from the stats 
batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - perplexities = [] + sentenceSAR = [] - # Loop over each sample's token-wise log-likelihoods - for sample_log_likelihoods in batch_sample_log_likelihoods: + # Loop over each sample's log-likelihoods and sentence similarities + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): # Calculate the number of tokens (length of the sample in tokens) num_tokens = len(sample_log_likelihoods) @@ -193,6 +196,18 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Perplexity is exp(-avg_log_likelihood) ppl = np.exp(-avg_log_likelihood) - perplexities.append(ppl) + # Initialize the sentence relevance (R_s) using PPL + R_s = ( + ppl # Use PPL instead of probabilities + * sample_sentence_similarity + * (1 - np.eye(sample_sentence_similarity.shape[0])) # Remove self-similarity + ) - return np.array(perplexities) + # Compute sentence relevance + sent_relevance = R_s.sum(-1) / self.t + + # Compute SentenceSAR (Uncertainty Estimation) using PPL + E_s = -np.log(sent_relevance + ppl) + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) From 3ba0f9132b905cfe5a164cc3946eea036516ba53 Mon Sep 17 00:00:00 2001 From: SDUgitrep Date: Mon, 14 Oct 2024 18:50:53 +0400 Subject: [PATCH 04/97] PPL upd --- src/lm_polygraph/estimators/sentence_sar.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 4bbbc67e9..90155ae11 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -188,11 +188,14 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods, batch_sample_sentence_similarity ): # Calculate the number of tokens (length of the sample in 
tokens) - num_tokens = len(sample_log_likelihoods) - # Calculate average log-likelihood for the sample - avg_log_likelihood = np.mean(sample_log_likelihoods) + token_log_likelihoods = [np.mean(token_ll) for token_ll in sample_log_likelihoods] + + # Calculate the number of tokens (length of the sample in tokens) + num_tokens = len(token_log_likelihoods) + # Calculate the mean log-likelihood across tokens + avg_log_likelihood = np.sum(token_log_likelihoods) / num_tokens # Perplexity is exp(-avg_log_likelihood) ppl = np.exp(-avg_log_likelihood) @@ -205,9 +208,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Compute sentence relevance sent_relevance = R_s.sum(-1) / self.t - + sample_probs = np.exp(np.array(sample_log_likelihoods)) # Compute SentenceSAR (Uncertainty Estimation) using PPL - E_s = -np.log(sent_relevance + ppl) + E_s = -np.log(sent_relevance + sample_probs) sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) From 94b8b183cd5bbf961751adddc37561828e257130 Mon Sep 17 00:00:00 2001 From: SDUgitrep Date: Mon, 14 Oct 2024 18:51:21 +0400 Subject: [PATCH 05/97] PPL upd --- src/lm_polygraph/estimators/sentence_sar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 90155ae11..7a1a2252d 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -210,7 +210,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sent_relevance = R_s.sum(-1) / self.t sample_probs = np.exp(np.array(sample_log_likelihoods)) # Compute SentenceSAR (Uncertainty Estimation) using PPL - E_s = -np.log(sent_relevance + sample_probs) + E_s = -np.log(sent_relevance + ppl) sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) From 0a437e68d735ee76e3173f182bf87e18765ec31e Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 15 Oct 2024 00:10:50 +0300 Subject: [PATCH 06/97] Fix ppl --- 
src/lm_polygraph/estimators/sentence_sar.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 7a1a2252d..97d646373 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -189,15 +189,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): # Calculate the number of tokens (length of the sample in tokens) - token_log_likelihoods = [np.mean(token_ll) for token_ll in sample_log_likelihoods] - - # Calculate the number of tokens (length of the sample in tokens) - num_tokens = len(token_log_likelihoods) - - # Calculate the mean log-likelihood across tokens - avg_log_likelihood = np.sum(token_log_likelihoods) / num_tokens - # Perplexity is exp(-avg_log_likelihood) - ppl = np.exp(-avg_log_likelihood) + token_log_likelihoods = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) # Initialize the sentence relevance (R_s) using PPL R_s = ( From a6a49ecc565ca97d2648688cd9638eb8136f578f Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 15 Oct 2024 17:41:19 +0300 Subject: [PATCH 07/97] Fix yamls --- .../polygraph_eval_wmt14_fren_sentsar.yaml | 18 +++++++++--------- .../polygraph_eval_wmt19_deen_sentsar.yaml | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 4dcbf11a5..715c808e9 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -50,31 +50,31 @@ generation_metrics: null additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability - kwargs: + kwargs: {} - module: lm_polygraph.estimators.perplexity class_name: Perplexity - kwargs: + kwargs: {} - module: lm_polygraph.estimators.token_sar 
class_name: TokenSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR - kwargs: + kwargs: {} - module: lm_polygraph.estimatorts.semantic_entropy class_name: SemanticEntropy - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: ReweightedSentenceSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSentenceSAR - kwargs: + kwargs: {} ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 8cb7432ce..f5371f7ee 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -49,31 +49,31 @@ use_tok_ue: false additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability - kwargs: + kwargs: {} - module: lm_polygraph.estimators.perplexity class_name: Perplexity - kwargs: + kwargs: {} - module: lm_polygraph.estimators.token_sar class_name: TokenSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR - kwargs: + kwargs: {} - module: lm_polygraph.estimatorts.semantic_entropy class_name: SemanticEntropy - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: ReweightedSentenceSAR - kwargs: + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSentenceSAR - kwargs: + kwargs: {} ignore_exceptions: false From a5d5dc72f4ef739112e312efe1703ae677b04af5 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 16 Oct 2024 
14:20:46 +0300 Subject: [PATCH 08/97] Fix small bugs --- examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 +- examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 +- src/lm_polygraph/estimators/sentence_sar.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 715c808e9..a67f961fa 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -60,7 +60,7 @@ additional_estimators: - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - - module: lm_polygraph.estimatorts.semantic_entropy + - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index f5371f7ee..77c0b2f62 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -59,7 +59,7 @@ additional_estimators: - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - - module: lm_polygraph.estimatorts.semantic_entropy + - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 97d646373..ff2f00dc1 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -187,9 +187,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_likelihoods, sample_sentence_similarity in zip( batch_sample_log_likelihoods, batch_sample_sentence_similarity ): - # Calculate the number of tokens (length of the sample in tokens) - - token_log_likelihoods = 
np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) # Initialize the sentence relevance (R_s) using PPL R_s = ( From d8a7278b08ad543ce20a12fc635f1680a8ade880 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 16 Oct 2024 14:57:33 +0300 Subject: [PATCH 09/97] Remove redundant line --- src/lm_polygraph/estimators/sentence_sar.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index ff2f00dc1..790a8f34e 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -198,7 +198,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Compute sentence relevance sent_relevance = R_s.sum(-1) / self.t - sample_probs = np.exp(np.array(sample_log_likelihoods)) # Compute SentenceSAR (Uncertainty Estimation) using PPL E_s = -np.log(sent_relevance + ppl) sentenceSAR.append(E_s.mean()) From bb25a126896c267ab4fc1e583990e3d1a69b1b67 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 25 Oct 2024 12:43:27 +0400 Subject: [PATCH 10/97] Add distilled sars --- .../polygraph_eval_triviaqa_sentsar.yaml | 42 ++++++ src/lm_polygraph/estimators/__init__.py | 4 +- src/lm_polygraph/estimators/sentence_sar.py | 142 +++++++++++++++++- 3 files changed, 183 insertions(+), 5 deletions(-) diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index eaf197a0a..9686c1aef 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -70,12 +70,54 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1.0 + - module: lm_polygraph.estimators.sentence_sar + class_name: 
OtherSentenceSAR + kwargs: + t: 1.0 + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1.0 + use_log: false + reverse: false - module: lm_polygraph.estimators.sentence_sar class_name: ReweightedSentenceSAR kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilOneSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilOneSentenceSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilOneSentenceSAR + kwargs: + use_log: false + reverse: false ignore_exceptions: false diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 5287644f5..25f57e792 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -66,7 +66,9 @@ SentenceSAR, OtherSentenceSAR, ReweightedSentenceSAR, - PPLSentenceSAR + PPLSentenceSAR, + DistilSentenceSAR, + DistilOneSentenceSAR, ) from .sar import SAR from .renyi_neg import RenyiNeg diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 790a8f34e..6f51f0ba2 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -1,6 +1,7 @@ import numpy as np from typing import Dict +from copy import deepcopy from .estimator import Estimator @@ -59,13 +60,20 @@ class OtherSentenceSAR(Estimator): Like SAR, but only looks at other samples for each sample in 
the output. """ - def __init__(self, verbose: bool = False): + def __init__(self, verbose: bool = False, t: float = 0.001, use_log: bool = True, reverse: bool = False): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose - self.t = 0.001 + self.t = t + self.use_log = use_log + self.reverse = reverse def __str__(self): - return "OtherSentenceSAR" + base = f"OtherSentenceSAR_{self.t}" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -93,7 +101,15 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: * (1 - np.eye(sample_sentence_similarity.shape[0])) ) sent_relevance = R_s.sum(-1) / self.t - E_s = -np.log(sent_relevance) + + if self.use_log: + E_s = -np.log(sent_relevance) + else: + if self.reverse: + E_s = sent_relevance + else: + E_s = -sent_relevance + sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) @@ -203,3 +219,121 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) + + +class DistilSentenceSAR(Estimator): + """ + Like SAR, but only looks at other samples for each sample in the output. + """ + + def __init__(self, verbose: bool = False, use_log: bool = True, reverse: bool = False): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = f"DistilSentenceSAR" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. 
+ + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = np.exp(np.array(sample_log_probs)) + R_s = ( + sample_probs + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if self.use_log: + E_s = -np.log(sent_relevance) + else: + if self.reverse: + E_s = sent_relevance + else: + E_s = -sent_relevance + + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) + + +class DistilOneSentenceSAR(Estimator): + """ + Like SAR, but only looks at other samples for each sample in the output. + """ + + def __init__(self, verbose: bool = False, use_log: bool = True, reverse: bool = False): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = f"DistilOneSentenceSAR" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. 
+ Higher values indicate more uncertain samples. + """ + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = deepcopy(stats["sample_sentence_similarity"]) + + sentenceSAR = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = np.exp(np.array(sample_log_probs)) + np.fill_diagonal(sample_sentence_similarity, 1) + + R_s = ( + sample_probs + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if self.use_log: + E_s = -np.log(sent_relevance) + else: + if self.reverse: + E_s = sent_relevance + else: + E_s = -sent_relevance + + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) From 0c7644924fead55849b8c4f6d2097d170c351cfc Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 25 Oct 2024 14:23:39 +0400 Subject: [PATCH 11/97] WiP --- src/lm_polygraph/utils/manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 263034002..5d7d1e03d 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -256,6 +256,7 @@ def __init__( max_new_tokens: int = 100, background_train_dataset_max_new_tokens: int = 100, cache_path=os.path.expanduser("~") + "/.cache", + save_stats: List[str] = [] ): """ Parameters: @@ -400,6 +401,7 @@ def __init__( self.metrics: Dict[Tuple[str, str, str, str], float] = {} self.total_bad_estimators: Dict[Estimator, float] = {} self.stats: Dict[str, List] = defaultdict(list) + self.save_stats = list(set(['greedy_texts', 'greedy_tokens']) + set(save_stats)) self.processors = processors self.ignore_exceptions = ignore_exceptions @@ -474,7 +476,7 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: self.gen_metrics[generation_metric.level, str(generation_metric)] += m batch_gen_metrics[generation_metric.level, str(generation_metric)] += m - for key in ["greedy_texts", 
"greedy_tokens"]: + for key in self.save_stats: if key in batch_stats.keys(): self.stats[key] += batch_stats[key] for processor in self.processors: From d064d905c384b95304dabb3e5911ddf7ddccca80 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 30 Oct 2024 21:37:38 +0400 Subject: [PATCH 12/97] Add new sar variants --- .../polygraph_eval_triviaqa_sentsar.yaml | 76 ++++- src/lm_polygraph/estimators/__init__.py | 6 +- src/lm_polygraph/estimators/sar.py | 6 +- src/lm_polygraph/estimators/sentence_sar.py | 286 +++++++++++++++++- 4 files changed, 364 insertions(+), 10 deletions(-) diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index eaf197a0a..29662b6d1 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -61,21 +61,93 @@ additional_estimators: - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: ReweightedSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: ReweightedSentenceSAR + class_name: OtherSentenceSAR + kwargs: + t: 1 + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DisilSentenceSAR kwargs: {} - module: 
lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: true ignore_exceptions: false diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 5287644f5..539450637 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -66,7 +66,11 @@ SentenceSAR, OtherSentenceSAR, ReweightedSentenceSAR, - PPLSentenceSAR + PPLSentenceSAR, + DistilSentenceSAR, + DistilSAR, + DistilPPLSAR, + DistilMTESAR, ) from .sar import SAR from .renyi_neg import RenyiNeg diff --git a/src/lm_polygraph/estimators/sar.py b/src/lm_polygraph/estimators/sar.py index 2aed3fa76..57e9c2902 100644 --- a/src/lm_polygraph/estimators/sar.py +++ b/src/lm_polygraph/estimators/sar.py @@ -15,7 +15,7 @@ class SAR(Estimator): and text relevance relative to all other 
generations. """ - def __init__(self, verbose: bool = False): + def __init__(self, verbose: bool = False, t: float = 0.001): super().__init__( [ "sample_sentence_similarity", @@ -25,10 +25,10 @@ def __init__(self, verbose: bool = False): "sequence", ) self.verbose = verbose - self.t = 0.001 + self.t = t def __str__(self): - return "SAR" + return f"SAR_t{self.t}" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 790a8f34e..082f9be79 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -59,13 +59,26 @@ class OtherSentenceSAR(Estimator): Like SAR, but only looks at other samples for each sample in the output. """ - def __init__(self, verbose: bool = False): + def __init__( + self, + verbose: bool = False, + t: float = 0.001, + use_log: bool = True, + reverse: bool = False + ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose - self.t = 0.001 + self.t = t + self.use_log = use_log + self.reverse = reverse def __str__(self): - return "OtherSentenceSAR" + base = f"OtherSentenceSAR_t{self.t}" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -93,7 +106,15 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: * (1 - np.eye(sample_sentence_similarity.shape[0])) ) sent_relevance = R_s.sum(-1) / self.t - E_s = -np.log(sent_relevance) + + if use_log: + E_s = -np.log(sent_relevance) + else: + if reverse: + E_s = -sent_relevance + else: + E_s = sent_relevance + sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) @@ -203,3 +224,260 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) + + +class DistilSentenceSAR(Estimator): + """ + Like 
SAR, but only looks at other samples for each sample in the output. + """ + + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "DistilSentenceSAR" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = np.exp(np.array(sample_log_probs)) + R_s = ( + sample_probs + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if use_log: + E_s = -np.log(sent_relevance) + else: + if reverse: + E_s = -sent_relevance + else: + E_s = sent_relevance + + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) + + +class DistilSAR(Estimator): + """ + Like SAR, but only looks at other samples for each sample in the output. 
+ """ + + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "DistilSentenceSAR" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = np.exp(np.array(sample_log_probs)) + R_s = ( + sample_probs + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if use_log: + E_s = -np.log(sent_relevance) + else: + if reverse: + E_s = -sent_relevance + else: + E_s = sent_relevance + + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) + + +class DistilPPLSAR(Estimator): + """ + Like SAR, but only looks at other samples for each sample in the output. 
+ """ + + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "DistilPPLSAR" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_likelihoods, batch_sample_sentence_similarity + ): + ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + + R_s = ( + ppl + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if use_log: + E_s = -np.log(sent_relevance) + else: + if reverse: + E_s = -sent_relevance + else: + E_s = sent_relevance + + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) + + +class DistilMTESAR(Estimator): + """ + Like SAR, but only looks at other samples for each sample in the output. 
+ """ + + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "DistilMTESAR" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_likelihoods, batch_sample_sentence_similarity + ): + entropy = [] + for lp in sample_log_likelihoods: + mask = ~np.isinf(lp) + entropy.append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask]))) + + R_s = ( + entropy + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if use_log: + E_s = -np.log(sent_relevance) + else: + if reverse: + E_s = -sent_relevance + else: + E_s = sent_relevance + + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) From ff39d6ede32bca7ac05932aec13bca620e36fa55 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 31 Oct 2024 11:34:54 +0400 Subject: [PATCH 13/97] Fix various errors --- .../polygraph_eval_triviaqa_sentsar.yaml | 34 +++++---- scripts/polygraph_eval | 2 +- src/lm_polygraph/estimators/sentence_sar.py | 70 ++++++++++++------- 3 files changed, 67 
insertions(+), 39 deletions(-) diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 29662b6d1..41de5578c 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -58,6 +58,12 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ -97,7 +103,7 @@ additional_estimators: use_log: false reverse: true - module: lm_polygraph.estimators.sentence_sar - class_name: DisilSentenceSAR + class_name: DistilSentenceSAR kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilSentenceSAR @@ -135,19 +141,19 @@ additional_estimators: kwargs: use_log: false reverse: true - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR - kwargs: - use_log: false - reverse: true + # - module: lm_polygraph.estimators.sentence_sar + # class_name: DistilMTESAR + # kwargs: {} + # - module: lm_polygraph.estimators.sentence_sar + # class_name: DistilMTESAR + # kwargs: + # use_log: false + # reverse: false + # - module: lm_polygraph.estimators.sentence_sar + # class_name: DistilMTESAR + # kwargs: + # use_log: false + # reverse: true ignore_exceptions: false diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 23f6db69e..70abd8523 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -89,7 +89,7 @@ def main(args): 
instruct=getattr(args, "instruct", None), split=args.eval_split, load_from_disk=args.load_from_disk, - trust_remote_code=getattr(args, "trust_remote_code", False), + #trust_remote_code=getattr(args, "trust_remote_code", False), **cache_kwargs ) log.info("Done with loading eval data.") diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 082f9be79..f7101582a 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -107,10 +107,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ) sent_relevance = R_s.sum(-1) / self.t - if use_log: + if self.use_log: E_s = -np.log(sent_relevance) else: - if reverse: + if self.reverse: E_s = -sent_relevance else: E_s = sent_relevance @@ -276,10 +276,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ) sent_relevance = R_s.sum(-1) - if use_log: + if self.use_log: E_s = -np.log(sent_relevance) else: - if reverse: + if self.reverse: E_s = -sent_relevance else: E_s = sent_relevance @@ -306,7 +306,7 @@ def __init__( self.reverse = reverse def __str__(self): - base = "DistilSentenceSAR" + base = "DistilSAR" if not self.use_log: base += "_no_log" if self.reverse: @@ -325,31 +325,50 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: np.ndarray: float sentenceSAR for each sample in input statistics. Higher values indicate more uncertain samples. 
""" - batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_token_similarity = stats["sample_token_similarity"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - sentenceSAR = [] - for sample_log_probs, sample_sentence_similarity in zip( - batch_sample_log_probs, batch_sample_sentence_similarity + SAR = [] + for batch_data in zip( + batch_sample_log_likelihoods, + batch_sample_token_similarity, + batch_sample_sentence_similarity, ): - sample_probs = np.exp(np.array(sample_log_probs)) + sample_log_likelihoods = batch_data[0] + sample_token_similarity = batch_data[1] + sample_sentence_similarity = batch_data[2] + + tokenSAR = [] + for log_likelihoods, token_similarity in zip( + sample_log_likelihoods, sample_token_similarity + ): + log_likelihoods = np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + tokenSAR = np.array(tokenSAR) + probs_token_sar = np.exp(-tokenSAR) + R_s = ( - sample_probs + probs_token_sar * sample_sentence_similarity ) sent_relevance = R_s.sum(-1) - if use_log: + if self.use_log: E_s = -np.log(sent_relevance) else: - if reverse: + if self.reverse: E_s = -sent_relevance else: E_s = sent_relevance - sentenceSAR.append(E_s.mean()) + SAR.append(E_s.mean()) - return np.array(sentenceSAR) + return np.array(SAR) class DistilPPLSAR(Estimator): @@ -393,7 +412,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR = [] for sample_log_likelihoods, sample_sentence_similarity in zip( - batch_sample_likelihoods, batch_sample_sentence_similarity + batch_sample_log_likelihoods, batch_sample_sentence_similarity ): ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) @@ -403,10 +422,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ) sent_relevance = R_s.sum(-1) - if use_log: + if self.use_log: E_s = 
-np.log(sent_relevance) else: - if reverse: + if self.reverse: E_s = -sent_relevance else: E_s = sent_relevance @@ -457,12 +476,15 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR = [] for sample_log_likelihoods, sample_sentence_similarity in zip( - batch_sample_likelihoods, batch_sample_sentence_similarity + batch_sample_log_likelihoods, batch_sample_sentence_similarity ): entropy = [] - for lp in sample_log_likelihoods: - mask = ~np.isinf(lp) - entropy.append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask]))) + for seq_lp in sample_log_likelihoods: + seq_entropy = [] + for lp in seq_lp: + mask = ~np.isinf(lp) + seq_entropy.append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask]))) + entropy.append(np.mean(seq_entropy)) R_s = ( entropy @@ -470,10 +492,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ) sent_relevance = R_s.sum(-1) - if use_log: + if self.use_log: E_s = -np.log(sent_relevance) else: - if reverse: + if self.reverse: E_s = -sent_relevance else: E_s = sent_relevance From af30cb75c240c57f73c8c90759fdbb9571986e3e Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 31 Oct 2024 14:14:35 +0400 Subject: [PATCH 14/97] Add xsum config, different alignscore versions --- .../polygraph_eval_wmt14_fren_sentsar.yaml | 69 +++++++- .../polygraph_eval_wmt19_deen_sentsar.yaml | 70 ++++++++- .../configs/polygraph_eval_xsum_sentsar.yaml | 148 ++++++++++++++++++ scripts/polygraph_eval | 4 +- .../generation_metrics/alignscore.py | 27 +++- 5 files changed, 306 insertions(+), 12 deletions(-) create mode 100644 examples/configs/polygraph_eval_xsum_sentsar.yaml diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index a67f961fa..b0635ce2a 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -57,24 +57,89 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar 
class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: ReweightedSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: ReweightedSentenceSAR + class_name: OtherSentenceSAR + kwargs: + t: 1 + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: true + - module: 
lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 77c0b2f62..0c0f0e730 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -56,24 +56,89 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: ReweightedSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: ReweightedSentenceSAR + class_name: OtherSentenceSAR + kwargs: + t: 1 + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: true + - module: 
lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: true ignore_exceptions: false @@ -82,4 +147,3 @@ deberta_batch_size: 1 seed: - 1 - diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml new file mode 100644 index 000000000..3af6b12f3 --- /dev/null +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -0,0 +1,148 @@ +hydra: + run: + dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +device: cpu + +task: ats + +dataset: xsum +text_column: document +label_column: summary +prompt: "Here's the text and it's short one-sentence summary.\n\nText:\n{text}\n\nSummary (one sentence):\n" +train_split: train +eval_split: test +max_new_tokens: 56 +load_from_disk: false +trust_remote_code: true +generation_params: + generate_until: + - "\n" + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: 
allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_seq_ue: false +use_tok_ue: false +use_ens_ue: false + +additional_estimators: + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 + - module: lm_polygraph.estimators.semantic_entropy + class_name: SemanticEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: ReweightedSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: OtherSentenceSAR + kwargs: + t: 1 + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: 
DistilSentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSentenceSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilSAR + kwargs: + use_log: false + reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilPPLSAR + kwargs: + use_log: false + reverse: true + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 70abd8523..239dc921f 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -395,7 +395,9 @@ def get_generation_metrics(args): output_ignore_regex = getattr(args, "output_ignore_regex", None), normalize = getattr(args, "normalize", False), ), - AlignScore(target_is_claims=False if args.task == "ats" else True), + AlignScore(), + AlignScore(target_is_claims=False), + AlignScore(ignore_target=True), ] if getattr(args.model, "type", "Whitebox") != "Blackbox": if getattr(args, "use_claim_ue", False): diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py index a1f9a63d7..139b558e1 100644 --- a/src/lm_polygraph/generation_metrics/alignscore.py +++ b/src/lm_polygraph/generation_metrics/alignscore.py @@ -18,11 +18,13 @@ def __init__( ckpt_path="https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt", batch_size=16, 
target_is_claims=True, + ignore_target=False, ): super().__init__(["greedy_texts", "input_texts"], "sequence") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.target_is_claims = target_is_claims self.batch_size = batch_size + self.ignore_target = ignore_target self.scorer = AlignScorer( model="roberta-large", batch_size=batch_size, @@ -32,7 +34,14 @@ def __init__( ) def __str__(self): - return "AlignScore" + base = "AlignScore" + if self.ignore_target: + base += "InputOutput" + elif self.target_is_claims: + base += "OutputTarget" + else: + base += "TargetOutput" + return base def __call__( self, @@ -51,16 +60,22 @@ def __call__( np.ndarray: list of AlignScore Scores for each sample in input. """ greedy_texts = stats["greedy_texts"] + input_texts = stats["input_texts"] filtered_targets = [x if len(x.strip()) else "(empty)" for x in target_texts] filtered_outputs = [x if len(x.strip()) else "(empty)" for x in greedy_texts] + filtered_inputs = [x if len(x.strip()) else "(empty)" for x in input_texts] - if self.target_is_claims: - claims = filtered_targets - contexts = filtered_outputs - else: + if self.ignore_target: claims = filtered_outputs - contexts = filtered_targets + contexts = filtered_inputs + else: + if self.target_is_claims: + claims = filtered_targets + contexts = filtered_outputs + else: + claims = filtered_outputs + contexts = filtered_targets scores = np.array( self.scorer.score( From 3fad16bf37774ecbdcae71f544373fbb9984d190 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 8 Nov 2024 15:36:30 +0400 Subject: [PATCH 15/97] Smallfix --- src/lm_polygraph/estimators/sentence_sar.py | 2 +- src/lm_polygraph/utils/manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index 6f51f0ba2..48c4ca0b0 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -319,7 
+319,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): sample_probs = np.exp(np.array(sample_log_probs)) np.fill_diagonal(sample_sentence_similarity, 1) - + R_s = ( sample_probs * sample_sentence_similarity diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 5d7d1e03d..8e903c731 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -401,7 +401,7 @@ def __init__( self.metrics: Dict[Tuple[str, str, str, str], float] = {} self.total_bad_estimators: Dict[Estimator, float] = {} self.stats: Dict[str, List] = defaultdict(list) - self.save_stats = list(set(['greedy_texts', 'greedy_tokens']) + set(save_stats)) + self.save_stats = list(set(['greedy_texts', 'greedy_tokens']).union(set(save_stats))) self.processors = processors self.ignore_exceptions = ignore_exceptions From b1f0346da886ce44f709a9e300eaa55466282cdb Mon Sep 17 00:00:00 2001 From: silvimica Date: Sat, 9 Nov 2024 19:24:22 +0400 Subject: [PATCH 16/97] Sample entropy --- src/lm_polygraph/stat_calculators/__init__.py | 1 + src/lm_polygraph/stat_calculators/entropy.py | 26 +++++++++++++++++++ .../utils/register_stat_calculators.py | 1 + 3 files changed, 28 insertions(+) diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index 7bf5b7c21..f69abd428 100644 --- a/src/lm_polygraph/stat_calculators/__init__.py +++ b/src/lm_polygraph/stat_calculators/__init__.py @@ -9,6 +9,7 @@ OPENAI_FACT_CHECK_PROMPTS, ) from .entropy import EntropyCalculator +from .entropy import SampleEntropyCalculator from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator from .greedy_alternatives_nli import ( GreedyAlternativesNLICalculator, diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py index 1696007fd..02939569d 100644 --- a/src/lm_polygraph/stat_calculators/entropy.py +++ 
b/src/lm_polygraph/stat_calculators/entropy.py @@ -42,3 +42,29 @@ def __call__( mask = ~np.isinf(lp) entropies[-1].append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask]))) return {"entropy": entropies} + +class SampleEntropyCalculator(StatCalculator): + def __init__(self): + super().__init__(["sample_entropy"], ["sample_log_likelihoods"]) + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str] = None, + model: WhiteboxModel = None, + max_new_tokens: int = 100, + **kwargs, + ) -> Dict[str, np.ndarray]: + logprobs = dependencies["sample_log_likelihoods"] + entropies = [] + + for sample_log_probs in logprobs: + for token_log_probs in sample_log_probs: + token_log_probs = np.array(token_log_probs) + probabilities = np.exp(token_log_probs) + + mask = ~np.isinf(token_log_probs) + sample_entropy = -np.sum(probabilities[mask] * token_log_probs[mask]) + + entropies.append(sample_entropy) + return {"sample_entropy": entropies} \ No newline at end of file diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index 7588ed1c6..2a46b5740 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -63,6 +63,7 @@ def _register(calculator_class: StatCalculator): else: _register(GreedyProbsCalculator(n_alternatives=n_ccp_alternatives)) _register(EntropyCalculator()) + _register(SampleEntropyCalculator()) _register(GreedyLMProbsCalculator()) _register(SamplingGenerationCalculator()) _register(BartScoreCalculator()) From 73e6a6ffbbc18192125fa24b19be64d72aa77375 Mon Sep 17 00:00:00 2001 From: silvimica Date: Sat, 9 Nov 2024 19:37:42 +0400 Subject: [PATCH 17/97] Add entropy-based sentence sar --- src/lm_polygraph/estimators/__init__.py | 1 + src/lm_polygraph/estimators/sentence_sar.py | 53 +++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/src/lm_polygraph/estimators/__init__.py 
b/src/lm_polygraph/estimators/__init__.py index 539450637..17d70def6 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -71,6 +71,7 @@ DistilSAR, DistilPPLSAR, DistilMTESAR, + EntropySentenceSAR, ) from .sar import SAR from .renyi_neg import RenyiNeg diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index f7101582a..5d187b0db 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -503,3 +503,56 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) + + +class EntropySentenceSAR(Estimator): + """ + Like SAR, but uses sample entropy calculated from token-wise log probs for each sample. + Tokenwise log-likelihoods are available in stats['sample_log_likelihoods']. + """ + def __init__(self, verbose: bool = False): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + self.t = 0.001 + + def __str__(self): + return "EntropySentenceSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the Entropy-based sentence-level uncertainty using token-wise log-likelihoods. + + Parameters: + stats (Dict[str, np.ndarray]): Input statistics, including: + * 'sample_log_likelihoods': token-wise log-likelihoods for each sample. + + Returns: + np.ndarray: float PPL values for each sample. + Lower values indicate less uncertainty (better predictions), higher values indicate more uncertainty. 
+ """ + # Extract token-wise log-likelihoods from the stats + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + + # Loop over each sample's log-likelihoods and sentence similarities + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + entropy = sample_entropy + # Initialize the sentence relevance (R_s) using PPL + R_s = ( + entropy # Use entropy instead of probabilities + * sample_sentence_similarity + * (1 - np.eye(sample_sentence_similarity.shape[0])) # Remove self-similarity + ) + + # Compute sentence relevance + sent_relevance = R_s.sum(-1) / self.t + # Compute SentenceSAR (Uncertainty Estimation) using PPL + E_s = -np.log(sent_relevance + entropy) + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) + From d3d6723f8329ff1c3366d241ceb60164bfdee653 Mon Sep 17 00:00:00 2001 From: silvimica Date: Tue, 12 Nov 2024 19:08:44 +0400 Subject: [PATCH 18/97] New sentences --- requirements.txt | 2 +- src/lm_polygraph/estimators/__init__.py | 1 + src/lm_polygraph/estimators/sentence_sar.py | 70 +++++++++++++++++++- src/lm_polygraph/stat_calculators/entropy.py | 29 +++++--- src/lm_polygraph/stat_calculators/sample.py | 11 ++- 5 files changed, 98 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index 949524b00..39fc2a345 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,5 +33,5 @@ bert-score unbabel-comet==2.2.1 nltk>=3.7,<4 evaluate -spacy>=3.4.0,<4 +spacy>=3.4.0,<3.8 fastchat diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 5d66c3523..2eb69d448 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -67,6 +67,7 @@ OtherSentenceSAR, ReweightedSentenceSAR, PPLSentenceSAR, + MTESentenceSAR, DistilSentenceSAR, DistilOneSentenceSAR, DistilSAR, diff --git 
a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index f30aad7ec..fede78ded 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -501,7 +501,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(sentenceSAR) -class DistilMTESAR(Estimator): +class MTESentenceSAR(Estimator): """ Like SAR, but uses sample entropy calculated from token-wise log probs for each sample. Tokenwise log-likelihoods are available in stats['sample_log_likelihoods']. @@ -512,7 +512,7 @@ def __init__(self, verbose: bool = False): self.t = 0.001 def __str__(self): - return "EntropySentenceSAR" + return "MTESentenceSAR" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -551,3 +551,69 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) + + + + +class DistilMTESAR(Estimator): + """ + Like SAR, but uses Mean Token Entropy (MTE) calculated from token-wise log probs for each sample. + Token-wise log-likelihoods are available in stats['sample_entropy']. + """ + + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "DistilMTESAR" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample using Mean Token Entropy (MTE). + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * 'sample_entropy': Mean Token Entropy for each sample, + * 'sample_sentence_similarity': matrix with cross-encoder similarities. 
+ + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + sentenceSAR = [] + + # Loop over each sample's Mean Token Entropy and sentence similarities + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + # Use MTE for sentence relevance calculation + R_s = sample_entropy * sample_sentence_similarity + + # Compute sentence relevance by summing along the last axis + sent_relevance = R_s.sum(-1) + + # Calculate E_s with options for log transformation and reversal + if self.use_log: + E_s = -np.log(sent_relevance) + else: + E_s = -sent_relevance if self.reverse else sent_relevance + + sentenceSAR.append(E_s.mean()) + + return np.array(sentenceSAR) diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py index 02939569d..dbc49a4c8 100644 --- a/src/lm_polygraph/stat_calculators/entropy.py +++ b/src/lm_polygraph/stat_calculators/entropy.py @@ -4,7 +4,8 @@ from .stat_calculator import StatCalculator from lm_polygraph.utils.model import WhiteboxModel - +import torch +from torch.nn import functional as F class EntropyCalculator(StatCalculator): """ @@ -45,7 +46,7 @@ def __call__( class SampleEntropyCalculator(StatCalculator): def __init__(self): - super().__init__(["sample_entropy"], ["sample_log_likelihoods"]) + super().__init__(["sample_entropy"], ["token_distributions"]) def __call__( self, @@ -55,16 +56,22 @@ def __call__( max_new_tokens: int = 100, **kwargs, ) -> Dict[str, np.ndarray]: - logprobs = dependencies["sample_log_likelihoods"] + token_distributions = dependencies["token_distributions"] entropies = [] - for sample_log_probs in logprobs: - for token_log_probs in sample_log_probs: - token_log_probs = np.array(token_log_probs) - probabilities = 
np.exp(token_log_probs) + for sample_distributions in token_distributions: + sample_entropies = [] + for token_dist in sample_distributions: + # Convert token_dist to a numpy array first, then to a torch tensor + token_dist_tensor = torch.tensor(np.array(token_dist)) + + # Calculate entropy using torch's Categorical distribution + # Apply mean() in case the entropy returns a multi-element tensor + entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy().mean() + sample_entropies.append(entropy.item()) # Convert to a scalar value if needed - mask = ~np.isinf(token_log_probs) - sample_entropy = -np.sum(probabilities[mask] * token_log_probs[mask]) - - entropies.append(sample_entropy) + # Calculate mean entropy for the sample + mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0 + entropies.append(mean_entropy.item()) + return {"sample_entropy": entropies} \ No newline at end of file diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index 96a447a57..6f1fb2236 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -98,6 +98,7 @@ def __init__(self, samples_n: int = 10): "sample_tokens", "sample_texts", "sample_log_likelihoods", + "token_distributions", ], [], ) @@ -123,6 +124,7 @@ def __call__( - 'sample_tokens' (List[List[List[float]]]): tokenized 'sample_texts', - 'sample_log_probs' (List[List[float]]): sum of the log probabilities at each token of the sampling generation. - 'sample_log_likelihoods' (List[List[List[float]]]): log probabilities at each token of the sampling generation. + - 'token_distributions' (List[List[List[float]]]): full token probability distributions for each generated token. 
""" batch: Dict[str, torch.Tensor] = model.tokenize(texts) batch = {k: v.to(model.device()) for k, v in batch.items()} @@ -152,10 +154,14 @@ def __call__( tokens = [[] for _ in range(len(texts))] texts = [[] for _ in range(len(texts))] log_likelihoods = [[] for _ in range(len(texts))] + token_distributions = [[] for _ in range(len(texts))] + + if model.model_type == "Seq2SeqLM": sequences = [seq[1:] for seq in sequences] + for i in range(len(logits)): - log_prob, ll, toks = 0, [], [] + log_prob, ll, toks, distributions = 0, [], [], [] inp_size = ( len(batch["input_ids"][int(i / self.samples_n)]) if model.model_type == "CausalLM" @@ -168,15 +174,18 @@ def __call__( break ll.append(logits[i][j][cur_token].item()) toks.append(cur_token) + distributions.append(logits[i][j].softmax(dim=-1).cpu().numpy()) log_likelihoods[int(i / self.samples_n)].append(ll) log_probs[int(i / self.samples_n)].append(log_prob) tokens[int(i / self.samples_n)].append(toks) texts[int(i / self.samples_n)].append(model.tokenizer.decode(toks)) + token_distributions[int(i / self.samples_n)].append(distributions) return { "sample_log_likelihoods": log_likelihoods, "sample_log_probs": log_probs, "sample_tokens": tokens, "sample_texts": texts, + "token_distributions": token_distributions, } From 3504d82cfce7f5d3be94c4f2bb7f4dfe14742528 Mon Sep 17 00:00:00 2001 From: silvimica Date: Mon, 18 Nov 2024 14:38:21 +0400 Subject: [PATCH 19/97] Small fix to sample entropy --- src/lm_polygraph/stat_calculators/entropy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py index dbc49a4c8..eaa07dee1 100644 --- a/src/lm_polygraph/stat_calculators/entropy.py +++ b/src/lm_polygraph/stat_calculators/entropy.py @@ -66,9 +66,8 @@ def __call__( token_dist_tensor = torch.tensor(np.array(token_dist)) # Calculate entropy using torch's Categorical distribution - # Apply mean() in case the entropy returns a 
multi-element tensor - entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy().mean() - sample_entropies.append(entropy.item()) # Convert to a scalar value if needed + entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy() + sample_entropies.append(entropy.item()) # Calculate mean entropy for the sample mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0 From 877a3dddf7d9ad8f81673c63a0d85fcf784ad535 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 19 Nov 2024 16:26:19 +0400 Subject: [PATCH 20/97] Save some additional stats for samples --- examples/configs/polygraph_eval_xsum_sentsar.yaml | 5 +++++ scripts/polygraph_eval | 1 + 2 files changed, 6 insertions(+) diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index 3af6b12f3..c78c143ba 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -25,6 +25,11 @@ trust_remote_code: true generation_params: generate_until: - "\n" +save_stats: + - sample_tokens + - sample_texts + - sample_log_probs + - sample_sentence_similarity train_dataset: null train_test_split: false diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 239dc921f..9fac71216 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -201,6 +201,7 @@ def main(args): ensemble_model=ensemble_model, cache_path=args.cache_path, language=getattr(args, 'language', 'en'), + save_stats=getattr(args, 'save_stats', []), ) man() From 9906efc559adc0d007acb7fdaf12f93229533d99 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 19 Nov 2024 16:35:15 +0400 Subject: [PATCH 21/97] Use MTE sar --- .../configs/polygraph_eval_xsum_sentsar.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index c78c143ba..1203c1408 
100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -85,6 +85,9 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: PPLSentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} @@ -143,6 +146,19 @@ additional_estimators: kwargs: use_log: false reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: true ignore_exceptions: false From 7c89d1141b20de727efd138ecb7447d25dd50570 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 21 Nov 2024 08:35:36 +0400 Subject: [PATCH 22/97] Add batch iteration in MTE sar --- src/lm_polygraph/stat_calculators/entropy.py | 23 ++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py index eaa07dee1..44e592316 100644 --- a/src/lm_polygraph/stat_calculators/entropy.py +++ b/src/lm_polygraph/stat_calculators/entropy.py @@ -56,21 +56,22 @@ def __call__( max_new_tokens: int = 100, **kwargs, ) -> Dict[str, np.ndarray]: - token_distributions = dependencies["token_distributions"] + batch_distributions = dependencies["token_distributions"] entropies = [] + + for input_distributions in batch_distributions: + for sample_distributions in input_distributions: + sample_entropies = [] + for token_dist in sample_distributions: + # Convert token_dist to a numpy array first, then to a torch tensor + token_dist_tensor = torch.tensor(np.array(token_dist)) - for sample_distributions in token_distributions: - 
sample_entropies = [] - for token_dist in sample_distributions: - # Convert token_dist to a numpy array first, then to a torch tensor - token_dist_tensor = torch.tensor(np.array(token_dist)) - - # Calculate entropy using torch's Categorical distribution - entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy() - sample_entropies.append(entropy.item()) + # Calculate entropy using torch's Categorical distribution + entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy() + sample_entropies.append(entropy.item()) # Calculate mean entropy for the sample mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0 entropies.append(mean_entropy.item()) - return {"sample_entropy": entropies} \ No newline at end of file + return {"sample_entropy": entropies} From 4fcd0301ebabaa709bce46e82463a1c2a836db24 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 21 Nov 2024 17:52:12 +0400 Subject: [PATCH 23/97] Make xsum work --- scripts/polygraph_eval | 2 +- src/lm_polygraph/utils/manager.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 9fac71216..7a61f524e 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -89,7 +89,7 @@ def main(args): instruct=getattr(args, "instruct", None), split=args.eval_split, load_from_disk=args.load_from_disk, - #trust_remote_code=getattr(args, "trust_remote_code", False), + trust_remote_code=getattr(args, "trust_remote_code", False), **cache_kwargs ) log.info("Done with loading eval data.") diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 8e903c731..88d8494cc 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -478,7 +478,11 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: for key in self.save_stats: if key in batch_stats.keys(): - self.stats[key] += batch_stats[key] + try: + self.stats[key] += 
list(batch_stats[key]) + except: + breakpoint() + pass for processor in self.processors: processor.on_batch(batch_stats, batch_gen_metrics, batch_estimations) From 65c91f0c034f21d40c92cfe43fe9e89eb378043c Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 22 Nov 2024 19:58:03 +0400 Subject: [PATCH 24/97] Prevent generation of newlines only --- src/lm_polygraph/utils/model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index 4ab587f27..de4994b0c 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -355,6 +355,10 @@ def __init__( def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :] + + # Do not stop generation if stop sequence is the first thing generated + if lookback_ids_batch.shape[1] < 2: + return False lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :] From 2007f63f44fd8149cded1a76c91abd4c36461958 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 22 Nov 2024 20:01:38 +0400 Subject: [PATCH 25/97] Consistent method set --- .../polygraph_eval_triviaqa_sentsar.yaml | 51 ++++++++----------- .../polygraph_eval_wmt14_fren_sentsar.yaml | 19 +++++++ .../polygraph_eval_wmt19_deen_sentsar.yaml | 19 +++++++ .../configs/polygraph_eval_xsum_sentsar.yaml | 3 ++ 4 files changed, 63 insertions(+), 29 deletions(-) diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 1a9179268..8a9fbf367 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -58,6 +58,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: 
MeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy class_name: MonteCarloSequenceEntropy kwargs: {} @@ -83,6 +86,9 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: PPLSentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} @@ -107,27 +113,14 @@ additional_estimators: kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilSentenceSAR - kwargs: - use_log: false - reverse: true - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR - kwargs: + kwargs: use_log: false reverse: false - module: lm_polygraph.estimators.sentence_sar - class_name: DistilOneSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilOneSentenceSAR - kwargs: + class_name: DistilSentenceSAR + kwargs: use_log: false reverse: true - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilOneSentenceSAR - kwargs: - use_log: false - reverse: false - module: lm_polygraph.estimators.sentence_sar class_name: DistilSAR kwargs: {} @@ -154,19 +147,19 @@ additional_estimators: kwargs: use_log: false reverse: true - # - module: lm_polygraph.estimators.sentence_sar - # class_name: DistilMTESAR - # kwargs: {} - # - module: lm_polygraph.estimators.sentence_sar - # class_name: DistilMTESAR - # kwargs: - # use_log: false - # reverse: false - # - module: lm_polygraph.estimators.sentence_sar - # class_name: DistilMTESAR - # kwargs: - # use_log: false - # reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: true 
ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index b0635ce2a..c8684afc4 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -57,6 +57,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy class_name: MonteCarloSequenceEntropy kwargs: {} @@ -82,6 +85,9 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: PPLSentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} @@ -140,6 +146,19 @@ additional_estimators: kwargs: use_log: false reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 0c0f0e730..0210310b4 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -56,6 +56,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy class_name: MonteCarloSequenceEntropy kwargs: {} @@ -81,6 +84,9 @@ 
additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: PPLSentenceSAR kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: OtherSentenceSAR kwargs: {} @@ -139,6 +145,19 @@ additional_estimators: kwargs: use_log: false reverse: true + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.sentence_sar + class_name: DistilMTESAR + kwargs: + use_log: false + reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index 1203c1408..ed8d86730 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -60,6 +60,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy class_name: MonteCarloSequenceEntropy kwargs: {} From 303d5792ea8f696d4ef1e1ecbf53062ea4f53ac1 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sat, 23 Nov 2024 12:15:17 +0400 Subject: [PATCH 26/97] Fix lookback procedure --- src/lm_polygraph/utils/model.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index de4994b0c..c38fb118b 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -356,17 +356,14 @@ def __call__(self, input_ids, scores, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence lookback_ids_batch = input_ids[:, 
self.initial_decoder_input_length :] - # Do not stop generation if stop sequence is the first thing generated - if lookback_ids_batch.shape[1] < 2: - return False - lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) for i, done in enumerate(self.done_tracker): if not done: - self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] + # Stop generation if the stop sequence is in the lookback tokens but doesn't start with stop sequence + self.done_tracker[i] = (self.sequence in lookback_tokens_batch[i] and not lookback_tokens_batch[i][: len(self.sequence)] == self.sequence) return False not in self.done_tracker def get_stopping_criteria(self, input_ids: torch.Tensor): From 0978d2a883db95a4b59a1e6a064dee7093db1325 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sat, 23 Nov 2024 12:40:22 +0400 Subject: [PATCH 27/97] One more stopping criterion fix --- src/lm_polygraph/utils/model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index c38fb118b..f22901919 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -362,8 +362,11 @@ def __call__(self, input_ids, scores, **kwargs) -> bool: for i, done in enumerate(self.done_tracker): if not done: - # Stop generation if the stop sequence is in the lookback tokens but doesn't start with stop sequence - self.done_tracker[i] = (self.sequence in lookback_tokens_batch[i] and not lookback_tokens_batch[i][: len(self.sequence)] == self.sequence) + lookback_tokens_batch_i = lookback_tokens_batch[i] + # Remove stop sequence from the begginning of the lookback tokens if it is there + if len(lookback_tokens_batch_i) >= len(self.sequence) and lookback_tokens_batch_i[: len(self.sequence)] == self.sequence: + lookback_tokens_batch_i = lookback_tokens_batch_i[len(self.sequence) :] + self.done_tracker[i] = self.sequence in 
lookback_tokens_batch_i return False not in self.done_tracker def get_stopping_criteria(self, input_ids: torch.Tensor): From c8d06dd224cb3456e6aa06cc93e648f7aa3685bd Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 25 Nov 2024 13:26:49 +0400 Subject: [PATCH 28/97] Use stop string criteria from transformers to stop generation early --- src/lm_polygraph/utils/model.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index f22901919..7469b19b0 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -17,6 +17,7 @@ BartForConditionalGeneration, StoppingCriteria, StoppingCriteriaList, + StopStringCriteria, PreTrainedTokenizer, ) @@ -375,10 +376,11 @@ def get_stopping_criteria(self, input_ids: torch.Tensor): return StoppingCriteriaList( [ *[ - self._MultiTokenEOSCriteria( - sequence, self.tokenizer, input_ids.shape[1], input_ids.shape[0] - ) - for sequence in stop_sequences + #self._MultiTokenEOSCriteria( + # sequence, self.tokenizer, input_ids.shape[1], input_ids.shape[0] + #) + #for sequence in stop_sequences + StopStringCriteria(self.tokenizer, stop_sequences) ], ] ) From 49fbbc2582df1ddfc94283521b2bd6b64bf4d5ba Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 25 Nov 2024 13:50:24 +0400 Subject: [PATCH 29/97] Rollback --- src/lm_polygraph/utils/model.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index 7469b19b0..f22901919 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -17,7 +17,6 @@ BartForConditionalGeneration, StoppingCriteria, StoppingCriteriaList, - StopStringCriteria, PreTrainedTokenizer, ) @@ -376,11 +375,10 @@ def get_stopping_criteria(self, input_ids: torch.Tensor): return StoppingCriteriaList( [ *[ - #self._MultiTokenEOSCriteria( - # sequence, self.tokenizer, input_ids.shape[1], 
input_ids.shape[0] - #) - #for sequence in stop_sequences - StopStringCriteria(self.tokenizer, stop_sequences) + self._MultiTokenEOSCriteria( + sequence, self.tokenizer, input_ids.shape[1], input_ids.shape[0] + ) + for sequence in stop_sequences ], ] ) From 6f41f7efd004f69517979aa5d62dfd5785430ded Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 25 Nov 2024 15:30:46 +0400 Subject: [PATCH 30/97] Top K entropy --- .../polygraph_eval_triviaqa_sentsar.yaml | 82 ++++++++----------- .../polygraph_eval_wmt14_fren_sentsar.yaml | 82 ++++++++----------- .../polygraph_eval_wmt19_deen_sentsar.yaml | 82 ++++++++----------- .../configs/polygraph_eval_xsum_sentsar.yaml | 77 +++++++---------- scripts/polygraph_eval | 1 + src/lm_polygraph/stat_calculators/entropy.py | 34 ++++++-- src/lm_polygraph/stat_calculators/sample.py | 2 +- src/lm_polygraph/utils/manager.py | 4 +- .../utils/register_stat_calculators.py | 5 +- 9 files changed, 169 insertions(+), 200 deletions(-) diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 8a9fbf367..dcc0ffb30 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -26,6 +26,12 @@ normalize: true generation_params: generate_until: - "\n" +save_stats: + - sample_tokens + - sample_texts + - sample_log_probs + - sample_sentence_similarity +entropy_top_k: 50 train_dataset: null train_test_split: false @@ -49,65 +55,22 @@ generation_metrics: null ens_type: additional_estimators: - - module: lm_polygraph.estimators.max_probability - class_name: MaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.perplexity - class_name: Perplexity - kwargs: {} - - module: lm_polygraph.estimators.token_sar - class_name: TokenSAR - kwargs: {} - - module: lm_polygraph.estimators.token_entropy - class_name: MeanTokenEntropy - kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy 
class_name: MonteCarloSequenceEntropy kwargs: {} - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy class_name: MonteCarloNormalizedSequenceEntropy kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: SentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: ReweightedSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR + class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - reverse: true - module: lm_polygraph.estimators.sentence_sar class_name: DistilSentenceSAR kwargs: {} @@ -121,6 +84,17 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 - module: lm_polygraph.estimators.sentence_sar class_name: DistilSAR kwargs: {} @@ -134,6 +108,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + 
- module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilPPLSAR kwargs: {} @@ -147,6 +128,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilMTESAR kwargs: {} diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index c8684afc4..1c34e85e2 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -24,6 +24,12 @@ load_from_disk: false generation_params: generate_until: - "\n" +save_stats: + - sample_tokens + - sample_texts + - sample_log_probs + - sample_sentence_similarity +entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" @@ -48,65 +54,22 @@ use_tok_ue: false generation_metrics: null additional_estimators: - - module: lm_polygraph.estimators.max_probability - class_name: MaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.perplexity - class_name: Perplexity - kwargs: {} - - module: lm_polygraph.estimators.token_sar - class_name: TokenSAR - kwargs: {} - - module: lm_polygraph.estimators.token_entropy - class_name: MeanTokenEntropy - kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy class_name: MonteCarloSequenceEntropy kwargs: {} - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy class_name: MonteCarloNormalizedSequenceEntropy kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy 
kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: SentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: ReweightedSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR + class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - reverse: true - module: lm_polygraph.estimators.sentence_sar class_name: DistilSentenceSAR kwargs: {} @@ -120,6 +83,17 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 - module: lm_polygraph.estimators.sentence_sar class_name: DistilSAR kwargs: {} @@ -133,6 +107,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilPPLSAR kwargs: {} @@ -146,6 +127,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + 
class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilMTESAR kwargs: {} diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 0210310b4..f6c4b1ada 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -24,6 +24,12 @@ load_from_disk: false generation_params: generate_until: - "\n" +save_stats: + - sample_tokens + - sample_texts + - sample_log_probs + - sample_sentence_similarity +entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" @@ -47,65 +53,22 @@ use_seq_ue: false use_tok_ue: false additional_estimators: - - module: lm_polygraph.estimators.max_probability - class_name: MaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.perplexity - class_name: Perplexity - kwargs: {} - - module: lm_polygraph.estimators.token_sar - class_name: TokenSAR - kwargs: {} - - module: lm_polygraph.estimators.token_entropy - class_name: MeanTokenEntropy - kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy class_name: MonteCarloSequenceEntropy kwargs: {} - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy class_name: MonteCarloNormalizedSequenceEntropy kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: SentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: ReweightedSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + + - module: 
lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR + class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - reverse: true - module: lm_polygraph.estimators.sentence_sar class_name: DistilSentenceSAR kwargs: {} @@ -119,6 +82,17 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 - module: lm_polygraph.estimators.sentence_sar class_name: DistilSAR kwargs: {} @@ -132,6 +106,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilPPLSAR kwargs: {} @@ -145,6 +126,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilMTESAR kwargs: {} diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index ed8d86730..9dcac6213 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ 
-30,6 +30,7 @@ save_stats: - sample_texts - sample_log_probs - sample_sentence_similarity +entropy_top_k: 50 train_dataset: null train_test_split: false @@ -51,65 +52,22 @@ use_tok_ue: false use_ens_ue: false additional_estimators: - - module: lm_polygraph.estimators.max_probability - class_name: MaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.perplexity - class_name: Perplexity - kwargs: {} - - module: lm_polygraph.estimators.token_sar - class_name: TokenSAR - kwargs: {} - - module: lm_polygraph.estimators.token_entropy - class_name: MeanTokenEntropy - kwargs: {} - module: lm_polygraph.estimators.monte_carlo_sequence_entropy class_name: MonteCarloSequenceEntropy kwargs: {} - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy class_name: MonteCarloNormalizedSequenceEntropy kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - module: lm_polygraph.estimators.semantic_entropy class_name: SemanticEntropy kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: SentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: ReweightedSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR + class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: OtherSentenceSAR - kwargs: - t: 1 - use_log: false - 
reverse: true - module: lm_polygraph.estimators.sentence_sar class_name: DistilSentenceSAR kwargs: {} @@ -123,6 +81,17 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: + t: 1 - module: lm_polygraph.estimators.sentence_sar class_name: DistilSAR kwargs: {} @@ -136,6 +105,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilPPLSAR kwargs: {} @@ -149,6 +125,13 @@ additional_estimators: kwargs: use_log: false reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESentenceSAR + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: DistilMTESAR kwargs: {} diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 7a61f524e..ccf079f9e 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -202,6 +202,7 @@ def main(args): cache_path=args.cache_path, language=getattr(args, 'language', 'en'), save_stats=getattr(args, 'save_stats', []), + entropy_top_k=getattr(args, 'entropy_top_k', None), ) man() diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py index 44e592316..0a4409797 100644 --- a/src/lm_polygraph/stat_calculators/entropy.py +++ b/src/lm_polygraph/stat_calculators/entropy.py @@ -12,7 +12,11 @@ class EntropyCalculator(StatCalculator): Calculates entropy of probabilities at each token position in the generation of a Whitebox model. 
""" - def __init__(self): + def __init__( + self, + top_k: int = None, + ): + self.top_k = top_k super().__init__(["entropy"], ["greedy_log_probs"]) def __call__( @@ -40,12 +44,23 @@ def __call__( for s_lp in logprobs: entropies.append([]) for lp in s_lp: - mask = ~np.isinf(lp) - entropies[-1].append(-np.sum(np.array(lp[mask]) * np.exp(lp[mask]))) + lp = torch.tensor(lp) + if self.top_k is not None: + lp = torch.topk(lp, self.top_k).values + #mask = ~np.isinf(lp) + #lp = lp[mask] + #if self.top_k is not None: + # lp = np.sort(lp)[-self.top_k:] + #entropies[-1].append(-np.sum(np.array(lp) * np.exp(lp))) + entropies[-1].append(torch.distributions.Categorical(logits=lp).entropy().item()) return {"entropy": entropies} class SampleEntropyCalculator(StatCalculator): - def __init__(self): + def __init__( + self, + top_k: int = None, + ): + self.top_k = top_k super().__init__(["sample_entropy"], ["token_distributions"]) def __call__( @@ -58,20 +73,23 @@ def __call__( ) -> Dict[str, np.ndarray]: batch_distributions = dependencies["token_distributions"] entropies = [] - + for input_distributions in batch_distributions: for sample_distributions in input_distributions: sample_entropies = [] for token_dist in sample_distributions: # Convert token_dist to a numpy array first, then to a torch tensor - token_dist_tensor = torch.tensor(np.array(token_dist)) + token_dist_tensor = torch.tensor(token_dist) + + if self.top_k is not None: + token_dist_tensor = torch.topk(token_dist_tensor, self.top_k).values # Calculate entropy using torch's Categorical distribution - entropy = torch.distributions.Categorical(probs=token_dist_tensor).entropy() + entropy = torch.distributions.Categorical(logits=token_dist_tensor).entropy() sample_entropies.append(entropy.item()) # Calculate mean entropy for the sample mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0 entropies.append(mean_entropy.item()) - + return {"sample_entropy": entropies} diff --git 
a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index 6f1fb2236..a5f9dc3d2 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -174,7 +174,7 @@ def __call__( break ll.append(logits[i][j][cur_token].item()) toks.append(cur_token) - distributions.append(logits[i][j].softmax(dim=-1).cpu().numpy()) + distributions.append(logits[i][j].cpu().numpy()) log_likelihoods[int(i / self.samples_n)].append(ll) log_probs[int(i / self.samples_n)].append(log_prob) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 88d8494cc..f62da1cba 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -256,7 +256,8 @@ def __init__( max_new_tokens: int = 100, background_train_dataset_max_new_tokens: int = 100, cache_path=os.path.expanduser("~") + "/.cache", - save_stats: List[str] = [] + save_stats: List[str] = [], + entropy_top_k: Optional[int] = None, ): """ Parameters: @@ -286,6 +287,7 @@ def __init__( language=language, cache_path=cache_path, model=model, + entropy_top_k=entropy_top_k, ) self.stat_calculators_dict = stat_calculators_dict diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index 2a46b5740..b827b7702 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -18,6 +18,7 @@ def register_stat_calculators( n_ccp_alternatives: int = 10, cache_path=os.path.expanduser("~") + "/.cache", model: Model = None, + entropy_top_k: Optional[int] = None, ) -> Tuple[Dict[str, "StatCalculator"], Dict[str, List[str]]]: """ Registers all available statistic calculators to be seen by UEManager for properly organizing the calculations @@ -62,8 +63,8 @@ def _register(calculator_class: StatCalculator): _register(BlackboxSamplingGenerationCalculator()) else: 
_register(GreedyProbsCalculator(n_alternatives=n_ccp_alternatives)) - _register(EntropyCalculator()) - _register(SampleEntropyCalculator()) + _register(EntropyCalculator(top_k=entropy_top_k)) + _register(SampleEntropyCalculator(top_k=entropy_top_k)) _register(GreedyLMProbsCalculator()) _register(SamplingGenerationCalculator()) _register(BartScoreCalculator()) From bc4558387aa7fd4cd682061835cc0ab6246b8e8c Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 25 Nov 2024 16:40:31 +0400 Subject: [PATCH 31/97] Rename stuff --- .../configs/polygraph_eval_xsum_sentsar.yaml | 56 +- src/lm_polygraph/estimators/__init__.py | 15 +- src/lm_polygraph/estimators/gsu.py | 259 +++++++ src/lm_polygraph/estimators/sentence_sar.py | 653 +++++------------- 4 files changed, 480 insertions(+), 503 deletions(-) create mode 100644 src/lm_polygraph/estimators/gsu.py diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index 9dcac6213..b157ac671 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -68,16 +68,16 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: true @@ -88,20 +88,16 @@ additional_estimators: - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar 
- class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: true @@ -110,18 +106,18 @@ additional_estimators: class_name: Perplexity kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR + class_name: PPLSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: true @@ -130,18 +126,18 @@ additional_estimators: class_name: MeanTokenEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + class_name: MTESAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: use_log: false reverse: true diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 2eb69d448..1da22c401 100644 --- 
a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -64,17 +64,14 @@ from .token_sar import TokenSAR from .sentence_sar import ( SentenceSAR, - OtherSentenceSAR, - ReweightedSentenceSAR, - PPLSentenceSAR, - MTESentenceSAR, - DistilSentenceSAR, - DistilOneSentenceSAR, - DistilSAR, - DistilPPLSAR, - DistilMTESAR, +# OtherSentenceSAR, +# ReweightedSentenceSAR, + PPLSAR, + MTESAR, + #DistilOneSentenceSAR, ) from .sar import SAR +from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU from .renyi_neg import RenyiNeg from .fisher_rao import FisherRao from .verbalized_1s import Verbalized1S diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py new file mode 100644 index 000000000..8b192c699 --- /dev/null +++ b/src/lm_polygraph/estimators/gsu.py @@ -0,0 +1,259 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator + + +class MaxprobGSU(Estimator): + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "MaxprobGSU" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. 
+ """ + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + GSU = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = np.exp(np.array(sample_log_probs)) + R_s = ( + sample_probs + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if self.use_log: + E_s = -np.log(sent_relevance) + else: + E_s = -sent_relevance if self.reverse else sent_relevance + + GSU.append(E_s.mean()) + + return np.array(GSU) + + +class PPLGSU(Estimator): + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "PPLGSU" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. 
+ """ + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + GSU = [] + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + + R_s = ( + ppl + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if self.use_log: + E_s = -np.log(sent_relevance) + else: + E_s = -sent_relevance if self.reverse else sent_relevance + + GSU.append(E_s.mean()) + + return np.array(GSU) + + +class TokenSARGSU(Estimator): + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__( + [ + "sample_sentence_similarity", + "sample_log_likelihoods", + "sample_token_similarity", + ], + "sequence", + ) + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "TokenSARGSU" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the SAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * log p(y_i | y_ np.ndarray: + """ + Estimates the sentenceSAR for each sample using Mean Token Entropy (MTE). + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * 'sample_entropy': Mean Token Entropy for each sample, + * 'sample_sentence_similarity': matrix with cross-encoder similarities. + + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. 
+ """ + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + GSU = [] + # Loop over each sample's Mean Token Entropy and sentence similarities + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + # Use MTE for sentence relevance calculation + R_s = sample_entropy * sample_sentence_similarity + + # Compute sentence relevance by summing along the last axis + sent_relevance = R_s.sum(-1) + + # Calculate E_s with options for log transformation and reversal + if self.use_log: + E_s = -np.log(sent_relevance) + else: + E_s = -sent_relevance if self.reverse else sent_relevance + + GSU.append(E_s.mean()) + + return np.array(GSU) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index fede78ded..c2ff21395 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -55,126 +55,127 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(sentenceSAR) -class OtherSentenceSAR(Estimator): - """ - Like SAR, but only looks at other samples for each sample in the output. - """ - - def __init__( - self, - verbose: bool = False, - t: float = 0.001, - use_log: bool = True, - reverse: bool = False - ): - super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") - self.verbose = verbose - self.t = t - self.use_log = use_log - self.reverse = reverse - - def __str__(self): - base = f"OtherSentenceSAR_t{self.t}" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the sentenceSAR for each sample in the input statistics. 
- - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * corresponding log probabilities in 'sample_log_probs', - * matrix with cross-encoder similarities in 'sample_sentence_similarity' - Returns: - np.ndarray: float sentenceSAR for each sample in input statistics. - Higher values indicate more uncertain samples. - """ - batch_sample_log_probs = stats["sample_log_probs"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - - sentenceSAR = [] - for sample_log_probs, sample_sentence_similarity in zip( - batch_sample_log_probs, batch_sample_sentence_similarity - ): - sample_probs = np.exp(np.array(sample_log_probs)) - R_s = ( - sample_probs - * sample_sentence_similarity - * (1 - np.eye(sample_sentence_similarity.shape[0])) - ) - sent_relevance = R_s.sum(-1) / self.t - - if self.use_log: - E_s = -np.log(sent_relevance) - else: - if self.reverse: - E_s = sent_relevance - else: - E_s = -sent_relevance - - sentenceSAR.append(E_s.mean()) - - return np.array(sentenceSAR) - - -class ReweightedSentenceSAR(Estimator): - """ - Like SAR, but normalizes similarity-based scores at each iteration - alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k)) - K - number of samples in output minus one - """ - def __init__(self, verbose: bool = False): - super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") - self.verbose = verbose - self.t = 0.001 - - def __str__(self): - return "ReweightedSentenceSAR" - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_sample_log_probs = stats["sample_log_probs"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - - sentenceSAR = [] - - for sample_log_probs, sample_sentence_similarity in zip( - batch_sample_log_probs, batch_sample_sentence_similarity - ): - # Compute probabilities from log probabilities - sample_probs = np.exp(np.array(sample_log_probs)) - - # Initialize alpha_ij (reweighted sentence 
similarities) - alpha_ij = np.zeros_like(sample_sentence_similarity) - - # Normalize similarity-based scores at each iteration - for i in range(sample_sentence_similarity.shape[0]): - similarity_row = sample_sentence_similarity[i] - # Exclude self-similarity g(s_i, s_i) - similarity_row_without_self = similarity_row * (1 - np.eye(len(similarity_row)))[i] - sum_similarity = np.sum(similarity_row_without_self) - - if sum_similarity > 0: - alpha_ij[i] = similarity_row_without_self / sum_similarity - else: - alpha_ij[i] = similarity_row_without_self # If the normalization factor is 0, leave the row unchanged - - # Compute sentence relevance using normalized alpha_ij - R_s = sample_probs * alpha_ij - sent_relevance = R_s.sum(-1) / self.t - - # Compute SentenceSAR (Uncertainty Estimation) - E_s = -np.log(sent_relevance + sample_probs) - sentenceSAR.append(E_s.mean()) - - return np.array(sentenceSAR) - - - -class PPLSentenceSAR(Estimator): +#class OtherSentenceSAR(Estimator): +# """ +# Like SAR, but only looks at other samples for each sample in the output. +# """ +# +# def __init__( +# self, +# verbose: bool = False, +# t: float = 0.001, +# use_log: bool = True, +# reverse: bool = False +# ): +# super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") +# self.verbose = verbose +# self.t = t +# self.use_log = use_log +# self.reverse = reverse +# +# def __str__(self): +# base = f"OtherSentenceSAR_t{self.t}" +# if not self.use_log: +# base += "_no_log" +# if self.reverse: +# base += "_reverse" +# return base +# +# def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: +# """ +# Estimates the sentenceSAR for each sample in the input statistics. 
+# +# Parameters: +# stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: +# * corresponding log probabilities in 'sample_log_probs', + +# * matrix with cross-encoder similarities in 'sample_sentence_similarity' +# Returns: +# np.ndarray: float sentenceSAR for each sample in input statistics. +# Higher values indicate more uncertain samples. +# """ +# batch_sample_log_probs = stats["sample_log_probs"] +# batch_sample_sentence_similarity = stats["sample_sentence_similarity"] +# +# sentenceSAR = [] +# for sample_log_probs, sample_sentence_similarity in zip( +# batch_sample_log_probs, batch_sample_sentence_similarity +# ): +# sample_probs = np.exp(np.array(sample_log_probs)) +# R_s = ( +# sample_probs +# * sample_sentence_similarity +# * (1 - np.eye(sample_sentence_similarity.shape[0])) +# ) +# sent_relevance = R_s.sum(-1) / self.t +# +# if self.use_log: +# E_s = -np.log(sent_relevance) +# else: +# if self.reverse: +# E_s = sent_relevance +# else: +# E_s = -sent_relevance +# +# sentenceSAR.append(E_s.mean()) +# +# return np.array(sentenceSAR) +# +# +#class ReweightedSentenceSAR(Estimator): +# """ +# Like SAR, but normalizes similarity-based scores at each iteration +# alpha_ij = g(s_i, s_j) / (\sum_k^(K - 1) g(s_i, s_k)) +# K - number of samples in output minus one +# """ +# def __init__(self, verbose: bool = False): +# super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") +# self.verbose = verbose +# self.t = 0.001 +# +# def __str__(self): +# return "ReweightedSentenceSAR" +# +# def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: +# batch_sample_log_probs = stats["sample_log_probs"] +# batch_sample_sentence_similarity = stats["sample_sentence_similarity"] +# +# sentenceSAR = [] +# +# for sample_log_probs, sample_sentence_similarity in zip( +# batch_sample_log_probs, batch_sample_sentence_similarity +# ): +# # Compute probabilities from log probabilities +# sample_probs = 
np.exp(np.array(sample_log_probs)) +# +# # Initialize alpha_ij (reweighted sentence similarities) +# alpha_ij = np.zeros_like(sample_sentence_similarity) +# +# # Normalize similarity-based scores at each iteration +# for i in range(sample_sentence_similarity.shape[0]): +# similarity_row = sample_sentence_similarity[i] +# # Exclude self-similarity g(s_i, s_i) +# similarity_row_without_self = similarity_row * (1 - np.eye(len(similarity_row)))[i] +# sum_similarity = np.sum(similarity_row_without_self) +# +# if sum_similarity > 0: +# alpha_ij[i] = similarity_row_without_self / sum_similarity +# else: +# alpha_ij[i] = similarity_row_without_self # If the normalization factor is 0, leave the row unchanged +# +# # Compute sentence relevance using normalized alpha_ij +# R_s = sample_probs * alpha_ij +# sent_relevance = R_s.sum(-1) / self.t +# +# # Compute SentenceSAR (Uncertainty Estimation) +# E_s = -np.log(sent_relevance + sample_probs) +# sentenceSAR.append(E_s.mean()) +# +# return np.array(sentenceSAR) + + + +class PPLSAR(Estimator): """ Like SAR, but uses log probs normalized by sample length in tokens to calculate PPL (Perplexity). Tokenwise log-likelihoods are available in stats['sample_log_likelihoods']. @@ -185,7 +186,7 @@ def __init__(self, verbose: bool = False): self.t = 0.001 def __str__(self): - return "PPLSentenceSAR" + return "PPLSAR" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -226,282 +227,72 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(sentenceSAR) - -class DistilSentenceSAR(Estimator): - """ - Like SAR, but only looks at other samples for each sample in the output. 
- """ - - def __init__( - self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False - ): - super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") - self.verbose = verbose - self.use_log = use_log - self.reverse = reverse - - def __str__(self): - base = "DistilSentenceSAR" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the sentenceSAR for each sample in the input statistics. - - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * corresponding log probabilities in 'sample_log_probs', - * matrix with cross-encoder similarities in 'sample_sentence_similarity' - Returns: - np.ndarray: float sentenceSAR for each sample in input statistics. - Higher values indicate more uncertain samples. - """ - batch_sample_log_probs = stats["sample_log_probs"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - - sentenceSAR = [] - for sample_log_probs, sample_sentence_similarity in zip( - batch_sample_log_probs, batch_sample_sentence_similarity - ): - sample_probs = np.exp(np.array(sample_log_probs)) - R_s = ( - sample_probs - * sample_sentence_similarity - ) - sent_relevance = R_s.sum(-1) - - if self.use_log: - E_s = -np.log(sent_relevance) - else: - if self.reverse: - E_s = sent_relevance - else: - E_s = -sent_relevance - - sentenceSAR.append(E_s.mean()) - - return np.array(sentenceSAR) - - -class DistilOneSentenceSAR(Estimator): - """ - Like SAR, but only looks at other samples for each sample in the output. 
- """ - - def __init__( - self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False - ): - super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") - self.verbose = verbose - self.use_log = use_log - self.reverse = reverse - - def __str__(self): - base = f"DistilOneSentenceSAR" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the sentenceSAR for each sample in the input statistics. - - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * corresponding log probabilities in 'sample_log_probs', - * matrix with cross-encoder similarities in 'sample_sentence_similarity' - Returns: - np.ndarray: float sentenceSAR for each sample in input statistics. - Higher values indicate more uncertain samples. - """ - batch_sample_log_probs = stats["sample_log_probs"] - batch_sample_sentence_similarity = deepcopy(stats["sample_sentence_similarity"]) - - sentenceSAR = [] - for sample_log_probs, sample_sentence_similarity in zip( - batch_sample_log_probs, batch_sample_sentence_similarity - ): - sample_probs = np.exp(np.array(sample_log_probs)) - np.fill_diagonal(sample_sentence_similarity, 1) - - R_s = ( - sample_probs - * sample_sentence_similarity - ) - sent_relevance = R_s.sum(-1) - - if self.use_log: - E_s = -np.log(sent_relevance) - else: - if self.reverse: - E_s = sent_relevance - else: - E_s = -sent_relevance - - SAR.append(E_s.mean()) - - return np.array(SAR) - - -class DistilSAR(Estimator): - """ - Like SAR, but only looks at other samples for each sample in the output. 
- """ - - def __init__( - self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False - ): - super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") - self.verbose = verbose - self.use_log = use_log - self.reverse = reverse - - def __str__(self): - base = "DistilSAR" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the sentenceSAR for each sample in the input statistics. - - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * corresponding log probabilities in 'sample_log_probs', - * matrix with cross-encoder similarities in 'sample_sentence_similarity' - Returns: - np.ndarray: float sentenceSAR for each sample in input statistics. - Higher values indicate more uncertain samples. - """ - batch_sample_log_likelihoods = stats["sample_log_likelihoods"] - batch_sample_token_similarity = stats["sample_token_similarity"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - - SAR = [] - for batch_data in zip( - batch_sample_log_likelihoods, - batch_sample_token_similarity, - batch_sample_sentence_similarity, - ): - sample_log_likelihoods = batch_data[0] - sample_token_similarity = batch_data[1] - sample_sentence_similarity = batch_data[2] - - tokenSAR = [] - for log_likelihoods, token_similarity in zip( - sample_log_likelihoods, sample_token_similarity - ): - log_likelihoods = np.array(log_likelihoods) - R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() - E_t = -log_likelihoods * R_t_norm - tokenSAR.append(E_t.sum()) - - tokenSAR = np.array(tokenSAR) - probs_token_sar = np.exp(-tokenSAR) - - R_s = ( - probs_token_sar - * sample_sentence_similarity - ) - sent_relevance = R_s.sum(-1) - - if self.use_log: - E_s = -np.log(sent_relevance) - else: - if self.reverse: - E_s = sent_relevance - else: - E_s = -sent_relevance - - 
SAR.append(E_s.mean()) - - return np.array(SAR) - - -class DistilPPLSAR(Estimator): - """ - Like SAR, but only looks at other samples for each sample in the output. - """ - - def __init__( - self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False - ): - super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") - self.verbose = verbose - self.use_log = use_log - self.reverse = reverse - - def __str__(self): - base = "DistilPPLSAR" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the sentenceSAR for each sample in the input statistics. - - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * corresponding log probabilities in 'sample_log_probs', - * matrix with cross-encoder similarities in 'sample_sentence_similarity' - Returns: - np.ndarray: float sentenceSAR for each sample in input statistics. - Higher values indicate more uncertain samples. - """ - batch_sample_log_likelihoods = stats["sample_log_likelihoods"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - - sentenceSAR = [] - for sample_log_likelihoods, sample_sentence_similarity in zip( - batch_sample_log_likelihoods, batch_sample_sentence_similarity - ): - ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) - - R_s = ( - ppl - * sample_sentence_similarity - ) - sent_relevance = R_s.sum(-1) - - if self.use_log: - E_s = -np.log(sent_relevance) - else: - if self.reverse: - E_s = -sent_relevance - else: - E_s = sent_relevance - - sentenceSAR.append(E_s.mean()) - - return np.array(sentenceSAR) - - -class MTESentenceSAR(Estimator): +#class DistilOneSentenceSAR(Estimator): +# """ +# Like SAR, but only looks at other samples for each sample in the output. 
+# """ +# +# def __init__( +# self, +# verbose: bool = False, +# use_log: bool = True, +# reverse: bool = False +# ): +# super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") +# self.verbose = verbose +# self.use_log = use_log +# self.reverse = reverse +# +# def __str__(self): +# base = f"DistilOneSentenceSAR" +# if not self.use_log: +# base += "_no_log" +# if self.reverse: +# base += "_reverse" +# return base +# +# def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: +# """ +# Estimates the sentenceSAR for each sample in the input statistics. +# +# Parameters: +# stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: +# * corresponding log probabilities in 'sample_log_probs', +# * matrix with cross-encoder similarities in 'sample_sentence_similarity' +# Returns: +# np.ndarray: float sentenceSAR for each sample in input statistics. +# Higher values indicate more uncertain samples. +# """ +# batch_sample_log_probs = stats["sample_log_probs"] +# batch_sample_sentence_similarity = deepcopy(stats["sample_sentence_similarity"]) +# +# sentenceSAR = [] +# for sample_log_probs, sample_sentence_similarity in zip( +# batch_sample_log_probs, batch_sample_sentence_similarity +# ): +# sample_probs = np.exp(np.array(sample_log_probs)) +# np.fill_diagonal(sample_sentence_similarity, 1) +# +# R_s = ( +# sample_probs +# * sample_sentence_similarity +# ) +# sent_relevance = R_s.sum(-1) +# +# if self.use_log: +# E_s = -np.log(sent_relevance) +# else: +# if self.reverse: +# E_s = sent_relevance +# else: +# E_s = -sent_relevance +# +# SAR.append(E_s.mean()) +# +# return np.array(SAR) + + +class MTESAR(Estimator): """ Like SAR, but uses sample entropy calculated from token-wise log probs for each sample. Tokenwise log-likelihoods are available in stats['sample_log_likelihoods']. 
@@ -512,7 +303,7 @@ def __init__(self, verbose: bool = False): self.t = 0.001 def __str__(self): - return "MTESentenceSAR" + return "MTESAR" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -551,69 +342,3 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) - - - - -class DistilMTESAR(Estimator): - """ - Like SAR, but uses Mean Token Entropy (MTE) calculated from token-wise log probs for each sample. - Token-wise log-likelihoods are available in stats['sample_entropy']. - """ - - def __init__( - self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False - ): - super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") - self.verbose = verbose - self.use_log = use_log - self.reverse = reverse - - def __str__(self): - base = "DistilMTESAR" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the sentenceSAR for each sample using Mean Token Entropy (MTE). - - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * 'sample_entropy': Mean Token Entropy for each sample, - * 'sample_sentence_similarity': matrix with cross-encoder similarities. - - Returns: - np.ndarray: float sentenceSAR for each sample in input statistics. - Higher values indicate more uncertain samples. 
- """ - batch_sample_entropy = stats["sample_entropy"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - - sentenceSAR = [] - - # Loop over each sample's Mean Token Entropy and sentence similarities - for sample_entropy, sample_sentence_similarity in zip( - batch_sample_entropy, batch_sample_sentence_similarity - ): - # Use MTE for sentence relevance calculation - R_s = sample_entropy * sample_sentence_similarity - - # Compute sentence relevance by summing along the last axis - sent_relevance = R_s.sum(-1) - - # Calculate E_s with options for log transformation and reversal - if self.use_log: - E_s = -np.log(sent_relevance) - else: - E_s = -sent_relevance if self.reverse else sent_relevance - - sentenceSAR.append(E_s.mean()) - - return np.array(sentenceSAR) From cfeff8232cd39c376acf99bd580ffffa77e20a1f Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 25 Nov 2024 16:42:42 +0400 Subject: [PATCH 32/97] Use renamed methods everywhere --- .../polygraph_eval_triviaqa_sentsar.yaml | 56 +++++++++---------- .../polygraph_eval_wmt14_fren_sentsar.yaml | 56 +++++++++---------- .../polygraph_eval_wmt19_deen_sentsar.yaml | 56 +++++++++---------- 3 files changed, 78 insertions(+), 90 deletions(-) diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index dcc0ffb30..74a06f08d 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -71,16 +71,16 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: false - - module: 
lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: true @@ -91,20 +91,16 @@ additional_estimators: - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: true @@ -113,18 +109,18 @@ additional_estimators: class_name: Perplexity kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR + class_name: PPLSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: true @@ -133,18 +129,18 @@ additional_estimators: class_name: MeanTokenEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + class_name: MTESAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + 
class_name: MTEGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: use_log: false reverse: true diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 1c34e85e2..7c2213716 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -70,16 +70,16 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: true @@ -90,20 +90,16 @@ additional_estimators: - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: true @@ -112,18 +108,18 @@ additional_estimators: class_name: Perplexity kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: 
PPLSentenceSAR + class_name: PPLSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: true @@ -132,18 +128,18 @@ additional_estimators: class_name: MeanTokenEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + class_name: MTESAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: use_log: false reverse: true diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index f6c4b1ada..f5d70927a 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -69,16 +69,16 @@ additional_estimators: - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: false - - module: 
lm_polygraph.estimators.sentence_sar - class_name: DistilSentenceSAR + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU kwargs: use_log: false reverse: true @@ -89,20 +89,16 @@ additional_estimators: - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: - t: 1 - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilSAR + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU kwargs: use_log: false reverse: true @@ -111,18 +107,18 @@ additional_estimators: class_name: Perplexity kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSentenceSAR + class_name: PPLSAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilPPLSAR + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU kwargs: use_log: false reverse: true @@ -131,18 +127,18 @@ additional_estimators: class_name: MeanTokenEntropy kwargs: {} - module: lm_polygraph.estimators.sentence_sar - class_name: MTESentenceSAR + class_name: MTESAR kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + 
class_name: MTEGSU kwargs: use_log: false reverse: false - - module: lm_polygraph.estimators.sentence_sar - class_name: DistilMTESAR + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU kwargs: use_log: false reverse: true From 14dfb287de80c508f4dc704d3101f0b6c85030b8 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 27 Nov 2024 19:39:24 +0400 Subject: [PATCH 33/97] Fix ccp, add consistency between sampling and greedy, and ccpgsu --- .../configs/polygraph_eval_xsum_sentsar.yaml | 17 +++ src/lm_polygraph/estimators/__init__.py | 2 +- .../claim_conditioned_probability.py | 25 +++- src/lm_polygraph/estimators/gsu.py | 82 ++++++++++++++ src/lm_polygraph/stat_calculators/__init__.py | 1 + src/lm_polygraph/stat_calculators/entropy.py | 4 +- .../stat_calculators/greedy_probs.py | 9 +- src/lm_polygraph/stat_calculators/sample.py | 33 ++++-- .../sample_alternatives_nli.py | 107 ++++++++++++++++++ src/lm_polygraph/utils/dataset.py | 6 + src/lm_polygraph/utils/manager.py | 6 +- .../utils/register_stat_calculators.py | 3 +- 12 files changed, 268 insertions(+), 27 deletions(-) create mode 100644 src/lm_polygraph/stat_calculators/sample_alternatives_nli.py diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index b157ac671..084015edb 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -142,6 +142,23 @@ additional_estimators: use_log: false reverse: true + - module: lm_polygraph.estimators.claim_conditioned_probability + class_name: ClaimConditionedProbability + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: CCPGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: CCPGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: CCPGSU + kwargs: + use_log: false + reverse: true + ignore_exceptions: false batch_size: 1 diff --git 
a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 1da22c401..ee51c6b77 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -71,7 +71,7 @@ #DistilOneSentenceSAR, ) from .sar import SAR -from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU +from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU, CCPGSU from .renyi_neg import RenyiNeg from .fisher_rao import FisherRao from .verbalized_1s import Verbalized1S diff --git a/src/lm_polygraph/estimators/claim_conditioned_probability.py b/src/lm_polygraph/estimators/claim_conditioned_probability.py index 5c7b63add..7e2a86d1e 100644 --- a/src/lm_polygraph/estimators/claim_conditioned_probability.py +++ b/src/lm_polygraph/estimators/claim_conditioned_probability.py @@ -20,7 +20,22 @@ def __str__(self): return "CCP" def _reduce(self, logprobs: list[float]): - return np.exp(np.sum(logprobs)) + return np.sum(logprobs) + + def _combine_nli(self, forward: str, backward: str): + """ + Combines two NLI predictions NLI(x, y) and NLI(y, x) into a single prediction. + + Prioritizes "entail" or "contra" if present, otherwise returns "neutral". 
+ """ + if forward == backward: + return forward + if all(x in [forward, backward] for x in ["entail", "contra"]): + return "neutral" + for x in ["entail", "contra"]: + if x in [forward, backward]: + return x + return "neutral" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: words = stats["greedy_tokens"] @@ -42,10 +57,14 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: contra_logprobs, contra_words = [], [] for i in range(len(word_alternatives)): word_alt, logprob = word_alternatives[i] - if i == 0 or word_alternatives_nli[0][i] == "entail": + nli_outcome = self._combine_nli( + word_alternatives_nli[0][i], + word_alternatives_nli[i][0], + ) + if i == 0 or nli_outcome == "entail": entail_logprobs.append(logprob) entail_words.append(word_alt) - elif word_alternatives_nli[0][i] == "contra": + elif nli_outcome == "contra": contra_logprobs.append(logprob) contra_words.append(word_alt) entail_logprob = np.logaddexp.reduce(entail_logprobs) diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py index 8b192c699..b7ab5b6d9 100644 --- a/src/lm_polygraph/estimators/gsu.py +++ b/src/lm_polygraph/estimators/gsu.py @@ -4,6 +4,7 @@ from copy import deepcopy from .estimator import Estimator +from lm_polygraph.estimators.claim_conditioned_probability import ClaimConditionedProbability class MaxprobGSU(Estimator): @@ -257,3 +258,84 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: GSU.append(E_s.mean()) return np.array(GSU) + + +class CCPGSU(Estimator): + def __init__( + self, + verbose: bool = False, + use_log: bool = True, + reverse: bool = False + ): + super().__init__(["sample_sentence_similarity", + "sample_tokens", + "sample_tokens_alternatives", + "sample_tokens_alternatives_nli"], "sequence") + self.verbose = verbose + self.use_log = use_log + self.reverse = reverse + + def __str__(self): + base = "CCPGSU" + if not self.use_log: + base += "_no_log" + if self.reverse: + base += "_reverse" + 
return base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the sentenceSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * corresponding log probabilities in 'sample_log_probs', + * matrix with cross-encoder similarities in 'sample_sentence_similarity' + Returns: + np.ndarray: float sentenceSAR for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + batch_sample_tokens = stats["sample_tokens"] + batch_sample_tokens_alternatives = stats["sample_tokens_alternatives"] + batch_sample_tokens_alternatives_nli = stats["sample_tokens_alternatives_nli"] + + GSU = [] + for sample_sentence_similarity, \ + samples_tokens, \ + samples_tokens_alternatives, \ + samples_tokens_alternatives_nli in zip( + batch_sample_sentence_similarity, + batch_sample_tokens, + batch_sample_tokens_alternatives, + batch_sample_tokens_alternatives_nli + ): + ccps = [] + for sample_tokens, \ + sample_tokens_alternatives, \ + sample_tokens_alternatives_nli in zip( + samples_tokens, + samples_tokens_alternatives, + samples_tokens_alternatives_nli + ): + ccp_stats = { + "greedy_tokens": [sample_tokens], + "greedy_tokens_alternatives": [sample_tokens_alternatives], + "greedy_tokens_alternatives_nli": [sample_tokens_alternatives_nli] + } + ccps.append(ClaimConditionedProbability()(stats=ccp_stats)[0]) + + R_s = ( + ccps + * sample_sentence_similarity + ) + sent_relevance = R_s.sum(-1) + + if self.use_log: + E_s = -np.log(sent_relevance) + else: + E_s = -sent_relevance if self.reverse else sent_relevance + + GSU.append(E_s.mean()) + + return np.array(GSU) diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index f69abd428..1a2ea5721 100644 --- a/src/lm_polygraph/stat_calculators/__init__.py +++ 
b/src/lm_polygraph/stat_calculators/__init__.py @@ -11,6 +11,7 @@ from .entropy import EntropyCalculator from .entropy import SampleEntropyCalculator from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator +from .sample_alternatives_nli import SampleAlternativesNLICalculator from .greedy_alternatives_nli import ( GreedyAlternativesNLICalculator, GreedyAlternativesFactPrefNLICalculator, diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py index 0a4409797..f6ba90c4e 100644 --- a/src/lm_polygraph/stat_calculators/entropy.py +++ b/src/lm_polygraph/stat_calculators/entropy.py @@ -61,7 +61,7 @@ def __init__( top_k: int = None, ): self.top_k = top_k - super().__init__(["sample_entropy"], ["token_distributions"]) + super().__init__(["sample_entropy"], ["sample_tokens_distributions"]) def __call__( self, @@ -71,7 +71,7 @@ def __call__( max_new_tokens: int = 100, **kwargs, ) -> Dict[str, np.ndarray]: - batch_distributions = dependencies["token_distributions"] + batch_distributions = dependencies["sample_tokens_distributions"] entropies = [] for input_distributions in batch_distributions: diff --git a/src/lm_polygraph/stat_calculators/greedy_probs.py b/src/lm_polygraph/stat_calculators/greedy_probs.py index c94468fb5..5c746c1ab 100644 --- a/src/lm_polygraph/stat_calculators/greedy_probs.py +++ b/src/lm_polygraph/stat_calculators/greedy_probs.py @@ -134,14 +134,9 @@ def __call__( seq = sequences[i, idx:].cpu() else: seq = sequences[i, 1:].cpu() - length, text_length = len(seq), len(seq) - for j in range(len(seq)): - if seq[j] == model.tokenizer.eos_token_id: - length = j + 1 - text_length = j - break + length = len(seq) cut_sequences.append(seq[:length].tolist()) - cut_texts.append(model.tokenizer.decode(seq[:text_length])) + cut_texts.append(model.tokenizer.decode(seq[:length], skip_special_tokens=True)) cut_logits.append(logits[i, :length, :].cpu().numpy()) cut_alternatives.append([[] for _ in 
range(length)]) for j in range(length): diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index a5f9dc3d2..2c74f7a1d 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -86,19 +86,21 @@ class SamplingGenerationCalculator(StatCalculator): * probabilities of the sampled tokens generation """ - def __init__(self, samples_n: int = 10): + def __init__(self, samples_n: int = 10, n_alternatives: int = 10): """ Parameters: samples_n (int): number of samples to generate per input text. Default: 10 """ self.samples_n = samples_n + self.n_alternatives = n_alternatives super().__init__( [ "sample_log_probs", "sample_tokens", "sample_texts", "sample_log_likelihoods", - "token_distributions", + "sample_tokens_distributions", + "sample_tokens_alternatives", ], [], ) @@ -155,6 +157,7 @@ def __call__( texts = [[] for _ in range(len(texts))] log_likelihoods = [[] for _ in range(len(texts))] token_distributions = [[] for _ in range(len(texts))] + alternatives = [[] for _ in range(len(texts))] if model.model_type == "Seq2SeqLM": @@ -167,25 +170,39 @@ def __call__( if model.model_type == "CausalLM" else 0 ) - for j in range(len(sequences[i]) - inp_size): + gen_size = len(sequences[i]) - inp_size + sample_alternatives = [[] for _ in range(gen_size)] + for j in range(gen_size): cur_token = sequences[i][j + inp_size].item() log_prob += logits[i][j][cur_token].item() - if cur_token == model.tokenizer.eos_token_id: - break ll.append(logits[i][j][cur_token].item()) toks.append(cur_token) - distributions.append(logits[i][j].cpu().numpy()) + + lt = logits[i][j].cpu().numpy() + distributions.append(lt) + + best_tokens = np.argpartition(lt, -self.n_alternatives) + ln = len(best_tokens) + best_tokens = best_tokens[ln - self.n_alternatives : ln] + for t in best_tokens: + sample_alternatives[j].append((t.item(), lt[t].item())) + sample_alternatives[j].sort( + key=lambda x: x[0] == 
cur_token, + reverse=True, + ) log_likelihoods[int(i / self.samples_n)].append(ll) log_probs[int(i / self.samples_n)].append(log_prob) tokens[int(i / self.samples_n)].append(toks) - texts[int(i / self.samples_n)].append(model.tokenizer.decode(toks)) + texts[int(i / self.samples_n)].append(model.tokenizer.decode(toks, skip_special_tokens=True)) token_distributions[int(i / self.samples_n)].append(distributions) + alternatives[int(i / self.samples_n)].append(sample_alternatives) return { "sample_log_likelihoods": log_likelihoods, "sample_log_probs": log_probs, "sample_tokens": tokens, "sample_texts": texts, - "token_distributions": token_distributions, + "sample_tokens_distributions": token_distributions, + "sample_tokens_alternatives": alternatives, } diff --git a/src/lm_polygraph/stat_calculators/sample_alternatives_nli.py b/src/lm_polygraph/stat_calculators/sample_alternatives_nli.py new file mode 100644 index 000000000..1832278af --- /dev/null +++ b/src/lm_polygraph/stat_calculators/sample_alternatives_nli.py @@ -0,0 +1,107 @@ +import numpy as np + +from typing import Dict, List, Tuple + +from .stat_calculator import StatCalculator +from lm_polygraph.utils.model import WhiteboxModel +from lm_polygraph.utils.deberta import Deberta +from collections import defaultdict +import torch.nn as nn +import string + + +def _eval_nli_model(nli_queue: List[Tuple[str, str]], deberta: Deberta) -> List[str]: + nli_set = list(set(nli_queue)) + + softmax = nn.Softmax(dim=1) + w_probs = defaultdict(lambda: defaultdict(lambda: None)) + for k in range(0, len(nli_set), deberta.batch_size): + batch = nli_set[k : k + deberta.batch_size] + encoded = deberta.deberta_tokenizer.batch_encode_plus( + batch, padding=True, return_tensors="pt" + ).to(deberta.device) + logits = deberta.deberta(**encoded).logits + logits = logits.detach().to(deberta.device) + for (wi, wj), prob in zip(batch, softmax(logits).cpu().detach()): + w_probs[wi][wj] = prob + + classes = [] + for w1, w2 in nli_queue: + pr = 
w_probs[w1][w2] + id = pr.argmax() + ent_id = deberta.deberta.config.label2id["ENTAILMENT"] + contra_id = deberta.deberta.config.label2id["CONTRADICTION"] + if id == ent_id: + str_class = "entail" + elif id == contra_id: + str_class = "contra" + else: + str_class = "neutral" + classes.append(str_class) + return classes + + +class SampleAlternativesNLICalculator(StatCalculator): + def __init__(self, nli_model): + super().__init__( + [ + "sample_tokens_alternatives_nli", + ], + ["sample_tokens_alternatives"], + ) + + self.nli_model = nli_model + + def _strip(self, w: str): + return w.strip(string.punctuation + " \n") + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + **kwargs, + ) -> Dict[str, np.ndarray]: + batch_alternatives = dependencies["sample_tokens_alternatives"] + batch_alternatives_nli = [] + for samples_alternatives in batch_alternatives: + sample_alternatives_nli = [] + for sample_alternatives in samples_alternatives: + nli_matrixes = [] + for w_number, word_alternatives in enumerate(sample_alternatives): + nli_queue = [] + nli_matrix = [ + ["" for _ in range(len(word_alternatives))] + for _ in range(len(word_alternatives)) + ] + if len(word_alternatives) > 0 and not isinstance( + word_alternatives[0][0], + str, + ): + word_alternatives = [ + (model.tokenizer.decode([alt]), prob) + for alt, prob in word_alternatives + ] + words = [self._strip(alt[0]) for alt in word_alternatives] + for wi in words: + nli_queue.append((words[0], wi)) + nli_queue.append((wi, words[0])) + + nli_classes = _eval_nli_model(nli_queue, self.nli_model) + nli_class = defaultdict(lambda: None) + for nli_cl, (w1, w2) in zip(nli_classes, nli_queue): + nli_class[w1, w2] = nli_cl + + for i, wi in enumerate(words): + for j, wj in enumerate(words): + # Only calculate NLI with sample token + if i > 0 and j > 0: + continue + nli_matrix[i][j] = nli_class[wi, wj] + + nli_matrixes.append(nli_matrix) + 
sample_alternatives_nli.append(nli_matrixes) + batch_alternatives_nli.append(sample_alternatives_nli) + + return {"sample_tokens_alternatives_nli": batch_alternatives_nli} diff --git a/src/lm_polygraph/utils/dataset.py b/src/lm_polygraph/utils/dataset.py index 05c79ea1c..49a1c29a4 100644 --- a/src/lm_polygraph/utils/dataset.py +++ b/src/lm_polygraph/utils/dataset.py @@ -184,6 +184,8 @@ def from_datasets( """ dataset_name, dataset = Dataset.load_hf_dataset(dataset_path, split, **kwargs) few_shot_dataset = None + #no_few_shot_x = None + if n_shot > 0: _, few_shot_dataset = Dataset.load_hf_dataset( dataset_path, few_shot_split, **kwargs @@ -417,7 +419,11 @@ def doc_to_text(doc, prompt, i=0): else: x = dataset[x_column] y = dataset[y_column] + + #if no_few_shot_x is None: + # no_few_shot_x = x + #return Dataset(x, y, batch_size, no_few_shot_x) return Dataset(x, y, batch_size) @staticmethod diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index f62da1cba..36e882308 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -480,11 +480,7 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: for key in self.save_stats: if key in batch_stats.keys(): - try: - self.stats[key] += list(batch_stats[key]) - except: - breakpoint() - pass + self.stats[key] += list(batch_stats[key]) for processor in self.processors: processor.on_batch(batch_stats, batch_gen_metrics, batch_estimations) diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index b827b7702..2c7ccd8f8 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -66,13 +66,14 @@ def _register(calculator_class: StatCalculator): _register(EntropyCalculator(top_k=entropy_top_k)) _register(SampleEntropyCalculator(top_k=entropy_top_k)) _register(GreedyLMProbsCalculator()) - _register(SamplingGenerationCalculator()) + 
_register(SamplingGenerationCalculator(n_alternatives=n_ccp_alternatives)) _register(BartScoreCalculator()) _register(ModelScoreCalculator()) _register(EmbeddingsCalculator()) _register(EnsembleTokenLevelDataCalculator()) _register(CrossEncoderSimilarityMatrixCalculator(nli_model=nli_model)) _register(GreedyAlternativesNLICalculator(nli_model=nli_model)) + _register(SampleAlternativesNLICalculator(nli_model=nli_model)) _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model)) _register(ClaimsExtractor(openai_chat=openai_chat, language=language)) _register( From b37936f41f4fd52afc4cd4c03232a9160528ccd7 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 28 Nov 2024 10:35:34 +0400 Subject: [PATCH 34/97] Fix sample entropy --- .../configs/polygraph_eval_xsum_sentsar.yaml | 17 ----------------- src/lm_polygraph/stat_calculators/entropy.py | 16 +++++++++------- 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index 084015edb..b157ac671 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -142,23 +142,6 @@ additional_estimators: use_log: false reverse: true - - module: lm_polygraph.estimators.claim_conditioned_probability - class_name: ClaimConditionedProbability - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: CCPGSU - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: CCPGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: CCPGSU - kwargs: - use_log: false - reverse: true - ignore_exceptions: false batch_size: 1 diff --git a/src/lm_polygraph/stat_calculators/entropy.py b/src/lm_polygraph/stat_calculators/entropy.py index f6ba90c4e..c5956992d 100644 --- a/src/lm_polygraph/stat_calculators/entropy.py +++ b/src/lm_polygraph/stat_calculators/entropy.py @@ -72,11 +72,12 @@ def __call__( 
**kwargs, ) -> Dict[str, np.ndarray]: batch_distributions = dependencies["sample_tokens_distributions"] - entropies = [] + input_entropies = [] for input_distributions in batch_distributions: + sample_entropies = [] for sample_distributions in input_distributions: - sample_entropies = [] + token_entropies = [] for token_dist in sample_distributions: # Convert token_dist to a numpy array first, then to a torch tensor token_dist_tensor = torch.tensor(token_dist) @@ -86,10 +87,11 @@ def __call__( # Calculate entropy using torch's Categorical distribution entropy = torch.distributions.Categorical(logits=token_dist_tensor).entropy() - sample_entropies.append(entropy.item()) + token_entropies.append(entropy.item()) - # Calculate mean entropy for the sample - mean_entropy = torch.mean(torch.tensor(sample_entropies)) if sample_entropies else 0 - entropies.append(mean_entropy.item()) + # Calculate mean entropy for the sample + sample_entropy = torch.mean(torch.tensor(token_entropies)) if token_entropies else 0 + sample_entropies.append(sample_entropy.item()) + input_entropies.append(sample_entropies) - return {"sample_entropy": entropies} + return {"sample_entropy": input_entropies} From 22a3a70db17f5c214585ffeef2bf24ab510f9858 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 28 Nov 2024 11:20:22 +0400 Subject: [PATCH 35/97] Expand configs --- .../configs/polygraph_eval_coqa_sentsar.yaml | 158 +++++++++++++++++ .../polygraph_eval_gsm8k_sentsar_cot.yaml | 162 ++++++++++++++++++ .../configs/polygraph_eval_mmlu_sentsar.yaml | 160 +++++++++++++++++ .../polygraph_eval_triviaqa_sentsar.yaml | 9 +- .../polygraph_eval_wmt14_enfr_sentsar.yaml | 160 +++++++++++++++++ .../polygraph_eval_wmt14_fren_sentsar.yaml | 9 +- .../polygraph_eval_wmt19_deen_sentsar.yaml | 9 +- .../polygraph_eval_wmt19_ende_sentsar.yaml | 159 +++++++++++++++++ .../configs/polygraph_eval_xsum_sentsar.yaml | 9 +- 9 files changed, 831 insertions(+), 4 deletions(-) create mode 100644 
examples/configs/polygraph_eval_coqa_sentsar.yaml create mode 100644 examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml create mode 100644 examples/configs/polygraph_eval_mmlu_sentsar.yaml create mode 100644 examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml create mode 100644 examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml new file mode 100644 index 000000000..5c4c83673 --- /dev/null +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -0,0 +1,158 @@ +hydra: + run: + dir: ${cache_path}/coqa/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +task: qa + +dataset: coqa +text_column: questions +label_column: answers +description: "The following are stories and questions about them. Each story is followed by a question and answer to a given question.\n\nStory: {story}" +prompt: "Question: {question}\nAnswer:{answer}" +train_split: train +eval_split: validation +max_new_tokens: 20 +load_from_disk: false +normalize: true +generation_params: + generate_until: + - "\n" +save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy + - sample_tokens + - sample_tokens_alternatives + - sample_texts + - sample_log_probs + - sample_log_likelihoods + - sample_sentence_similarity + - sample_entropy +entropy_top_k: 50 + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_seq_ue: false +use_tok_ue: false 
+use_ens_ue: false +generation_metrics: null + +additional_estimators: + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.semantic_entropy + class_name: SemanticEntropy + kwargs: {} + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESAR + kwargs: {} 
+ - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: true + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml new file mode 100644 index 000000000..dfd7d072c --- /dev/null +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -0,0 +1,162 @@ +hydra: + run: + dir: ${cache_path}/gsm8k_cot/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +task: qa + +dataset: [gsm8k, main] +text_column: question +label_column: answer +prompt: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\nQ: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\nQ: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39.\n\nQ: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8.\n\nQ: Shawn has five toys. 
For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9.\n\nQ: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29.\n\nQ: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\nQ: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. 
The answer is 8.\n\nQ: {question}\nA:" +train_split: train +few_shot_split: train +eval_split: test +max_new_tokens: 256 +load_from_disk: false +n_shot: 0 +normalize: true +generation_params: + generate_until: + - "\n" +save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy + - sample_tokens + - sample_tokens_alternatives + - sample_texts + - sample_log_probs + - sample_log_likelihoods + - sample_sentence_similarity + - sample_entropy +entropy_top_k: 50 + +target_ignore_regex: "(?s).*#### " +output_ignore_regex: "(?s).*The answer is " + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_seq_ue: false +use_tok_ue: false +use_ens_ue: false +generation_metrics: null + +additional_estimators: + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.semantic_entropy + class_name: SemanticEntropy + kwargs: {} + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: true + + - module: 
lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: true + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml new file mode 100644 index 000000000..6e81f65da --- /dev/null +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -0,0 +1,160 @@ +hydra: + run: + dir: ${cache_path}/mmlu/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +task: qa + +dataset: [cais/mmlu, all] +text_column: question +label_column: answer +description: "The 
following are multiple choice questions (with answers) about {subject}." +prompt: "Q:{question}\nA. {choices[0]}\nB. {choices[1]}\nC. {choices[2]}\nD. {choices[3]}\nAnswer:{answer}" +few_shot_split: dev +train_split: validation +eval_split: test +max_new_tokens: 3 +load_from_disk: false +n_shot: 5 +max_subject_size: 100 +generation_params: + generate_until: + - "\n" +save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy + - sample_tokens + - sample_tokens_alternatives + - sample_texts + - sample_log_probs + - sample_log_likelihoods + - sample_sentence_similarity + - sample_entropy +entropy_top_k: 50 + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_seq_ue: false +use_tok_ue: false +use_ens_ue: false +generation_metrics: null + +additional_estimators: + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.semantic_entropy + class_name: SemanticEntropy + kwargs: {} + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU 
+ kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: true + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 74a06f08d..e4af4a50e 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -1,6 +1,6 @@ hydra: run: - dir: ${cache_path}/${task}/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + dir: ${cache_path}/triviaqa/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} defaults: - model: bloomz-560m @@ 
-27,10 +27,17 @@ generation_params: generate_until: - "\n" save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy - sample_tokens + - sample_tokens_alternatives - sample_texts - sample_log_probs + - sample_log_likelihoods - sample_sentence_similarity + - sample_entropy entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml new file mode 100644 index 000000000..8efef7f56 --- /dev/null +++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml @@ -0,0 +1,160 @@ +hydra: + run: + dir: ${cache_path}/wmt14_enfr/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +device: cpu + +task: nmt + +dataset: [wmt14, fr-en] +text_column: en +label_column: fr +prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n" +train_split: train +eval_split: test +max_new_tokens: 182 +load_from_disk: false +generation_params: + generate_until: + - "\n" +save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy + - sample_tokens + - sample_tokens_alternatives + - sample_texts + - sample_log_probs + - sample_log_likelihoods + - sample_sentence_similarity + - sample_entropy +entropy_top_k: 50 + +source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_ens_ue: false 
+use_seq_ue: false +use_tok_ue: false +generation_metrics: null + +additional_estimators: + - module: lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.semantic_entropy + class_name: SemanticEntropy + kwargs: {} + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: 
MTESAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: true + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 7c2213716..0819ab00b 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -1,6 +1,6 @@ hydra: run: - dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + dir: ${cache_path}/wmt14_fren/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} defaults: - model: bloomz-560m @@ -25,10 +25,17 @@ generation_params: generate_until: - "\n" save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy - sample_tokens + - sample_tokens_alternatives - sample_texts - sample_log_probs + - sample_log_likelihoods - sample_sentence_similarity + - sample_entropy entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index f5d70927a..86afb4328 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -1,6 +1,6 @@ hydra: run: - dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + dir: ${cache_path}/wmt19_deen/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} defaults: - model: bloomz-560m @@ -25,10 +25,17 @@ generation_params: generate_until: - "\n" save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy - sample_tokens 
+ - sample_tokens_alternatives - sample_texts - sample_log_probs + - sample_log_likelihoods - sample_sentence_similarity + - sample_entropy entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml new file mode 100644 index 000000000..7c23dd127 --- /dev/null +++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml @@ -0,0 +1,159 @@ +hydra: + run: + dir: ${cache_path}/wmt19_ende/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + +defaults: + - model: bloomz-560m + - _self_ + +cache_path: ./workdir/output +save_path: '${hydra:run.dir}' + +device: cpu + +task: nmt + +dataset: [wmt19, de-en] +text_column: en +label_column: de +prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n" +train_split: train +eval_split: validation +max_new_tokens: 200 +load_from_disk: false +generation_params: + generate_until: + - "\n" +save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy + - sample_tokens + - sample_tokens_alternatives + - sample_texts + - sample_log_probs + - sample_log_likelihoods + - sample_sentence_similarity + - sample_entropy +entropy_top_k: 50 + +source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" + +train_dataset: null +train_test_split: false +test_split_size: 1 + +background_train_dataset: allenai/c4 +background_train_dataset_text_column: text +background_train_dataset_label_column: url +background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz +background_load_from_disk: false + +subsample_background_train_dataset: 1000 +subsample_train_dataset: 1000 +subsample_eval_dataset: -1 + +use_density_based_ue: false +use_ens_ue: false +use_seq_ue: false +use_tok_ue: false + +additional_estimators: + - module: 
lm_polygraph.estimators.monte_carlo_sequence_entropy + class_name: MonteCarloSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy + class_name: MonteCarloNormalizedSequenceEntropy + kwargs: {} + - module: lm_polygraph.estimators.semantic_entropy + class_name: SemanticEntropy + kwargs: {} + + - module: lm_polygraph.estimators.max_probability + class_name: MaximumSequenceProbability + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: SentenceSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MaxprobGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_sar + class_name: TokenSAR + kwargs: {} + - module: lm_polygraph.estimators.sar + class_name: SAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: TokenSARGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.perplexity + class_name: Perplexity + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: PPLSAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: PPLGSU + kwargs: + use_log: false + reverse: true + + - module: lm_polygraph.estimators.token_entropy + class_name: MeanTokenEntropy + kwargs: {} + - module: lm_polygraph.estimators.sentence_sar + class_name: MTESAR + kwargs: {} + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: {} + - 
module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: false + - module: lm_polygraph.estimators.gsu + class_name: MTEGSU + kwargs: + use_log: false + reverse: true + +ignore_exceptions: false + +batch_size: 1 +deberta_batch_size: 1 + +seed: + - 1 diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index b157ac671..ac69dc14c 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -1,6 +1,6 @@ hydra: run: - dir: ${cache_path}/${task}/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} + dir: ${cache_path}/xsum/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} defaults: - model: bloomz-560m @@ -26,10 +26,17 @@ generation_params: generate_until: - "\n" save_stats: + - greedy_tokens + - greedy_log_likelihoods + - greedy_tokens_alternatives + - entropy - sample_tokens + - sample_tokens_alternatives - sample_texts - sample_log_probs + - sample_log_likelihoods - sample_sentence_similarity + - sample_entropy entropy_top_k: 50 train_dataset: null From 229502874518510a789c51b790d6b750e61954be Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 28 Nov 2024 12:43:00 +0400 Subject: [PATCH 36/97] Use only PRR --- scripts/polygraph_eval | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index ccf079f9e..5cb50e829 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -212,10 +212,10 @@ def main(args): def get_ue_metrics(args): ue_metrics = [ - ReversedPairsProportion(), + #ReversedPairsProportion(), PredictionRejectionArea(), PredictionRejectionArea(max_rejection=0.5), - RiskCoverageCurveAUC(), + #RiskCoverageCurveAUC(), ] if getattr(args, "use_claim_ue", False): ue_metrics += [ From ef8258b97db73b6a3cfbad1a16899d349eed058c Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 4 Dec 2024 13:32:55 +0400 Subject: 
[PATCH 37/97] Simplify and unify GSU --- src/lm_polygraph/estimators/gsu.py | 104 +++++------------------------ src/lm_polygraph/utils/manager.py | 12 ++-- 2 files changed, 24 insertions(+), 92 deletions(-) diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py index b7ab5b6d9..8aae841d1 100644 --- a/src/lm_polygraph/estimators/gsu.py +++ b/src/lm_polygraph/estimators/gsu.py @@ -11,21 +11,12 @@ class MaxprobGSU(Estimator): def __init__( self, verbose: bool = False, - use_log: bool = True, - reverse: bool = False ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose - self.use_log = use_log - self.reverse = reverse def __str__(self): - base = "MaxprobGSU" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base + return "MaxprobGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -46,17 +37,12 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_probs, sample_sentence_similarity in zip( batch_sample_log_probs, batch_sample_sentence_similarity ): - sample_probs = np.exp(np.array(sample_log_probs)) + sample_probs = -np.exp(np.array(sample_log_probs)) R_s = ( sample_probs * sample_sentence_similarity ) - sent_relevance = R_s.sum(-1) - - if self.use_log: - E_s = -np.log(sent_relevance) - else: - E_s = -sent_relevance if self.reverse else sent_relevance + E_s = R_s.sum(-1) GSU.append(E_s.mean()) @@ -66,22 +52,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: class PPLGSU(Estimator): def __init__( self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False + verbose: bool = False ): super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose - self.use_log = use_log - self.reverse = reverse def __str__(self): - base = "PPLGSU" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - 
return base + return "PPLGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -102,18 +79,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_likelihoods, sample_sentence_similarity in zip( batch_sample_log_likelihoods, batch_sample_sentence_similarity ): - ppl = np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) R_s = ( ppl * sample_sentence_similarity ) - sent_relevance = R_s.sum(-1) - - if self.use_log: - E_s = -np.log(sent_relevance) - else: - E_s = -sent_relevance if self.reverse else sent_relevance + E_s = R_s.sum(-1) GSU.append(E_s.mean()) @@ -123,10 +95,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: class TokenSARGSU(Estimator): def __init__( self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False - ): + verbose: bool = False): super().__init__( [ "sample_sentence_similarity", @@ -136,16 +105,9 @@ def __init__( "sequence", ) self.verbose = verbose - self.use_log = use_log - self.reverse = reverse def __str__(self): - base = "TokenSARGSU" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base + return "TokenSARGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -185,17 +147,12 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: tokenSAR.append(E_t.sum()) tokenSAR = np.array(tokenSAR) - probs_token_sar = np.exp(-tokenSAR) + probs_token_sar = -np.exp(-tokenSAR) R_s = ( probs_token_sar * sample_sentence_similarity ) - sent_relevance = R_s.sum(-1) - E_s = -np.log(sent_relevance) - if self.use_log: - E_s = -np.log(sent_relevance) - else: - E_s = -sent_relevance if self.reverse else sent_relevance + E_s = R_s.sum(-1) GSU.append(E_s.mean()) @@ -205,22 +162,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: class MTEGSU(Estimator): def __init__( self, - verbose: bool = False, - 
use_log: bool = True, - reverse: bool = False + verbose: bool = False ): super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") self.verbose = verbose - self.use_log = use_log - self.reverse = reverse def __str__(self): - base = "MTEGSU" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base + return "MTEGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -247,13 +195,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: R_s = sample_entropy * sample_sentence_similarity # Compute sentence relevance by summing along the last axis - sent_relevance = R_s.sum(-1) - - # Calculate E_s with options for log transformation and reversal - if self.use_log: - E_s = -np.log(sent_relevance) - else: - E_s = -sent_relevance if self.reverse else sent_relevance + E_s = R_s.sum(-1) GSU.append(E_s.mean()) @@ -263,25 +205,16 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: class CCPGSU(Estimator): def __init__( self, - verbose: bool = False, - use_log: bool = True, - reverse: bool = False + verbose: bool = False ): super().__init__(["sample_sentence_similarity", "sample_tokens", "sample_tokens_alternatives", "sample_tokens_alternatives_nli"], "sequence") self.verbose = verbose - self.use_log = use_log - self.reverse = reverse def __str__(self): - base = "CCPGSU" - if not self.use_log: - base += "_no_log" - if self.reverse: - base += "_reverse" - return base + return "CCPGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -331,11 +264,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ) sent_relevance = R_s.sum(-1) - if self.use_log: - E_s = -np.log(sent_relevance) - else: - E_s = -sent_relevance if self.reverse else sent_relevance - GSU.append(E_s.mean()) return np.array(GSU) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 36e882308..b21d7d9aa 100644 --- 
a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -514,7 +514,15 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: torch.cuda.empty_cache() gc.collect() + + self.eval_ue() + for processor in self.processors: + processor.on_eval(self.metrics, self.total_bad_estimators) + + return self.metrics + + def eval_ue(self): for (e_level, e_name), estimator_values in self.estimations.items(): for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): for ue_metric in self.ue_metrics: @@ -541,10 +549,6 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: e_level, e_name, gen_name, str(ue_metric) + "_normalized" ] = normalize_metric(ue_metric_val, oracle_score, random_score) - for processor in self.processors: - processor.on_eval(self.metrics, self.total_bad_estimators) - - return self.metrics def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> dict: """ From a79620e415f2fea413317e5a1e64e8a787580b94 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 4 Dec 2024 13:39:48 +0400 Subject: [PATCH 38/97] Add missing stats to yamls --- examples/configs/polygraph_eval_coqa_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml | 2 ++ examples/configs/polygraph_eval_mmlu_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_triviaqa_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_xsum_sentsar.yaml | 2 ++ 9 files changed, 18 insertions(+) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 5c4c83673..4c2e28160 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -28,6 +28,7 @@ 
save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -35,6 +36,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index dfd7d072c..3db79b51b 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -29,6 +29,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -36,6 +37,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 6e81f65da..713f536cb 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -30,6 +30,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -37,6 +38,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index e4af4a50e..94be9806b 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -30,6 +30,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - 
sample_tokens_alternatives @@ -37,6 +38,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml index 8efef7f56..d86071a11 100644 --- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml @@ -28,6 +28,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -35,6 +36,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 0819ab00b..47dbdc3f3 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -28,6 +28,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -35,6 +36,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 86afb4328..cc19da68f 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -28,6 +28,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -35,6 +36,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - 
sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml index 7c23dd127..4b0099411 100644 --- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml @@ -28,6 +28,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -35,6 +36,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index ac69dc14c..f308fbefb 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -29,6 +29,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - token_similarity - entropy - sample_tokens - sample_tokens_alternatives @@ -36,6 +37,7 @@ save_stats: - sample_log_probs - sample_log_likelihoods - sample_sentence_similarity + - sample_token_similarity - sample_entropy entropy_top_k: 50 From 8ecc1d56891e81bbba3353c5666d1b9d4c6fd0f4 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 4 Dec 2024 15:10:08 +0400 Subject: [PATCH 39/97] Add tqdm to ce similarity --- .../stat_calculators/cross_encoder_similarity.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py b/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py index d6faa20bb..ada5c9136 100644 --- a/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py +++ b/src/lm_polygraph/stat_calculators/cross_encoder_similarity.py @@ -2,6 +2,7 @@ import itertools from typing import Dict, List +from tqdm 
import tqdm from .stat_calculator import StatCalculator from sentence_transformers import CrossEncoder @@ -67,7 +68,7 @@ def __call__( batch_counts.append(len(unique_texts)) batch_token_scores = [] - for input_texts, tokens in zip(batch_input_texts, batch_greedy_tokens): + for input_texts, tokens in tqdm(zip(batch_input_texts, batch_greedy_tokens)): if len(tokens) > 1: is_special_tokens = np.isin(tokens, special_tokens) cropped_tokens = list(itertools.combinations(tokens, len(tokens) - 1))[ @@ -96,7 +97,7 @@ def __call__( batch_token_scores.append(token_scores) sim_matrices = [] - for i, pairs in enumerate(batch_pairs): + for i, pairs in tqdm(enumerate(batch_pairs)): sim_scores = self.crossencoder.predict(pairs, batch_size=deberta_batch_size) unique_mat_shape = (batch_counts[i], batch_counts[i]) @@ -109,7 +110,7 @@ def __call__( sim_matrices = np.stack(sim_matrices) batch_samples_token_scores = [] - for sample_tokens, input_texts in zip(batch_sample_tokens, batch_input_texts): + for sample_tokens, input_texts in tqdm(zip(batch_sample_tokens, batch_input_texts)): samples_token_scores = [] for tokens in sample_tokens: if len(tokens) > 1: From 6f32205044511de147fc49952c2330b7beff2f68 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 4 Dec 2024 16:06:12 +0400 Subject: [PATCH 40/97] Add model config for llama --- examples/configs/model/llama.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 examples/configs/model/llama.yaml diff --git a/examples/configs/model/llama.yaml b/examples/configs/model/llama.yaml new file mode 100644 index 000000000..0d1870443 --- /dev/null +++ b/examples/configs/model/llama.yaml @@ -0,0 +1,11 @@ +defaults: + - default + +path: meta-llama/Meta-Llama-3.1-8B +type: CausalLM +path_to_load_script: model/default_causal.py + +load_model_args: + device_map: balanced_low_0 + dtype: bfloat16 +load_tokenizer_args: {} From 457f94bcd39c3e532750a99e3e79c9fb1c417fb4 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 5 Dec 
2024 13:06:39 +0400 Subject: [PATCH 41/97] Add dtype to load args --- examples/configs/model/default_causal.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/configs/model/default_causal.py b/examples/configs/model/default_causal.py index 956a71dde..c77a317fc 100644 --- a/examples/configs/model/default_causal.py +++ b/examples/configs/model/default_causal.py @@ -1,9 +1,10 @@ from transformers import AutoModelForCausalLM, AutoTokenizer +import torch - -def load_model(model_path: str, device_map: str): +def load_model(model_path: str, device_map: str, dtype: str = "float32"): + dtype = getattr(torch, dtype) model = AutoModelForCausalLM.from_pretrained( - model_path, trust_remote_code=True, device_map=device_map + model_path, trust_remote_code=True, device_map=device_map, torch_dtype=dtype ) model.eval() From 1e1a3c86d28961c5f9746a3264052a8cb51d259a Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 5 Dec 2024 20:55:57 +0400 Subject: [PATCH 42/97] Remove redundant methods --- .../configs/polygraph_eval_coqa_sentsar.yaml | 39 ------------------ .../polygraph_eval_gsm8k_sentsar_cot.yaml | 40 ------------------- .../configs/polygraph_eval_mmlu_sentsar.yaml | 40 ------------------- .../polygraph_eval_triviaqa_sentsar.yaml | 40 ------------------- .../polygraph_eval_wmt14_enfr_sentsar.yaml | 39 ------------------ .../polygraph_eval_wmt14_fren_sentsar.yaml | 40 ------------------- .../polygraph_eval_wmt19_deen_sentsar.yaml | 40 ------------------- .../polygraph_eval_wmt19_ende_sentsar.yaml | 40 ------------------- .../configs/polygraph_eval_xsum_sentsar.yaml | 40 ------------------- 9 files changed, 358 deletions(-) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 4c2e28160..9e75ed4af 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -80,16 +80,6 @@ additional_estimators: - module: 
lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -101,15 +91,6 @@ additional_estimators: class_name: TokenSARGSU kwargs: {} - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -120,16 +101,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -140,16 +111,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index 3db79b51b..b4eea7dcd 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -84,16 +84,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - 
class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -104,16 +94,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -124,16 +104,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -144,16 +114,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 713f536cb..755da5c74 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -82,16 +82,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -102,16 +92,6 @@ additional_estimators: - module: 
lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -122,16 +102,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -142,16 +112,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 94be9806b..1b40fb1b9 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -83,16 +83,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -103,16 +93,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - 
module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -123,16 +103,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -143,16 +113,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml index d86071a11..b3505e7cc 100644 --- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml @@ -82,16 +82,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -103,15 +93,6 @@ additional_estimators: class_name: TokenSARGSU kwargs: {} - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -122,16 +103,6 @@ 
additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -142,16 +113,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 47dbdc3f3..b471f29f8 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -82,16 +82,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -102,16 +92,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -122,16 +102,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - 
use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -142,16 +112,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index cc19da68f..c9658242b 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -81,16 +81,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -101,16 +91,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -121,16 +101,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: 
lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -141,16 +111,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml index 4b0099411..afd8f28f3 100644 --- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml @@ -81,16 +81,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -101,16 +91,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -121,16 +101,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -141,16 +111,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU 
kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: true ignore_exceptions: false diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index f308fbefb..af828f29c 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -80,16 +80,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MaxprobGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_sar class_name: TokenSAR @@ -100,16 +90,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.perplexity class_name: Perplexity @@ -120,16 +100,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: PPLGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: - use_log: false - reverse: true - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy @@ -140,16 +110,6 @@ additional_estimators: - module: lm_polygraph.estimators.gsu class_name: MTEGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - use_log: false - reverse: false - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: - 
use_log: false - reverse: true ignore_exceptions: false From 41a0849bb031791ee68cca87f83f2e15991b688f Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 10 Dec 2024 12:56:50 +0400 Subject: [PATCH 43/97] Add possibility of continuing estimation from saved manager, sampled versions of single-seq methods --- .../configs/polygraph_eval_xsum_sentsar.yaml | 28 +++ scripts/polygraph_eval | 117 ++++++++--- src/lm_polygraph/estimators/__init__.py | 1 + src/lm_polygraph/estimators/average_ue.py | 129 ++++++++++++ .../estimators/max_probability.py | 28 +++ src/lm_polygraph/estimators/perplexity.py | 12 ++ src/lm_polygraph/estimators/token_entropy.py | 28 +++ src/lm_polygraph/estimators/token_sar.py | 53 +++++ .../cross_encoder_similarity.py | 1 + src/lm_polygraph/utils/manager.py | 185 ++++++++---------- src/lm_polygraph/utils/processor.py | 2 +- 11 files changed, 457 insertions(+), 127 deletions(-) create mode 100644 src/lm_polygraph/estimators/average_ue.py diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index af828f29c..a04c9c672 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -13,6 +13,9 @@ device: cpu task: ats +base_manager: null +overwrite_base_estimations: false + dataset: xsum text_column: document label_column: summary @@ -74,6 +77,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -84,6 +90,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ 
-94,6 +103,9 @@ additional_estimators: - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -104,6 +116,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -111,6 +126,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 5cb50e829..2944cf21e 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -183,27 +183,50 @@ def main(args): generation_metrics = get_generation_metrics(args) ue_metrics = get_ue_metrics(args) - - man = UEManager( - dataset, - model, - estimators, - generation_metrics, - ue_metrics, - [ - Logger(), - ], - deberta_batch_size=getattr(args, 'deberta_batch_size', 10), - train_data=train_dataset, - ignore_exceptions=args.ignore_exceptions, - background_train_data=background_train_dataset, - max_new_tokens=args.max_new_tokens, - ensemble_model=ensemble_model, - cache_path=args.cache_path, - language=getattr(args, 'language', 'en'), - save_stats=getattr(args, 'save_stats', []), - entropy_top_k=getattr(args, 'entropy_top_k', None), - ) + + if getattr(args, "base_manager", None) is None: + man = UEManager( + dataset, + model, + estimators, + generation_metrics, + 
ue_metrics, + [ + Logger(), + ], + batch_size=args.batch_size, + deberta_batch_size=getattr(args, 'deberta_batch_size', 10), + train_data=train_dataset, + ignore_exceptions=args.ignore_exceptions, + background_train_data=background_train_dataset, + max_new_tokens=args.max_new_tokens, + ensemble_model=ensemble_model, + cache_path=args.cache_path, + language=getattr(args, 'language', 'en'), + save_stats=getattr(args, 'save_stats', []), + entropy_top_k=getattr(args, 'entropy_top_k', None), + ) + else: + man = UEManager.load( + args.base_manager, + data = dataset, + model = model, + estimators = estimators, + generation_metrics = generation_metrics, + ue_metrics = ue_metrics, + processors = [Logger()], + batch_size=args.batch_size, + deberta_batch_size=getattr(args, 'deberta_batch_size', 10), + train_data=train_dataset, + ignore_exceptions=args.ignore_exceptions, + background_train_data=background_train_dataset, + max_new_tokens=args.max_new_tokens, + ensemble_model=ensemble_model, + cache_path=args.cache_path, + language=getattr(args, 'language', 'en'), + save_stats=getattr(args, 'save_stats', []), + entropy_top_k=getattr(args, 'entropy_top_k', None), + ) man() @@ -267,10 +290,18 @@ def get_density_based_ue_methods(args, model_type): def get_ue_methods(args, model): + if getattr(args, "base_manager", None) is not None: + base_manager = UEManager.load(args.base_manager) + existing_estimators = list(base_manager.estimations.keys()) + else: + existing_estimators = [] + + overwrite = getattr(args, "overwrite_base_estimations", False) + estimators = [] if getattr(args.model, "type", "Whitebox") == "Blackbox": if getattr(args, "use_seq_ue", False): - estimators += [ + bb_estimators = [ LexicalSimilarity(metric="rouge1"), LexicalSimilarity(metric="rouge2"), LexicalSimilarity(metric="rougeL"), @@ -287,6 +318,10 @@ def get_ue_methods(args, model): Eccentricity(similarity_score="Jaccard_score"), ] + for estimator in bb_estimators: + if overwrite or ('sequence', str(estimator)) 
not in existing_estimators: + estimators.append(estimator) + if getattr(args, "use_ens_ue", False): raise NotImplementedError('Ensemble UE methods not applicable for blackbox models') @@ -297,7 +332,7 @@ def get_ue_methods(args, model): raise NotImplementedError('Claim UE methods not applicable for blackbox models') else: if getattr(args, "use_seq_ue", False): - estimators += [ + wb_estimators = [ MaximumSequenceProbability(), Perplexity(), MeanTokenEntropy(), @@ -330,6 +365,10 @@ def get_ue_methods(args, model): FisherRao(), ] + for estimator in wb_estimators: + if overwrite or ('sequence', str(estimator)) not in existing_estimators: + estimators.append(estimator) + if getattr(args, "use_ens_ue", False): # Ensemble-based UE methods have been disabled due to dependency on old # transformers code, which prevents bumping transformers version in @@ -350,7 +389,7 @@ def get_ue_methods(args, model): #estimators += (token_measures + sequence_measures) if getattr(args, "use_tok_ue", False): - estimators += [ + tok_estimators = [ MaximumTokenProbability(), TokenEntropy(), PointwiseMutualInformation(), @@ -358,8 +397,12 @@ def get_ue_methods(args, model): SemanticEntropyToken(model.model_path, args.cache_path), ] + for estimator in tok_estimators: + if overwrite or ('token', str(estimator)) not in existing_estimators: + estimators.append(estimator) + if getattr(args, "use_claim_ue", False): - estimators += [ + claim_estimators = [ MaximumClaimProbability(), PerplexityClaim(), MaxTokenEntropyClaim(), @@ -369,12 +412,19 @@ def get_ue_methods(args, model): ClaimConditionedProbabilityClaim(nli_context="fact_pref"), ] + for estimator in claim_estimators: + if overwrite or ('claim', str(estimator)) not in existing_estimators: + estimators.append(estimator) + additional_estimators = getattr(args, "additional_estimators", {}) for estimator_args in additional_estimators: module = importlib.import_module(estimator_args.module) estimator_class = getattr(module, 
estimator_args.class_name) - estimators.append(estimator_class(**estimator_args.kwargs)) + estimator = estimator_class(**estimator_args.kwargs) + # Additional estimator filtering only works correctly for sequence-level estimators + if overwrite or ('sequence', str(estimator)) not in existing_estimators: + estimators.append(estimator_class(**estimator_args.kwargs)) return estimators @@ -383,6 +433,12 @@ def get_generation_metrics(args): log.info("="*100) log.info("Initializing generation metrics...") + if getattr(args, "base_manager", None) is not None: + base_manager = UEManager.load(args.base_manager) + existing_metrics = list(base_manager.gen_metrics.keys()) + else: + existing_metrics = [] + generation_metrics = getattr(args, "generation_metrics", None) if not generation_metrics: result = [ @@ -415,6 +471,15 @@ def get_generation_metrics(args): metric_args = metric.get("args", []) result.append(metric_class(*metric_args)) + # Filter out metrics that are already present in the base manager + filtered_result = [] + for metric in result: + if (metric.level, str(metric)) in existing_metrics: + log.warning(f"Skipping metric {metric} as it is already present in the base manager.") + else: + filtered_result.append(metric) + result = filtered_result + process_output_fn = getattr(args, "process_output_fn", None) process_target_fn = getattr(args, "process_target_fn", None) if process_target_fn or process_output_fn: diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index ee51c6b77..5f75d7c74 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -79,3 +79,4 @@ from .linguistic_1s import Linguistic1S from .label_prob import LabelProb from .p_true_empirical import PTrueEmpirical +from .average_ue import AveMaxprob diff --git a/src/lm_polygraph/estimators/average_ue.py b/src/lm_polygraph/estimators/average_ue.py new file mode 100644 index 000000000..b03147a1f --- /dev/null +++ 
b/src/lm_polygraph/estimators/average_ue.py @@ -0,0 +1,129 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator + + +class AveMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + + def __str__(self): + return "AveMaxprob" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = -np.exp(np.array(sample_log_probs)) + + ave.append(sample_probs.mean()) + + return np.array(ave) + +class AvePPL(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "AvePPL" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + + ave.append(ppl.mean()) + + return np.array(ave) + +class AveTokenSAR(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__( + [ + "sample_sentence_similarity", + "sample_log_likelihoods", + "sample_token_similarity", + ], + "sequence", + ) + self.verbose = verbose + + def __str__(self): + return "AveTokenSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = 
stats["sample_log_likelihoods"] + batch_sample_token_similarity = stats["sample_token_similarity"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for batch_data in zip( + batch_sample_log_likelihoods, + batch_sample_token_similarity, + batch_sample_sentence_similarity, + ): + sample_log_likelihoods = batch_data[0] + sample_token_similarity = batch_data[1] + sample_sentence_similarity = batch_data[2] + + tokenSAR = [] + for log_likelihoods, token_similarity in zip( + sample_log_likelihoods, sample_token_similarity + ): + log_likelihoods = np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + tokenSAR = np.array(tokenSAR) + probs_token_sar = -np.exp(-tokenSAR) + ave.append(probs_token_sar.mean()) + + return np.array(ave) + +class AveMTE(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "AveMTE" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + ave.append(np.mean(sample_entropy)) + + return np.array(ave) diff --git a/src/lm_polygraph/estimators/max_probability.py b/src/lm_polygraph/estimators/max_probability.py index 1d93b5e3c..b8fe2afde 100644 --- a/src/lm_polygraph/estimators/max_probability.py +++ b/src/lm_polygraph/estimators/max_probability.py @@ -33,6 +33,34 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: log_likelihoods = stats["greedy_log_likelihoods"] return np.array([-np.sum(log_likelihood) for log_likelihood in log_likelihoods]) +class SampledMaximumSequenceProbability(Estimator): 
+ """ + Estimates the sequence-level uncertainty of a language model by calculating the + log-probability of the generation with minus sign. + It is calculated as the sum of log-probabilities in each token. + Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel). + """ + + def __init__(self): + super().__init__(["sample_log_probs"], "sequence") + + def __str__(self): + return "SampledMaximumSequenceProbability" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the minus log-probability of each sample in input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * log p(y_i | y_ np.ndarray: log_likelihoods = stats["greedy_log_likelihoods"] return np.array([-np.mean(ll) for ll in log_likelihoods]) + +class SampledPerplexity(Estimator): + def __init__(self): + super().__init__(["sample_log_likelihoods"], "sequence") + + def __str__(self): + return "SampledPerplexity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + log_likelihoods = stats["sample_log_likelihoods"] + ppl = [np.mean(sample_log_likelihoods[0]) for sample_log_likelihoods in log_likelihoods] + return -np.array(ppl) diff --git a/src/lm_polygraph/estimators/token_entropy.py b/src/lm_polygraph/estimators/token_entropy.py index fc87cc77c..9e1d080dd 100644 --- a/src/lm_polygraph/estimators/token_entropy.py +++ b/src/lm_polygraph/estimators/token_entropy.py @@ -33,6 +33,34 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array([np.mean(e) for e in entropy]) +class SampledMeanTokenEntropy(Estimator): + """ + Estimates the sequence-level uncertainty of a language model by calculating the + mean entropy among all tokens in the generation. + Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel). 
+ """ + + def __init__(self): + super().__init__(["sample_entropy"], "sequence") + + def __str__(self): + return "SampledMeanTokenEntropy" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the mean token entropy for each sample in input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * Entropy(* | y_ np.ndarray: tokenSAR.append(E_t.sum()) return np.array(tokenSAR) + + +class SampledTokenSAR(Estimator): + """ + Estimates the sequence-level uncertainty of a language model following the method of + "Token SAR" as provided in the paper https://arxiv.org/abs/2307.01379. + Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel). + + This method calculates the weighted sum of log_likelihoods with weights computed using token relevance. + """ + + def __init__(self, verbose: bool = False): + super().__init__(["sample_token_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "SampledTokenSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the tokenSAR for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * log p(y_i | y_ Dict[Tuple[str, str, str, str], float]: """ @@ -431,22 +442,13 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: - generation metrics name, - `ue_metrics` name which was used to calculate quality. 
""" - + self.prepare_calculators() train_stats = self._extract_train_embeddings() background_train_stats = self._extract_train_embeddings(background=True) iterable_data = tqdm(self.data) if self.verbose else self.data for batch_i, (inp_texts, target_texts) in enumerate(iterable_data): - batch_stats: Dict[str, np.ndarray] = {} - for key, val in [ - ("input_texts", inp_texts), - ("target_texts", target_texts), - ]: - self.stats[key] += val - batch_stats[key] = val - batch_stats["model"] = self.model - - batch_stats["model"] = self.model + batch_stats = self.initiate_batch_stats(batch_i, inp_texts, target_texts) train_stats_keys = list(train_stats.keys()) for stat in train_stats_keys: @@ -455,8 +457,10 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: background_train_stats_keys = list(background_train_stats.keys()) for stat in background_train_stats_keys: batch_stats[stat] = background_train_stats.pop(stat) - + + old_stats = set(batch_stats.keys()) batch_stats = self.calculate(batch_stats, self.stat_calculators, inp_texts) + new_stats = set(batch_stats.keys()) - old_stats batch_estimations, bad_estimators = self.estimate( batch_stats, self.estimators @@ -479,42 +483,14 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: batch_gen_metrics[generation_metric.level, str(generation_metric)] += m for key in self.save_stats: - if key in batch_stats.keys(): + if key in new_stats: self.stats[key] += list(batch_stats[key]) for processor in self.processors: processor.on_batch(batch_stats, batch_gen_metrics, batch_estimations) - if self.ensemble_model is not None: - iterable_data = tqdm(self.data) if self.verbose else self.data - for batch_i, (inp_texts, target_texts) in enumerate(iterable_data): - batch_stats: Dict[str, np.ndarray] = {} - for key, val in [ - ("input_texts", inp_texts), - ("target_texts", target_texts), - ("model", self.model), - ]: - batch_stats[key] = val - - batch_stats["ensemble_generation_params"] = {} - 
batch_stats["ensemble_model"] = self.ensemble_model - - batch_stats = self.calculate( - batch_stats, self.ensemble_stat_calculators, inp_texts - ) - - batch_estimations, bad_estimators = self.estimate( - batch_stats, self.ensemble_estimators - ) - - for bad_estimator in bad_estimators: - key = (bad_estimator.level, str(bad_estimator)) - self.estimations.pop(key, None) - self.ensemble_estimators.remove(bad_estimator) - self.total_bad_estimators[bad_estimator] = batch_i - torch.cuda.empty_cache() gc.collect() - + self.eval_ue() for processor in self.processors: @@ -703,7 +679,7 @@ def save(self, save_path: str): ) @staticmethod - def load(load_path: str) -> "UEManager": + def load(load_path: str, **kwargs) -> "UEManager": """ Loads UEManager from the specified path. To save the calculated manager results, see UEManager.save(). @@ -711,7 +687,16 @@ def load(load_path: str) -> "UEManager": load_path (str): Path to file with saved benchmark results to load. """ res_dict = torch.load(load_path) - man = UEManager(None, None, [], [], [], []) + default_kwargs = { + "data": None, + "model": None, + "estimators": [], + "generation_metrics": [], + "ue_metrics": [], + "processors": [], + } + default_kwargs.update(kwargs) + man = UEManager(**default_kwargs) man.metrics = res_dict.get("metrics", None) man.gen_metrics = res_dict.get("gen_metrics", None) man.estimations = res_dict.get("estimations", None) diff --git a/src/lm_polygraph/utils/processor.py b/src/lm_polygraph/utils/processor.py index 424df271c..49908dea9 100644 --- a/src/lm_polygraph/utils/processor.py +++ b/src/lm_polygraph/utils/processor.py @@ -61,7 +61,7 @@ def on_batch( for key, val in batch_stats.items(): str_repr = str(val) # to skip large outputs - if len(str_repr) < 10000 and str_repr.count("\n") < 10: + if len(str_repr) < 10000 and str_repr.count("\n") < 20: print(f"{key}: {val}") print() print("-" * 100) From f570400f780d2ebcd8c7f0dec661da0dedea9fdd Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 11 
Dec 2024 14:58:27 +0400 Subject: [PATCH 44/97] Add quality metrics based off first sample generated --- .../configs/polygraph_eval_xsum_sentsar.yaml | 2 +- scripts/polygraph_eval | 18 ++++++++++--- .../generation_metrics/accuracy.py | 17 +++++++++--- .../generation_metrics/alignscore.py | 19 +++++++++++--- src/lm_polygraph/generation_metrics/bleu.py | 17 +++++++++--- src/lm_polygraph/generation_metrics/comet.py | 18 ++++++++++--- src/lm_polygraph/generation_metrics/rouge.py | 17 +++++++++--- src/lm_polygraph/stat_calculators/__init__.py | 2 +- src/lm_polygraph/stat_calculators/sample.py | 26 +++++++++++++++++++ .../utils/register_stat_calculators.py | 1 + 10 files changed, 116 insertions(+), 21 deletions(-) diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index a04c9c672..4c4ea0daf 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -13,7 +13,7 @@ device: cpu task: ats -base_manager: null +base_manager: /Users/romanvashurin/workspace/sar_enhancements/gsu/mistral7b_xsum.man overwrite_base_estimations: false dataset: xsum diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 2944cf21e..ba60b5577 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -446,8 +446,6 @@ def get_generation_metrics(args): RougeMetric("rouge2"), RougeMetric("rougeL"), BLEUMetric(), - BertScoreMetric('rh'), - SbertMetric(), AccuracyMetric( target_ignore_regex = getattr(args, "target_ignore_regex", None), output_ignore_regex = getattr(args, "output_ignore_regex", None), @@ -456,13 +454,27 @@ def get_generation_metrics(args): AlignScore(), AlignScore(target_is_claims=False), AlignScore(ignore_target=True), + RougeMetric("rouge1", sample=True), + RougeMetric("rouge2", sample=True), + RougeMetric("rougeL", sample=True), + BLEUMetric(sample=True), + AccuracyMetric( + target_ignore_regex = getattr(args, "target_ignore_regex", None), + 
output_ignore_regex = getattr(args, "output_ignore_regex", None), + normalize = getattr(args, "normalize", False), + sample=True, + ), + AlignScore(sample=True), + AlignScore(target_is_claims=False, sample=True), + AlignScore(ignore_target=True, sample=True), ] if getattr(args.model, "type", "Whitebox") != "Blackbox": if getattr(args, "use_claim_ue", False): result += [OpenAIFactCheck(cache_path=args.cache_path, language=getattr(args, "language", "en"))] if args.task == "nmt": ignore_regex = getattr(args, "source_ignore_regex", None) - result += [Comet(source_ignore_regex = ignore_regex)] + result += [Comet(source_ignore_regex = ignore_regex), + Comet(source_ignore_regex = ignore_regex, sample=True)] else: result = [] for metric in generation_metrics: diff --git a/src/lm_polygraph/generation_metrics/accuracy.py b/src/lm_polygraph/generation_metrics/accuracy.py index 5c2478efb..a71c1b989 100644 --- a/src/lm_polygraph/generation_metrics/accuracy.py +++ b/src/lm_polygraph/generation_metrics/accuracy.py @@ -16,9 +16,13 @@ class AccuracyMetric(GenerationMetric): """ def __init__( - self, target_ignore_regex=None, output_ignore_regex=None, normalize=False + self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False ): - super().__init__(["greedy_texts"], "sequence") + if sample: + super().__init__(["first_sample_texts"], "sequence") + else: + super().__init__(["greedy_texts"], "sequence") + self.sample = sample self.target_ignore_regex = ( re.compile(target_ignore_regex) if target_ignore_regex else None ) @@ -33,6 +37,8 @@ def __init__( ) def __str__(self): + if self.sample: + return "SampleAccuracy" return "Accuracy" def _score_single(self, output: str, target: str) -> int: @@ -66,11 +72,14 @@ def __call__( Returns: np.ndarray: list of accuracies: 1 if generated text is equal to ground-truth and 0 otherwise. 
""" - greedy_texts = stats["greedy_texts"] + if self.sample: + gen_texts = stats["first_sample_texts"] + else: + gen_texts = stats["greedy_texts"] result = [] - for hyp, ref in zip(greedy_texts, target_texts): + for hyp, ref in zip(gen_texts, target_texts): ref = self._filter_text(ref, self.target_ignore_regex) hyp = self._filter_text(hyp, self.output_ignore_regex) diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py index 139b558e1..b47a080a3 100644 --- a/src/lm_polygraph/generation_metrics/alignscore.py +++ b/src/lm_polygraph/generation_metrics/alignscore.py @@ -19,8 +19,13 @@ def __init__( batch_size=16, target_is_claims=True, ignore_target=False, + sample: bool = False, ): - super().__init__(["greedy_texts", "input_texts"], "sequence") + if sample: + super().__init__(["first_sample_texts", "input_texts"], "sequence") + else: + super().__init__(["greedy_texts", "input_texts"], "sequence") + self.sample = sample device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.target_is_claims = target_is_claims self.batch_size = batch_size @@ -41,6 +46,10 @@ def __str__(self): base += "OutputTarget" else: base += "TargetOutput" + + if self.sample: + return f"Sample{base}" + return base def __call__( @@ -59,11 +68,15 @@ def __call__( Returns: np.ndarray: list of AlignScore Scores for each sample in input. 
""" - greedy_texts = stats["greedy_texts"] + if self.sample: + gen_texts = stats["first_sample_texts"] + else: + gen_texts = stats["greedy_texts"] + input_texts = stats["input_texts"] filtered_targets = [x if len(x.strip()) else "(empty)" for x in target_texts] - filtered_outputs = [x if len(x.strip()) else "(empty)" for x in greedy_texts] + filtered_outputs = [x if len(x.strip()) else "(empty)" for x in gen_texts] filtered_inputs = [x if len(x.strip()) else "(empty)" for x in input_texts] if self.ignore_target: diff --git a/src/lm_polygraph/generation_metrics/bleu.py b/src/lm_polygraph/generation_metrics/bleu.py index 641047495..dd9b19ae7 100644 --- a/src/lm_polygraph/generation_metrics/bleu.py +++ b/src/lm_polygraph/generation_metrics/bleu.py @@ -10,11 +10,17 @@ class BLEUMetric(GenerationMetric): Calculates BLEU metric between model-generated texts and ground truth texts. """ - def __init__(self): - super().__init__(["greedy_texts"], "sequence") + def __init__(self, sample: bool = False): + if sample: + super().__init__(["first_sample_texts"], "sequence") + else: + super().__init__(["greedy_texts"], "sequence") + self.sample = sample self.scorer = BLEU(effective_order=True, lowercase=True) def __str__(self): + if self.sample: + return "SampleBLEU" return "BLEU" def _score_single(self, t1: str, t2: str): @@ -37,9 +43,14 @@ def __call__( Returns: np.ndarray: list of BLEU Scores for each sample in input. 
""" + if self.sample: + gen_texts = stats["first_sample_texts"] + else: + gen_texts = stats["greedy_texts"] + return np.array( [ self._score_single(hyp, ref) - for hyp, ref in zip(stats["greedy_texts"], target_texts) + for hyp, ref in zip(gen_texts, target_texts) ] ) diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py index 0fcd9b3e2..4d02b37a6 100644 --- a/src/lm_polygraph/generation_metrics/comet.py +++ b/src/lm_polygraph/generation_metrics/comet.py @@ -12,14 +12,20 @@ class Comet(GenerationMetric): between model-generated texts and ground truth texts. """ - def __init__(self, source_ignore_regex=None, lang="en"): - super().__init__(["greedy_texts", "input_texts"], "sequence") + def __init__(self, source_ignore_regex=None, lang="en", sample: bool = False): + if sample: + super().__init__(["first_sample_texts", "input_texts"], "sequence") + else: + super().__init__(["greedy_texts", "input_texts"], "sequence") + self.sample = sample self.scorer = load("comet") self.source_ignore_regex = ( re.compile(source_ignore_regex) if source_ignore_regex else None ) def __str__(self): + if self.sample: + return "SampleComet" return "Comet" def _filter_text(self, text: str, ignore_regex: re.Pattern) -> str: @@ -54,9 +60,15 @@ def __call__( self._filter_text(src, self.source_ignore_regex) for src in stats["input_texts"] ] + + if self.sample: + gen_texts = stats["first_sample_texts"] + else: + gen_texts = stats["greedy_texts"] + scores = np.array( self.scorer.compute( - predictions=stats["greedy_texts"], + predictions=gen_texts, references=target_texts, sources=sources, )["scores"] diff --git a/src/lm_polygraph/generation_metrics/rouge.py b/src/lm_polygraph/generation_metrics/rouge.py index e4f96a18d..86ac231e3 100644 --- a/src/lm_polygraph/generation_metrics/rouge.py +++ b/src/lm_polygraph/generation_metrics/rouge.py @@ -15,7 +15,7 @@ class RougeMetric(GenerationMetric): Calculates Rouge metric between model-generated texts 
and ground truth texts. """ - def __init__(self, rouge_name): + def __init__(self, rouge_name, sample: bool = False): """ Parameters: rouge_name (str): rouge metric type. Possible values: @@ -23,11 +23,17 @@ def __init__(self, rouge_name): * rouge2 * rougeL """ - super().__init__(["greedy_texts"], "sequence") + if sample: + super().__init__(["first_sample_texts"], "sequence") + else: + super().__init__(["greedy_texts"], "sequence") + self.sample = sample self.rouge_name = rouge_name self.scorer = rouge_scorer.RougeScorer([rouge_name], use_stemmer=True) def __str__(self): + if self.sample: + return f"SampleRouge_{self.rouge_name}" return f"Rouge_{self.rouge_name}" def _score_single(self, t1: str, t2: str): @@ -52,9 +58,14 @@ def __call__( Returns: np.ndarray: list of Rouge Scores for each sample in input. """ + if self.sample: + gen_texts = stats["first_sample_texts"] + else: + gen_texts = stats["greedy_texts"] + return np.array( [ self._score_single(hyp, ref) - for hyp, ref in zip(stats["greedy_texts"], target_texts) + for hyp, ref in zip(gen_texts, target_texts) ] ) diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index 1a2ea5721..29844b507 100644 --- a/src/lm_polygraph/stat_calculators/__init__.py +++ b/src/lm_polygraph/stat_calculators/__init__.py @@ -10,7 +10,7 @@ ) from .entropy import EntropyCalculator from .entropy import SampleEntropyCalculator -from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator +from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator, FirstSampleCalculator from .sample_alternatives_nli import SampleAlternativesNLICalculator from .greedy_alternatives_nli import ( GreedyAlternativesNLICalculator, diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index 2c74f7a1d..d00f4f6f5 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ 
b/src/lm_polygraph/stat_calculators/sample.py @@ -206,3 +206,29 @@ def __call__( "sample_tokens_distributions": token_distributions, "sample_tokens_alternatives": alternatives, } + +class FirstSampleCalculator(StatCalculator): + def __init__(self): + super().__init__( + [ + "first_sample_texts", + ], + [ + "sample_texts", + ] + ) + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + sample_texts = dependencies["sample_texts"] + first_sample_texts = [st[0] for st in sample_texts] + + return { + "first_sample_texts": first_sample_texts, + } + diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index 2c7ccd8f8..7c82caf80 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -67,6 +67,7 @@ def _register(calculator_class: StatCalculator): _register(SampleEntropyCalculator(top_k=entropy_top_k)) _register(GreedyLMProbsCalculator()) _register(SamplingGenerationCalculator(n_alternatives=n_ccp_alternatives)) + _register(FirstSampleCalculator()) _register(BartScoreCalculator()) _register(ModelScoreCalculator()) _register(EmbeddingsCalculator()) From 31b38e24caa676af55f941c7f5bb8be1902929af Mon Sep 17 00:00:00 2001 From: silvimica Date: Wed, 11 Dec 2024 16:19:36 +0400 Subject: [PATCH 45/97] MaxSampledMaximumSequenceProbability --- src/lm_polygraph/estimators/__init__.py | 1 + .../estimators/max_probability.py | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 5f75d7c74..b903b3cc3 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -8,6 +8,7 @@ from .max_probability import ( MaximumSequenceProbability, MaximumTokenProbability, + MaxSampledMaximumSequenceProbability, ) from 
.claim_conditioned_probability import ClaimConditionedProbability from .token_entropy import MeanTokenEntropy, TokenEntropy diff --git a/src/lm_polygraph/estimators/max_probability.py b/src/lm_polygraph/estimators/max_probability.py index b8fe2afde..cbfc8ed32 100644 --- a/src/lm_polygraph/estimators/max_probability.py +++ b/src/lm_polygraph/estimators/max_probability.py @@ -61,6 +61,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: mp = [lp[0] for lp in stats["sample_log_probs"]] return -np.array(mp) + + +class MaxSampledMaximumSequenceProbability(Estimator): + """ + Estimates the sequence-level uncertainty of a language model by calculating the + log-probability of the generation with minus sign. + It is calculated as the sum of log-probabilities in each token. + Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel). + """ + + def __init__(self): + super().__init__(["sample_log_probs"], "sequence") + + def __str__(self): + return "MaxSampledMaximumSequenceProbability" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the minus log-probability of each sample in input statistics. 
+ + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * log p(y_i | y_ Date: Wed, 11 Dec 2024 16:35:20 +0400 Subject: [PATCH 46/97] Add average sample ue baseline, base manager params to all sentsar configs --- .../configs/polygraph_eval_coqa_sentsar.yaml | 29 ++++++++++++++++++- .../polygraph_eval_gsm8k_sentsar_cot.yaml | 28 ++++++++++++++++++ .../configs/polygraph_eval_mmlu_sentsar.yaml | 28 ++++++++++++++++++ .../polygraph_eval_triviaqa_sentsar.yaml | 28 ++++++++++++++++++ .../polygraph_eval_wmt14_enfr_sentsar.yaml | 29 ++++++++++++++++++- .../polygraph_eval_wmt14_fren_sentsar.yaml | 28 ++++++++++++++++++ .../polygraph_eval_wmt19_deen_sentsar.yaml | 28 ++++++++++++++++++ .../polygraph_eval_wmt19_ende_sentsar.yaml | 28 ++++++++++++++++++ .../configs/polygraph_eval_xsum_sentsar.yaml | 2 +- 9 files changed, 225 insertions(+), 3 deletions(-) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 9e75ed4af..7af151dc6 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}' task: qa +base_manager: null +overwrite_base_estimations: false + dataset: coqa text_column: questions label_column: answers @@ -74,6 +77,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -84,17 +90,22 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - module: lm_polygraph.estimators.gsu class_name: 
TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -105,6 +116,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -112,6 +126,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index b4eea7dcd..d13ccfc0f 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}' task: qa +base_manager: null +overwrite_base_estimations: false + dataset: [gsm8k, main] text_column: question label_column: answer @@ -78,6 +81,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -88,6 +94,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: 
lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ -98,6 +107,9 @@ additional_estimators: - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -108,6 +120,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -115,6 +130,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 755da5c74..7162070b0 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}' task: qa +base_manager: null +overwrite_base_estimations: false + dataset: [cais/mmlu, all] text_column: question label_column: answer @@ -76,6 +79,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -86,6 +92,9 @@ 
additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ -96,6 +105,9 @@ additional_estimators: - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -106,6 +118,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -113,6 +128,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 1b40fb1b9..f9fa19928 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -11,6 +11,9 @@ save_path: '${hydra:run.dir}' task: qa +base_manager: null +overwrite_base_estimations: false + dataset: [trivia_qa, rc.nocontext] text_column: question label_column: answer @@ -77,6 +80,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: 
SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -87,6 +93,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ -97,6 +106,9 @@ additional_estimators: - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -107,6 +119,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -114,6 +129,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml index b3505e7cc..38f283f9b 100644 --- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml @@ -13,6 +13,9 @@ device: cpu task: nmt +base_manager: null +overwrite_base_estimations: false + dataset: [wmt14, fr-en] text_column: en label_column: fr @@ -76,6 +79,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: 
MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -86,17 +92,22 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} - module: lm_polygraph.estimators.gsu class_name: TokenSARGSU kwargs: {} - - module: lm_polygraph.estimators.gsu - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -107,6 +118,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -114,6 +128,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index b471f29f8..1dbed406e 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -13,6 +13,9 @@ device: cpu task: nmt +base_manager: null +overwrite_base_estimations: 
false + dataset: [wmt14, fr-en] text_column: fr label_column: en @@ -76,6 +79,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -86,6 +92,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ -96,6 +105,9 @@ additional_estimators: - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -106,6 +118,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -113,6 +128,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index c9658242b..b0e766163 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ 
b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -13,6 +13,9 @@ device: cpu task: nmt +base_manager: null +overwrite_base_estimations: false + dataset: [wmt19, de-en] text_column: de label_column: en @@ -75,6 +78,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -85,6 +91,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ -95,6 +104,9 @@ additional_estimators: - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -105,6 +117,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -112,6 +127,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml index 
afd8f28f3..b52328b10 100644 --- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml @@ -13,6 +13,9 @@ device: cpu task: nmt +base_manager: null +overwrite_base_estimations: false + dataset: [wmt19, de-en] text_column: en label_column: de @@ -75,6 +78,9 @@ additional_estimators: - module: lm_polygraph.estimators.max_probability class_name: MaximumSequenceProbability kwargs: {} + - module: lm_polygraph.estimators.max_probability + class_name: SampledMaximumSequenceProbability + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: SentenceSAR kwargs: {} @@ -85,6 +91,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_sar class_name: TokenSAR kwargs: {} + - module: lm_polygraph.estimators.token_sar + class_name: SampledTokenSAR + kwargs: {} - module: lm_polygraph.estimators.sar class_name: SAR kwargs: {} @@ -95,6 +104,9 @@ additional_estimators: - module: lm_polygraph.estimators.perplexity class_name: Perplexity kwargs: {} + - module: lm_polygraph.estimators.perplexity + class_name: SampledPerplexity + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: PPLSAR kwargs: {} @@ -105,6 +117,9 @@ additional_estimators: - module: lm_polygraph.estimators.token_entropy class_name: MeanTokenEntropy kwargs: {} + - module: lm_polygraph.estimators.token_entropy + class_name: SampledMeanTokenEntropy + kwargs: {} - module: lm_polygraph.estimators.sentence_sar class_name: MTESAR kwargs: {} @@ -112,6 +127,19 @@ additional_estimators: class_name: MTEGSU kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMaxprob + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AvePPL + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveTokenSAR + kwargs: {} + - module: lm_polygraph.estimators.average_ue + class_name: AveMTE + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git 
a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index 4c4ea0daf..a04c9c672 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -13,7 +13,7 @@ device: cpu task: ats -base_manager: /Users/romanvashurin/workspace/sar_enhancements/gsu/mistral7b_xsum.man +base_manager: null overwrite_base_estimations: false dataset: xsum From b9646f0e1fe49ae9496b97c37b0dc88a6b83ce0f Mon Sep 17 00:00:00 2001 From: silvimica Date: Wed, 11 Dec 2024 17:24:42 +0400 Subject: [PATCH 47/97] add MaxSampledPerplexity --- src/lm_polygraph/estimators/__init__.py | 5 ++++- src/lm_polygraph/estimators/perplexity.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index b903b3cc3..c51f08e0a 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -31,7 +31,10 @@ from .num_sem_sets import NumSemSets from .semantic_entropy import SemanticEntropy from .semantic_entropy_token import SemanticEntropyToken -from .perplexity import Perplexity +from .perplexity import ( + Perplexity, + MaxSampledPerplexity, +) from .mahalanobis_distance import MahalanobisDistanceSeq from .relative_mahalanobis_distance import RelativeMahalanobisDistanceSeq from .rde import RDESeq diff --git a/src/lm_polygraph/estimators/perplexity.py b/src/lm_polygraph/estimators/perplexity.py index d8c8e22b7..44fd9350f 100644 --- a/src/lm_polygraph/estimators/perplexity.py +++ b/src/lm_polygraph/estimators/perplexity.py @@ -27,3 +27,23 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: log_likelihoods = stats["sample_log_likelihoods"] ppl = [np.mean(sample_log_likelihoods[0]) for sample_log_likelihoods in log_likelihoods] return -np.array(ppl) + +class MaxSampledPerplexity(Estimator): + def init(self): + 
super().init(["sample_log_likelihoods"], "sequence") + + def str(self): + return "MaxSampledPerplexity" + + def call(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + log_likelihoods = stats["sample_log_likelihoods"] + + ppl_per_sample = [ + [-np.mean(sequence) for sequence in sample_log_likelihoods] + for sample_log_likelihoods in log_likelihoods + ] + + # Find the maximum perplexity for each set of samples + max_ppl = [max(ppl_sample) for ppl_sample in ppl_per_sample] + + return -np.array(max_ppl) \ No newline at end of file From 6faa7763c6e56812468d9d4b8aea168bf2db1bd1 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 11 Dec 2024 18:17:55 +0400 Subject: [PATCH 48/97] Use common scorers for alignscores and comets --- scripts/polygraph_eval | 30 ++++++++++++++----- .../generation_metrics/alignscore.py | 13 ++------ src/lm_polygraph/generation_metrics/comet.py | 5 ++-- src/lm_polygraph/utils/manager.py | 7 +++-- 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index ba60b5577..72c01c031 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -13,10 +13,12 @@ import logging log = logging.getLogger('lm_polygraph') +from evaluate import load from lm_polygraph.utils.manager import UEManager from lm_polygraph.utils.dataset import Dataset from lm_polygraph.utils.model import WhiteboxModel, BlackboxModel, create_ensemble from lm_polygraph.utils.processor import Logger +from lm_polygraph.generation_metrics.alignscore_utils import AlignScorer from lm_polygraph.generation_metrics import * from lm_polygraph.estimators import * from lm_polygraph.utils.openai_chat import OpenAIChat @@ -441,6 +443,17 @@ def get_generation_metrics(args): generation_metrics = getattr(args, "generation_metrics", None) if not generation_metrics: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + batch_size = 16 + 
ckpt_path="https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt" + align_scorer = AlignScorer( + model="roberta-large", + batch_size=batch_size, + device=device, + ckpt_path=ckpt_path, + evaluation_mode="nli_sp", + ) + result = [ RougeMetric("rouge1"), RougeMetric("rouge2"), @@ -451,9 +464,9 @@ def get_generation_metrics(args): output_ignore_regex = getattr(args, "output_ignore_regex", None), normalize = getattr(args, "normalize", False), ), - AlignScore(), - AlignScore(target_is_claims=False), - AlignScore(ignore_target=True), + AlignScore(align_scorer), + AlignScore(align_scorer, target_is_claims=False), + AlignScore(align_scorer, ignore_target=True), RougeMetric("rouge1", sample=True), RougeMetric("rouge2", sample=True), RougeMetric("rougeL", sample=True), @@ -464,17 +477,18 @@ def get_generation_metrics(args): normalize = getattr(args, "normalize", False), sample=True, ), - AlignScore(sample=True), - AlignScore(target_is_claims=False, sample=True), - AlignScore(ignore_target=True, sample=True), + AlignScore(align_scorer, sample=True), + AlignScore(align_scorer, target_is_claims=False, sample=True), + AlignScore(align_scorer, ignore_target=True, sample=True), ] if getattr(args.model, "type", "Whitebox") != "Blackbox": if getattr(args, "use_claim_ue", False): result += [OpenAIFactCheck(cache_path=args.cache_path, language=getattr(args, "language", "en"))] if args.task == "nmt": ignore_regex = getattr(args, "source_ignore_regex", None) - result += [Comet(source_ignore_regex = ignore_regex), - Comet(source_ignore_regex = ignore_regex, sample=True)] + comet_scorer = load("comet") + result += [Comet(comet_scorer, source_ignore_regex = ignore_regex), + Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True)] else: result = [] for metric in generation_metrics: diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py index b47a080a3..57c6d454b 100644 --- 
a/src/lm_polygraph/generation_metrics/alignscore.py +++ b/src/lm_polygraph/generation_metrics/alignscore.py @@ -14,9 +14,8 @@ class AlignScore(GenerationMetric): def __init__( self, + scorer, lang="en", - ckpt_path="https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt", - batch_size=16, target_is_claims=True, ignore_target=False, sample: bool = False, @@ -26,17 +25,9 @@ def __init__( else: super().__init__(["greedy_texts", "input_texts"], "sequence") self.sample = sample - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.target_is_claims = target_is_claims - self.batch_size = batch_size self.ignore_target = ignore_target - self.scorer = AlignScorer( - model="roberta-large", - batch_size=batch_size, - device=device, - ckpt_path=ckpt_path, - evaluation_mode="nli_sp", - ) + self.scorer = scorer def __str__(self): base = "AlignScore" diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py index 4d02b37a6..35c0f9ab4 100644 --- a/src/lm_polygraph/generation_metrics/comet.py +++ b/src/lm_polygraph/generation_metrics/comet.py @@ -1,6 +1,5 @@ import re import numpy as np -from evaluate import load from typing import List, Dict from .generation_metric import GenerationMetric @@ -12,16 +11,16 @@ class Comet(GenerationMetric): between model-generated texts and ground truth texts. 
""" - def __init__(self, source_ignore_regex=None, lang="en", sample: bool = False): + def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = False): if sample: super().__init__(["first_sample_texts", "input_texts"], "sequence") else: super().__init__(["greedy_texts", "input_texts"], "sequence") self.sample = sample - self.scorer = load("comet") self.source_ignore_regex = ( re.compile(source_ignore_regex) if source_ignore_regex else None ) + self.scorer = scorer def __str__(self): if self.sample: diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 545ed1421..c2e7cd98f 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -408,8 +408,11 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts): for key, val in self.stats.items(): # Get corresponding batch from existing stats - val_batch = val[batch_i * self.batch_size : (batch_i + 1) * self.batch_size] - batch_stats[key] = val_batch + batch_start = batch_i * self.batch_size + batch_end = (batch_i + 1) * self.batch_size + if len(val) >= batch_end: + val_batch = val[batch_start:batch_end] + batch_stats[key] = val_batch for key, val in [ ("input_texts", inp_texts), From 52166f5396cb73cdefd2f1150118d613fd9c6f33 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 11 Dec 2024 18:35:07 +0400 Subject: [PATCH 49/97] Do not recalculate dependencies --- src/lm_polygraph/utils/manager.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index c2e7cd98f..f56f68fff 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -27,11 +27,12 @@ def _order_calculators( stats: List[str], + existing_stats: Set[str], stat_calculators: Dict[str, StatCalculator], stat_dependencies: Dict[str, List[str]], ) -> Tuple[List[str], Set[str]]: ordered: List[str] = [] - have_stats: Set[str] = set() + 
have_stats: Set[str] = set(existing_stats) while len(stats) > 0: stat = stats[0] if stat in have_stats: @@ -339,10 +340,12 @@ def prepare_calculators(self): ) # Only calculate stats that are not already calculated - stats = list(set(stats) - set(self.stats)) + existing_stats = set(self.stats.keys()) + stats = list(set(stats) - existing_stats) stats, have_stats = _order_calculators( stats, + existing_stats, stat_calculators_dict, stat_dependencies_dict, ) @@ -374,10 +377,11 @@ def prepare_calculators(self): else [] ) - train_stats = list(set(train_stats) - set(self.stats)) + train_stats = list(set(train_stats) - existing_stats) train_stats, _ = _order_calculators( train_stats, + existing_stats, stat_calculators_dict, stat_dependencies_dict, ) @@ -392,10 +396,11 @@ def prepare_calculators(self): if s.startswith("background_train") ] - background_train_stats = list(set(background_train_stats) - set(self.stats)) + background_train_stats = list(set(background_train_stats) - existing_stats) background_train_stats, _ = _order_calculators( background_train_stats, + existing_stats, stat_calculators_dict, stat_dependencies_dict, ) From 58c7f6d4cb956267bf9b1fab7444141a8204829b Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 11 Dec 2024 21:22:25 +0400 Subject: [PATCH 50/97] Consider sampling-based evaluation in gen metric wrappers --- .../generation_metrics/aggregated_metric.py | 11 +++++++++-- .../generation_metrics/preprocess_output_target.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/lm_polygraph/generation_metrics/aggregated_metric.py b/src/lm_polygraph/generation_metrics/aggregated_metric.py index bd20e9d93..17a05cc6f 100644 --- a/src/lm_polygraph/generation_metrics/aggregated_metric.py +++ b/src/lm_polygraph/generation_metrics/aggregated_metric.py @@ -11,6 +11,7 @@ class AggregatedMetric(GenerationMetric): def __init__(self, base_metric: GenerationMetric, aggregation: str = "max"): self.base_metric = base_metric + 
self.sample = base_metric.sample self.level = base_metric.level self.stats_dependencies = base_metric.stats_dependencies self.aggregation = aggregation @@ -34,8 +35,14 @@ def __call__( np.ndarray: list of aggregated metric values for each sample in input. """ metric_values = [] - for i, (targets, greedy_text) in enumerate( - zip(target_texts, stats["greedy_texts"]) + + if self.sample: + gen_texts = stats["first_sample_texts"] + else: + gen_texts = stats["greedy_texts"] + + for i, (targets, gen_text) in enumerate( + zip(target_texts, gen_texts) ): # truncate stats to only process one sample at a time truncated_stats = { diff --git a/src/lm_polygraph/generation_metrics/preprocess_output_target.py b/src/lm_polygraph/generation_metrics/preprocess_output_target.py index 8d3d56671..0e77415aa 100644 --- a/src/lm_polygraph/generation_metrics/preprocess_output_target.py +++ b/src/lm_polygraph/generation_metrics/preprocess_output_target.py @@ -12,6 +12,7 @@ class PreprocessOutputTarget(GenerationMetric): def __init__(self, base_metric, process_output_fn, process_target_fn): self.base_metric = getattr(base_metric, "base_metric", base_metric) + self.sample = base_metric.sample self.level = base_metric.level self.stats_dependencies = base_metric.stats_dependencies self.process_output_fn = process_output_fn @@ -44,8 +45,13 @@ def __call__( stats_copy = {k: v for k, v in stats.items() if k in self.stats_dependencies} stats_copy = deepcopy(stats_copy) - stats_copy["greedy_texts"] = [ - self.process_output_fn(output) for output in stats_copy["greedy_texts"] - ] + if self.sample: + stats_copy["first_sample_texts"] = [ + self.process_output_fn(output) for output in stats_copy["first_sample_texts"] + ] + else: + stats_copy["greedy_texts"] = [ + self.process_output_fn(output) for output in stats_copy["greedy_texts"] + ] return self.base_metric(stats_copy, processed_target_texts) From 518de01d130b80b0205dc79e1548453e707057c6 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 13 
Dec 2024 12:01:03 +0400 Subject: [PATCH 51/97] Correctly handle the case when last batch is not whole --- src/lm_polygraph/utils/manager.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index f56f68fff..45cf7c6a2 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -410,11 +410,12 @@ def prepare_calculators(self): def initiate_batch_stats(self, batch_i, inp_texts, target_texts): batch_stats: Dict[str, np.ndarray] = {} - + cur_batch_size = len(inp_texts) + for key, val in self.stats.items(): # Get corresponding batch from existing stats - batch_start = batch_i * self.batch_size - batch_end = (batch_i + 1) * self.batch_size + batch_start = batch_i * cur_batch_size + batch_end = (batch_i + 1) * cur_batch_size if len(val) >= batch_end: val_batch = val[batch_start:batch_end] batch_stats[key] = val_batch @@ -422,10 +423,14 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts): for key, val in [ ("input_texts", inp_texts), ("target_texts", target_texts), - ]: + ]: if key not in batch_stats: self.stats[key] += val batch_stats[key] = val + else: + # Check that new stats will be calculated + # against the same input texts and targets + assert np.all(np.array(batch_stats[key]) == np.array(val)) batch_stats["model"] = self.model From c5795918b761efe68b06f42a164ab263cc784f17 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 13 Dec 2024 13:38:21 +0400 Subject: [PATCH 52/97] Use common batch size for full batches --- src/lm_polygraph/utils/manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 45cf7c6a2..ea27fed32 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -414,8 +414,12 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts): for key, val in self.stats.items(): 
# Get corresponding batch from existing stats - batch_start = batch_i * cur_batch_size + batch_start = batch_i * self.batch_size + # If last batch is not full, we need to adjust the end index batch_end = (batch_i + 1) * cur_batch_size + # This will only be true if the calculation is based off + # existing manager. Otherwise, all stats will contain only + # values calculated in previous batches if len(val) >= batch_end: val_batch = val[batch_start:batch_end] batch_stats[key] = val_batch From b67933cf92a5a9e96001e28cf0ce8bb8becf2910 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 13 Dec 2024 14:25:22 +0400 Subject: [PATCH 53/97] Fix MTESAR --- src/lm_polygraph/estimators/sentence_sar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_polygraph/estimators/sentence_sar.py b/src/lm_polygraph/estimators/sentence_sar.py index c2ff21395..e8f5278b9 100644 --- a/src/lm_polygraph/estimators/sentence_sar.py +++ b/src/lm_polygraph/estimators/sentence_sar.py @@ -338,7 +338,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Compute sentence relevance sent_relevance = R_s.sum(-1) / self.t # Compute SentenceSAR (Uncertainty Estimation) using PPL - E_s = -np.log(sent_relevance + entropy) + E_s = np.log(sent_relevance + entropy) sentenceSAR.append(E_s.mean()) return np.array(sentenceSAR) From 19535ee5fc515291627118424c668dc80e18e7cc Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 13 Dec 2024 16:04:48 +0400 Subject: [PATCH 54/97] Import average baselines, correct batch initiation --- src/lm_polygraph/estimators/__init__.py | 2 +- src/lm_polygraph/utils/manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index c51f08e0a..9eb981925 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -83,4 +83,4 @@ from .linguistic_1s import Linguistic1S from .label_prob import 
LabelProb from .p_true_empirical import PTrueEmpirical -from .average_ue import AveMaxprob +from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index ea27fed32..e4494c9ab 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -416,7 +416,7 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts): # Get corresponding batch from existing stats batch_start = batch_i * self.batch_size # If last batch is not full, we need to adjust the end index - batch_end = (batch_i + 1) * cur_batch_size + batch_end = batch_start + cur_batch_size # This will only be true if the calculation is based off # existing manager. Otherwise, all stats will contain only # values calculated in previous batches From 1b67bbd8188d5b910e29a781e90a5b4f0e882579 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 13 Dec 2024 17:13:27 +0400 Subject: [PATCH 55/97] Only check input stats for consistency --- src/lm_polygraph/utils/manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index e4494c9ab..7d9a32f24 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -431,9 +431,9 @@ def initiate_batch_stats(self, batch_i, inp_texts, target_texts): if key not in batch_stats: self.stats[key] += val batch_stats[key] = val - else: + elif key == "input_texts": # Check that new stats will be calculated - # against the same input texts and targets + # against the same input texts assert np.all(np.array(batch_stats[key]) == np.array(val)) batch_stats["model"] = self.model From f034cbd2fafb82e92fed7c983fcec68269cbbf80 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 17 Dec 2024 13:12:45 +0400 Subject: [PATCH 56/97] Lighten the prr calculation, use logs as base for GSU and other new methods --- scripts/polygraph_eval | 4 - 
src/lm_polygraph/estimators/__init__.py | 3 +- src/lm_polygraph/estimators/average_ue.py | 8 +- src/lm_polygraph/estimators/gsu.py | 77 +--------- .../estimators/semantic_average_ue.py | 131 ++++++++++++++++++ src/lm_polygraph/utils/manager.py | 15 +- 6 files changed, 150 insertions(+), 88 deletions(-) create mode 100644 src/lm_polygraph/estimators/semantic_average_ue.py diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 72c01c031..d6a373497 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -455,8 +455,6 @@ def get_generation_metrics(args): ) result = [ - RougeMetric("rouge1"), - RougeMetric("rouge2"), RougeMetric("rougeL"), BLEUMetric(), AccuracyMetric( @@ -467,8 +465,6 @@ def get_generation_metrics(args): AlignScore(align_scorer), AlignScore(align_scorer, target_is_claims=False), AlignScore(align_scorer, ignore_target=True), - RougeMetric("rouge1", sample=True), - RougeMetric("rouge2", sample=True), RougeMetric("rougeL", sample=True), BLEUMetric(sample=True), AccuracyMetric( diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 9eb981925..06956ecc6 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -75,7 +75,7 @@ #DistilOneSentenceSAR, ) from .sar import SAR -from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU, CCPGSU +from .gsu import MaxprobGSU, PPLGSU, MTEGSU, TokenSARGSU from .renyi_neg import RenyiNeg from .fisher_rao import FisherRao from .verbalized_1s import Verbalized1S @@ -84,3 +84,4 @@ from .label_prob import LabelProb from .p_true_empirical import PTrueEmpirical from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE +from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE diff --git a/src/lm_polygraph/estimators/average_ue.py b/src/lm_polygraph/estimators/average_ue.py index b03147a1f..a7748e9e6 100644 --- a/src/lm_polygraph/estimators/average_ue.py +++ 
b/src/lm_polygraph/estimators/average_ue.py @@ -25,7 +25,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_probs, sample_sentence_similarity in zip( batch_sample_log_probs, batch_sample_sentence_similarity ): - sample_probs = -np.exp(np.array(sample_log_probs)) + sample_probs = -np.array(sample_log_probs) ave.append(sample_probs.mean()) @@ -50,7 +50,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_likelihoods, sample_sentence_similarity in zip( batch_sample_log_likelihoods, batch_sample_sentence_similarity ): - ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) ave.append(ppl.mean()) @@ -99,9 +99,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) - tokenSAR = np.array(tokenSAR) - probs_token_sar = -np.exp(-tokenSAR) - ave.append(probs_token_sar.mean()) + ave.append(np.mean(tokenSAR)) return np.array(ave) diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py index 8aae841d1..4d58b8fee 100644 --- a/src/lm_polygraph/estimators/gsu.py +++ b/src/lm_polygraph/estimators/gsu.py @@ -37,7 +37,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_probs, sample_sentence_similarity in zip( batch_sample_log_probs, batch_sample_sentence_similarity ): - sample_probs = -np.exp(np.array(sample_log_probs)) + sample_probs = -np.array(sample_log_probs) R_s = ( sample_probs * sample_sentence_similarity @@ -79,7 +79,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_likelihoods, sample_sentence_similarity in zip( batch_sample_log_likelihoods, batch_sample_sentence_similarity ): - ppl = -np.exp([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) R_s = ( ppl @@ 
-146,10 +146,8 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) - tokenSAR = np.array(tokenSAR) - probs_token_sar = -np.exp(-tokenSAR) R_s = ( - probs_token_sar + tokenSAR * sample_sentence_similarity ) E_s = R_s.sum(-1) @@ -193,77 +191,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): # Use MTE for sentence relevance calculation R_s = sample_entropy * sample_sentence_similarity - + # Compute sentence relevance by summing along the last axis E_s = R_s.sum(-1) GSU.append(E_s.mean()) return np.array(GSU) - - -class CCPGSU(Estimator): - def __init__( - self, - verbose: bool = False - ): - super().__init__(["sample_sentence_similarity", - "sample_tokens", - "sample_tokens_alternatives", - "sample_tokens_alternatives_nli"], "sequence") - self.verbose = verbose - - def __str__(self): - return "CCPGSU" - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the sentenceSAR for each sample in the input statistics. - - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * corresponding log probabilities in 'sample_log_probs', - * matrix with cross-encoder similarities in 'sample_sentence_similarity' - Returns: - np.ndarray: float sentenceSAR for each sample in input statistics. - Higher values indicate more uncertain samples. 
- """ - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - batch_sample_tokens = stats["sample_tokens"] - batch_sample_tokens_alternatives = stats["sample_tokens_alternatives"] - batch_sample_tokens_alternatives_nli = stats["sample_tokens_alternatives_nli"] - - GSU = [] - for sample_sentence_similarity, \ - samples_tokens, \ - samples_tokens_alternatives, \ - samples_tokens_alternatives_nli in zip( - batch_sample_sentence_similarity, - batch_sample_tokens, - batch_sample_tokens_alternatives, - batch_sample_tokens_alternatives_nli - ): - ccps = [] - for sample_tokens, \ - sample_tokens_alternatives, \ - sample_tokens_alternatives_nli in zip( - samples_tokens, - samples_tokens_alternatives, - samples_tokens_alternatives_nli - ): - ccp_stats = { - "greedy_tokens": [sample_tokens], - "greedy_tokens_alternatives": [sample_tokens_alternatives], - "greedy_tokens_alternatives_nli": [sample_tokens_alternatives_nli] - } - ccps.append(ClaimConditionedProbability()(stats=ccp_stats)[0]) - - R_s = ( - ccps - * sample_sentence_similarity - ) - sent_relevance = R_s.sum(-1) - - GSU.append(E_s.mean()) - - return np.array(GSU) diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py new file mode 100644 index 000000000..962ad91e9 --- /dev/null +++ b/src/lm_polygraph/estimators/semantic_average_ue.py @@ -0,0 +1,131 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator + + +class SemanticAveMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + + def __str__(self): + return "SemanticAveMaxprob" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_log_probs, 
sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + sample_probs = -np.array(sample_log_probs) + weights = sample_sentence_similarity[0, :] + ave.append(np.average(sample_probs, weights=weights)) + + return np.array(ave) + +class SemanticAvePPL(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "SemanticAvePPL" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + weights = sample_sentence_similarity[0, :] + + ave.append(np.average(ppl, weights=weights)) + + return np.array(ave) + +class SemanticAveTokenSAR(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__( + [ + "sample_sentence_similarity", + "sample_log_likelihoods", + "sample_token_similarity", + ], + "sequence", + ) + self.verbose = verbose + + def __str__(self): + return "SemanticAveTokenSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_token_similarity = stats["sample_token_similarity"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for batch_data in zip( + batch_sample_log_likelihoods, + batch_sample_token_similarity, + batch_sample_sentence_similarity, + ): + sample_log_likelihoods = batch_data[0] + sample_token_similarity = batch_data[1] + sample_sentence_similarity = batch_data[2] + + tokenSAR = [] + for log_likelihoods, token_similarity in zip( + 
sample_log_likelihoods, sample_token_similarity + ): + log_likelihoods = np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + weights = sample_sentence_similarity[0, :] + + ave.append(np.average(tokenSAR, weights=weights)) + + return np.array(ave) + +class SemanticAveMTE(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "SemanticAveMTE" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + weights = sample_sentence_similarity[0, :] + ave.append(np.average(sample_entropy, weights=weights)) + + return np.array(ave) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 7d9a32f24..db3c4a0ec 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -516,9 +516,13 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: return self.metrics def eval_ue(self): - for (e_level, e_name), estimator_values in self.estimations.items(): - for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): - for ue_metric in self.ue_metrics: + for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): + generation_metric = np.array(generation_metric) + for ue_metric in self.ue_metrics: + oracle_score = ue_metric(-generation_metric, generation_metric) + random_score = get_random_scores(ue_metric, generation_metric) + + for (e_level, e_name), estimator_values in self.estimations.items(): if gen_level != e_level: continue if len(estimator_values) != len(generation_metric): @@ 
-529,11 +533,12 @@ def eval_ue(self): # TODO: Report how many nans! # This is important to know for a user ue, metric = _delete_nans(estimator_values, generation_metric) + assert len(ue) == len(estimator_values) + assert len(metric) == len(generation_metric) + if len(ue) == 0: self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan else: - oracle_score = ue_metric(-metric, metric) - random_score = get_random_scores(ue_metric, metric) ue_metric_val = ue_metric(ue, metric) self.metrics[e_level, e_name, gen_name, str(ue_metric)] = ( ue_metric_val From 04b3d523fb3904f305e28e8a13c4e734764798bd Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 17 Dec 2024 13:24:55 +0400 Subject: [PATCH 57/97] Save first sample texts separately --- examples/configs/polygraph_eval_coqa_sentsar.yaml | 1 + examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml | 1 + examples/configs/polygraph_eval_mmlu_sentsar.yaml | 1 + examples/configs/polygraph_eval_triviaqa_sentsar.yaml | 1 + examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml | 1 + examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 1 + examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 1 + examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml | 1 + examples/configs/polygraph_eval_xsum_sentsar.yaml | 1 + 9 files changed, 9 insertions(+) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 7af151dc6..f7dcdc754 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -41,6 +41,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index d13ccfc0f..e5fbdd9c4 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ 
b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -42,6 +42,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 target_ignore_regex: "(?s).*#### " diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 7162070b0..20437d081 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -43,6 +43,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index f9fa19928..6cf4fd248 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -43,6 +43,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml index 38f283f9b..89e058463 100644 --- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml @@ -41,6 +41,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 1dbed406e..7b4e95e49 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -41,6 +41,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts 
entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index b0e766163..62fe84c26 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -41,6 +41,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml index b52328b10..cece970ac 100644 --- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml @@ -41,6 +41,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index a04c9c672..56e70c7cf 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -42,6 +42,7 @@ save_stats: - sample_sentence_similarity - sample_token_similarity - sample_entropy + - first_sample_texts entropy_top_k: 50 train_dataset: null From 3f98b5965a4b4bb264deeee8ea3eea658fc4597e Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 25 Dec 2024 16:01:55 +0400 Subject: [PATCH 58/97] Add degmat based on CE, log/exp differentiation for semantic methods and semantic median methods --- src/lm_polygraph/estimators/__init__.py | 3 +- src/lm_polygraph/estimators/deg_mat.py | 45 +++++ src/lm_polygraph/estimators/gsu.py | 42 ++++- .../estimators/semantic_average_ue.py | 30 +++- .../estimators/semantic_median_ue.py | 157 ++++++++++++++++++ 5 files 
changed, 268 insertions(+), 9 deletions(-) create mode 100644 src/lm_polygraph/estimators/semantic_median_ue.py diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 06956ecc6..74ef07953 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -25,7 +25,7 @@ from .monte_carlo_sequence_entropy import MonteCarloSequenceEntropy from .monte_carlo_normalized_sequence_entropy import MonteCarloNormalizedSequenceEntropy from .lexical_similarity import LexicalSimilarity -from .deg_mat import DegMat +from .deg_mat import DegMat, CEDegMat from .eccentricity import Eccentricity from .eig_val_laplacian import EigValLaplacian from .num_sem_sets import NumSemSets @@ -85,3 +85,4 @@ from .p_true_empirical import PTrueEmpirical from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE +from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE diff --git a/src/lm_polygraph/estimators/deg_mat.py b/src/lm_polygraph/estimators/deg_mat.py index 634884c63..f373a09d3 100644 --- a/src/lm_polygraph/estimators/deg_mat.py +++ b/src/lm_polygraph/estimators/deg_mat.py @@ -88,3 +88,48 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: print(f"generated answers: {answers}") res.append(self.U_DegMat(i, stats)) return np.array(res) + + +class CEDegMat(Estimator): + """ + Estimates the sequence-level uncertainty of a language model following the method of + "The Degree Matrix" as provided in the paper https://arxiv.org/abs/2305.19187. + Works with both whitebox and blackbox models (initialized using + lm_polygraph.utils.model.BlackboxModel/WhiteboxModel). + + Elements on diagonal of matrix D are sums of similarities between the particular number + (position in matrix) and other answers. 
Thus, it is an average pairwise distance + (lower values indicated smaller distance between answers which means greater uncertainty). + """ + + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_texts"], "sequence") + self.verbose = verbose + + def __str__(self): + return "CEDegMat" + + def U_DegMat(self, W, answers): + # The Degree Matrix + D = np.diag(W.sum(axis=1)) + return np.trace(len(answers) - D) / (len(answers) ** 2) + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + """ + Estimates the uncertainties for each sample in the input statistics. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: + * generated samples in 'sample_texts', + * matrix with semantic similarities in 'semantic_matrix_entail'/'semantic_matrix_contra' + Returns: + np.ndarray: float uncertainty for each sample in input statistics. + Higher values indicate more uncertain samples. + """ + res = [] + for W, answers in zip(stats["sample_sentence_similarity"], stats["sample_texts"]): + res.append(self.U_DegMat(W, answers)) + return np.array(res) diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py index 4d58b8fee..2969dca95 100644 --- a/src/lm_polygraph/estimators/gsu.py +++ b/src/lm_polygraph/estimators/gsu.py @@ -11,12 +11,17 @@ class MaxprobGSU(Estimator): def __init__( self, verbose: bool = False, + exp: bool = False ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose + self.exp = exp def __str__(self): - return "MaxprobGSU" + if self.exp: + return "MaxprobGSUexp" + else: + return "MaxprobGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -38,12 +43,16 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_probs, batch_sample_sentence_similarity ): sample_probs = -np.array(sample_log_probs) + if self.exp: + sample_probs = 
-np.exp(-sample_probs) R_s = ( sample_probs * sample_sentence_similarity ) E_s = R_s.sum(-1) + E_s = E_s / sample_sentence_similarity.sum(-1) + GSU.append(E_s.mean()) return np.array(GSU) @@ -52,13 +61,18 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: class PPLGSU(Estimator): def __init__( self, - verbose: bool = False + verbose: bool = False, + exp: bool = False ): super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose + self.exp = exp def __str__(self): - return "PPLGSU" + if self.exp: + return "PPLGSUexp" + else: + return "PPLGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -81,12 +95,17 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + if self.exp: + ppl = -np.exp(-ppl) + R_s = ( ppl * sample_sentence_similarity ) E_s = R_s.sum(-1) + E_s = E_s / sample_sentence_similarity.sum(-1) + GSU.append(E_s.mean()) return np.array(GSU) @@ -95,7 +114,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: class TokenSARGSU(Estimator): def __init__( self, - verbose: bool = False): + verbose: bool = False, + exp: bool = False + ): super().__init__( [ "sample_sentence_similarity", @@ -105,9 +126,13 @@ def __init__( "sequence", ) self.verbose = verbose + self.exp = exp def __str__(self): - return "TokenSARGSU" + if self.exp: + return "TokenSARGSUexp" + else: + return "TokenSARGSU" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -146,12 +171,17 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) + if self.exp: + tokenSAR = -np.exp(-np.array(tokenSAR)) + R_s = ( tokenSAR * sample_sentence_similarity ) E_s = R_s.sum(-1) + E_s = E_s / sample_sentence_similarity.sum(-1) + GSU.append(E_s.mean()) return np.array(GSU) @@ -195,6 +225,8 @@ def __call__(self, stats: Dict[str, 
np.ndarray]) -> np.ndarray: # Compute sentence relevance by summing along the last axis E_s = R_s.sum(-1) + E_s = E_s / sample_sentence_similarity.sum(-1) + GSU.append(E_s.mean()) return np.array(GSU) diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py index 962ad91e9..fb361d44d 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue.py +++ b/src/lm_polygraph/estimators/semantic_average_ue.py @@ -10,12 +10,17 @@ class SemanticAveMaxprob(Estimator): def __init__( self, verbose: bool = False, + exp: bool = False ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose + self.exp = exp def __str__(self): - return "SemanticAveMaxprob" + if self.exp: + return "SemanticAveMaxprobexp" + else: + return "SemanticAveMaxprob" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_probs = stats["sample_log_probs"] @@ -26,6 +31,8 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_probs, batch_sample_sentence_similarity ): sample_probs = -np.array(sample_log_probs) + if self.exp: + sample_probs = -np.exp(-sample_probs) weights = sample_sentence_similarity[0, :] ave.append(np.average(sample_probs, weights=weights)) @@ -35,12 +42,17 @@ class SemanticAvePPL(Estimator): def __init__( self, verbose: bool = False, + exp: bool = False ): super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose + self.exp = exp def __str__(self): - return "SemanticAvePPL" + if self.exp: + return "SemanticAvePPLexp" + else: + return "SemanticAvePPL" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] @@ -51,6 +63,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods, batch_sample_sentence_similarity ): ppl = -np.array([np.mean(token_ll) for token_ll in 
sample_log_likelihoods]) + + if self.exp: + ppl = -np.exp(-ppl) + weights = sample_sentence_similarity[0, :] ave.append(np.average(ppl, weights=weights)) @@ -61,6 +77,7 @@ class SemanticAveTokenSAR(Estimator): def __init__( self, verbose: bool = False, + exp: bool = False ): super().__init__( [ @@ -71,9 +88,13 @@ def __init__( "sequence", ) self.verbose = verbose + self.exp = exp def __str__(self): - return "SemanticAveTokenSAR" + if self.exp: + return "SemanticAveTokenSARexp" + else: + return "SemanticAveTokenSAR" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] @@ -99,6 +120,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) + + if self.exp: + tokenSAR = -np.exp(-np.array(tokenSAR)) weights = sample_sentence_similarity[0, :] diff --git a/src/lm_polygraph/estimators/semantic_median_ue.py b/src/lm_polygraph/estimators/semantic_median_ue.py new file mode 100644 index 000000000..11b9beaac --- /dev/null +++ b/src/lm_polygraph/estimators/semantic_median_ue.py @@ -0,0 +1,157 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator + +from wquantiles import median + + +class SemanticMedianMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticMedianMaxprobexp" + else: + return "SemanticMedianMaxprob" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity 
+ ): + sample_probs = -np.array(sample_log_probs) + if self.exp: + sample_probs = -np.exp(-sample_probs) + weights = sample_sentence_similarity[0, :] + ave.append(median(sample_probs, weights)) + + return np.array(ave) + +class SemanticMedianPPL(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticMedianPPLexp" + else: + return "SemanticMedianPPL" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + + if self.exp: + ppl = -np.exp(-ppl) + + weights = sample_sentence_similarity[0, :] + + ave.append(median(ppl, weights)) + + return np.array(ave) + +class SemanticMedianTokenSAR(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__( + [ + "sample_sentence_similarity", + "sample_log_likelihoods", + "sample_token_similarity", + ], + "sequence", + ) + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticMedianTokenSARexp" + else: + return "SemanticMedianTokenSAR" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_token_similarity = stats["sample_token_similarity"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for batch_data in zip( + batch_sample_log_likelihoods, + batch_sample_token_similarity, + batch_sample_sentence_similarity, + ): + 
sample_log_likelihoods = batch_data[0] + sample_token_similarity = batch_data[1] + sample_sentence_similarity = batch_data[2] + + tokenSAR = [] + for log_likelihoods, token_similarity in zip( + sample_log_likelihoods, sample_token_similarity + ): + log_likelihoods = np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + if self.exp: + tokenSAR = -np.exp(-np.array(tokenSAR)) + + weights = sample_sentence_similarity[0, :] + + ave.append(median(np.array(tokenSAR), weights)) + + return np.array(ave) + +class SemanticMedianMTE(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "SemanticMedianMTE" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + ave = [] + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + weights = sample_sentence_similarity[0, :] + ave.append(median(np.array(sample_entropy), weights)) + + return np.array(ave) From 93aecdfbdcb7e16da2425c6f7f297c21ee723c93 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 25 Dec 2024 19:19:06 +0400 Subject: [PATCH 59/97] Add sample-based gen metrics from best samples --- .../generation_metrics/accuracy.py | 22 +++++++++-- .../generation_metrics/alignscore.py | 23 ++++++++++-- src/lm_polygraph/generation_metrics/bleu.py | 22 +++++++++-- src/lm_polygraph/generation_metrics/comet.py | 24 ++++++++++-- src/lm_polygraph/generation_metrics/rouge.py | 23 ++++++++++-- src/lm_polygraph/stat_calculators/__init__.py | 2 +- src/lm_polygraph/stat_calculators/sample.py | 37 +++++++++++++++++++ 7 files changed, 135 insertions(+), 18 deletions(-) diff --git 
a/src/lm_polygraph/generation_metrics/accuracy.py b/src/lm_polygraph/generation_metrics/accuracy.py index a71c1b989..e3e4d13ed 100644 --- a/src/lm_polygraph/generation_metrics/accuracy.py +++ b/src/lm_polygraph/generation_metrics/accuracy.py @@ -19,10 +19,16 @@ def __init__( self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False ): if sample: - super().__init__(["first_sample_texts"], "sequence") + super().__init__([ + "first_sample_texts", + "best_sample_texts", + "best_normalized_sample_texts", + "input_texts"], + "sequence") else: super().__init__(["greedy_texts"], "sequence") self.sample = sample + self.sample_strategy = sample_strategy self.target_ignore_regex = ( re.compile(target_ignore_regex) if target_ignore_regex else None ) @@ -38,7 +44,10 @@ def __init__( def __str__(self): if self.sample: - return "SampleAccuracy" + if self.sample_strategy == "First": + return "SampleAccuracy" + else: + return f"{self.sample_strategy}SampleAccuracy" return "Accuracy" def _score_single(self, output: str, target: str) -> int: @@ -73,7 +82,14 @@ def __call__( np.ndarray: list of accuracies: 1 if generated text is equal to ground-truth and 0 otherwise. 
""" if self.sample: - gen_texts = stats["first_sample_texts"] + if self.sample_strategy == "First": + gen_texts = stats["first_sample_texts"] + elif self.sample_strategy == "Best": + gen_texts = stats["best_sample_texts"] + elif self.sample_strategy == "BestNormalized": + gen_texts = stats["best_normalized_sample_texts"] + else: + raise ValueError(f"Invalid sample strategy: {self.sample_strategy}") else: gen_texts = stats["greedy_texts"] diff --git a/src/lm_polygraph/generation_metrics/alignscore.py b/src/lm_polygraph/generation_metrics/alignscore.py index 57c6d454b..a6bd49504 100644 --- a/src/lm_polygraph/generation_metrics/alignscore.py +++ b/src/lm_polygraph/generation_metrics/alignscore.py @@ -19,12 +19,19 @@ def __init__( target_is_claims=True, ignore_target=False, sample: bool = False, + sample_strategy: str = "First", ): if sample: - super().__init__(["first_sample_texts", "input_texts"], "sequence") + super().__init__([ + "first_sample_texts", + "best_sample_texts", + "best_normalized_sample_texts", + "input_texts"], + "sequence") else: super().__init__(["greedy_texts", "input_texts"], "sequence") self.sample = sample + self.sample_strategy = sample_strategy self.target_is_claims = target_is_claims self.ignore_target = ignore_target self.scorer = scorer @@ -39,7 +46,10 @@ def __str__(self): base += "TargetOutput" if self.sample: - return f"Sample{base}" + if self.sample_strategy == "First": + return f"Sample{base}" + else: + return f"{self.sample_strategy}Sample{base}" return base @@ -60,7 +70,14 @@ def __call__( np.ndarray: list of AlignScore Scores for each sample in input. 
""" if self.sample: - gen_texts = stats["first_sample_texts"] + if self.sample_strategy == "First": + gen_texts = stats["first_sample_texts"] + elif self.sample_strategy == "Best": + gen_texts = stats["best_sample_texts"] + elif self.sample_strategy == "BestNormalized": + gen_texts = stats["best_normalized_sample_texts"] + else: + raise ValueError(f"Invalid sample strategy: {self.sample_strategy}") else: gen_texts = stats["greedy_texts"] diff --git a/src/lm_polygraph/generation_metrics/bleu.py b/src/lm_polygraph/generation_metrics/bleu.py index dd9b19ae7..91b4b5098 100644 --- a/src/lm_polygraph/generation_metrics/bleu.py +++ b/src/lm_polygraph/generation_metrics/bleu.py @@ -12,15 +12,24 @@ class BLEUMetric(GenerationMetric): def __init__(self, sample: bool = False): if sample: - super().__init__(["first_sample_texts"], "sequence") + super().__init__([ + "first_sample_texts", + "best_sample_texts", + "best_normalized_sample_texts", + "input_texts"], + "sequence") else: super().__init__(["greedy_texts"], "sequence") self.sample = sample + self.sample_strategy = sample_strategy self.scorer = BLEU(effective_order=True, lowercase=True) def __str__(self): if self.sample: - return "SampleBLEU" + if self.sample_strategy == "First": + return "SampleBLEU" + else: + return f"{self.sample_strategy}SampleBLEU" return "BLEU" def _score_single(self, t1: str, t2: str): @@ -44,7 +53,14 @@ def __call__( np.ndarray: list of BLEU Scores for each sample in input. 
""" if self.sample: - gen_texts = stats["first_sample_texts"] + if self.sample_strategy == "First": + gen_texts = stats["first_sample_texts"] + elif self.sample_strategy == "Best": + gen_texts = stats["best_sample_texts"] + elif self.sample_strategy == "BestNormalized": + gen_texts = stats["best_normalized_sample_texts"] + else: + raise ValueError(f"Invalid sample strategy: {self.sample_strategy}") else: gen_texts = stats["greedy_texts"] diff --git a/src/lm_polygraph/generation_metrics/comet.py b/src/lm_polygraph/generation_metrics/comet.py index 35c0f9ab4..f91c833a3 100644 --- a/src/lm_polygraph/generation_metrics/comet.py +++ b/src/lm_polygraph/generation_metrics/comet.py @@ -11,12 +11,18 @@ class Comet(GenerationMetric): between model-generated texts and ground truth texts. """ - def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = False): + def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = False, sample_strategy: str = "First"): if sample: - super().__init__(["first_sample_texts", "input_texts"], "sequence") + super().__init__([ + "first_sample_texts", + "best_sample_texts", + "best_normalized_sample_texts", + "input_texts"], + "sequence") else: super().__init__(["greedy_texts", "input_texts"], "sequence") self.sample = sample + self.sample_strategy = sample_strategy self.source_ignore_regex = ( re.compile(source_ignore_regex) if source_ignore_regex else None ) @@ -24,7 +30,10 @@ def __init__(self, scorer, source_ignore_regex=None, lang="en", sample: bool = F def __str__(self): if self.sample: - return "SampleComet" + if self.sample_strategy == "First": + return f"SampleComet" + else: + return f"{self.sample_strategy}SampleComet" return "Comet" def _filter_text(self, text: str, ignore_regex: re.Pattern) -> str: @@ -61,7 +70,14 @@ def __call__( ] if self.sample: - gen_texts = stats["first_sample_texts"] + if self.sample_strategy == "First": + gen_texts = stats["first_sample_texts"] + elif 
self.sample_strategy == "Best": + gen_texts = stats["best_sample_texts"] + elif self.sample_strategy == "BestNormalized": + gen_texts = stats["best_normalized_sample_texts"] + else: + raise ValueError(f"Invalid sample strategy: {self.sample_strategy}") else: gen_texts = stats["greedy_texts"] diff --git a/src/lm_polygraph/generation_metrics/rouge.py b/src/lm_polygraph/generation_metrics/rouge.py index 86ac231e3..cea5201ec 100644 --- a/src/lm_polygraph/generation_metrics/rouge.py +++ b/src/lm_polygraph/generation_metrics/rouge.py @@ -15,7 +15,7 @@ class RougeMetric(GenerationMetric): Calculates Rouge metric between model-generated texts and ground truth texts. """ - def __init__(self, rouge_name, sample: bool = False): + def __init__(self, rouge_name, sample: bool = False, sample_strategy: str = "First"): """ Parameters: rouge_name (str): rouge metric type. Possible values: @@ -24,16 +24,24 @@ def __init__(self, rouge_name, sample: bool = False): * rougeL """ if sample: - super().__init__(["first_sample_texts"], "sequence") + super().__init__([ + "first_sample_texts", + "best_sample_texts", + "best_normalized_sample_texts"], + "sequence") else: super().__init__(["greedy_texts"], "sequence") self.sample = sample + self.sample_strategy = sample_strategy self.rouge_name = rouge_name self.scorer = rouge_scorer.RougeScorer([rouge_name], use_stemmer=True) def __str__(self): if self.sample: - return f"SampleRouge_{self.rouge_name}" + if self.sample_strategy == "First": + return f"SampleRouge_{self.rouge_name}" + else: + return f"{self.sample_strategy}SampleRouge_{self.rouge_name}" return f"Rouge_{self.rouge_name}" def _score_single(self, t1: str, t2: str): @@ -59,7 +67,14 @@ def __call__( np.ndarray: list of Rouge Scores for each sample in input. 
""" if self.sample: - gen_texts = stats["first_sample_texts"] + if self.sample_strategy == "First": + gen_texts = stats["first_sample_texts"] + elif self.sample_strategy == "Best": + gen_texts = stats["best_sample_texts"] + elif self.sample_strategy == "BestNormalized": + gen_texts = stats["best_normalized_sample_texts"] + else: + raise ValueError(f"Invalid sample strategy: {self.sample_strategy}") else: gen_texts = stats["greedy_texts"] diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index 29844b507..1ba3b4057 100644 --- a/src/lm_polygraph/stat_calculators/__init__.py +++ b/src/lm_polygraph/stat_calculators/__init__.py @@ -10,7 +10,7 @@ ) from .entropy import EntropyCalculator from .entropy import SampleEntropyCalculator -from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator, FirstSampleCalculator +from .sample import SamplingGenerationCalculator, BlackboxSamplingGenerationCalculator, FirstSampleCalculator, BestSampleCalculator from .sample_alternatives_nli import SampleAlternativesNLICalculator from .greedy_alternatives_nli import ( GreedyAlternativesNLICalculator, diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index d00f4f6f5..b6d6b1553 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -232,3 +232,40 @@ def __call__( "first_sample_texts": first_sample_texts, } +class BestSampleCalculator(StatCalculator): + def __init__(self): + super().__init__( + [ + "best_sample_texts", + "best_normalized_sample_texts", + ], + [ + "sample_texts", + "sample_log_probs", + "sample_log_likelihoods", + ] + ) + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + best_sample_texts = [] + best_normalized_sample_texts = [] + + for batch_i, (sample_texts, 
sample_log_probs, sample_log_likelihoods) in enumerate(zip(dependencies["sample_texts"], dependencies["sample_log_probs"], dependencies["sample_log_likelihoods"])): + best_i = np.argmax(sample_log_probs) + best_sample_texts.append(sample_texts[best_i]) + + ppls = [np.mean(ll) for ll in sample_log_likelihoods] + best_ppl_i = np.argmax(ppls) + best_normalized_sample_texts.append(sample_texts[best_ppl_i]) + + return { + "best_sample_texts": best_sample_texts, + "best_normalized_sample_texts": best_normalized_sample_texts, + } + From 65772794ce2ad9732bf2d8cae5837061c05a70fa Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 25 Dec 2024 19:19:09 +0400 Subject: [PATCH 60/97] Add sample-based gen metrics from best samples --- scripts/polygraph_eval | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index d6a373497..c21a66afb 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -465,6 +465,7 @@ def get_generation_metrics(args): AlignScore(align_scorer), AlignScore(align_scorer, target_is_claims=False), AlignScore(align_scorer, ignore_target=True), + # Sample-based metrics RougeMetric("rougeL", sample=True), BLEUMetric(sample=True), AccuracyMetric( @@ -476,7 +477,34 @@ def get_generation_metrics(args): AlignScore(align_scorer, sample=True), AlignScore(align_scorer, target_is_claims=False, sample=True), AlignScore(align_scorer, ignore_target=True, sample=True), + # Best sample-based metrics + RougeMetric("rougeL", sample=True, sample_strategy="Best"), + BLEUMetric(sample=True, sample_strategy="Best"), + AccuracyMetric( + target_ignore_regex = getattr(args, "target_ignore_regex", None), + output_ignore_regex = getattr(args, "output_ignore_regex", None), + normalize = getattr(args, "normalize", False), + sample=True, + sample_strategy="Best", + ), + AlignScore(align_scorer, sample=True, sample_strategy="Best"), + AlignScore(align_scorer, target_is_claims=False, sample=True, 
sample_strategy="Best"), + AlignScore(align_scorer, ignore_target=True, sample=True, sample_strategy="Best"), + # Best normalized sample-based metrics + RougeMetric("rougeL", sample=True, sample_strategy="BestNormalized"), + BLEUMetric(sample=True, sample_strategy="BestNormalized"), + AccuracyMetric( + target_ignore_regex = getattr(args, "target_ignore_regex", None), + output_ignore_regex = getattr(args, "output_ignore_regex", None), + normalize = getattr(args, "normalize", False), + sample=True, + sample_strategy="BestNormalized", + ), + AlignScore(align_scorer, sample=True, sample_strategy="BestNormalized"), + AlignScore(align_scorer, target_is_claims=False, sample=True, sample_strategy="BestNormalized"), + AlignScore(align_scorer, ignore_target=True, sample=True, sample_strategy="BestNormalized"), ] + if getattr(args.model, "type", "Whitebox") != "Blackbox": if getattr(args, "use_claim_ue", False): result += [OpenAIFactCheck(cache_path=args.cache_path, language=getattr(args, "language", "en"))] From bc991b068ecd57ecd3a4e9506f0c7c175e052deb Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 25 Dec 2024 19:29:41 +0400 Subject: [PATCH 61/97] Save new stats in manager --- examples/configs/polygraph_eval_coqa_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml | 2 ++ examples/configs/polygraph_eval_mmlu_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_triviaqa_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml | 2 ++ examples/configs/polygraph_eval_xsum_sentsar.yaml | 2 ++ 9 files changed, 18 insertions(+) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index f7dcdc754..828e2327a 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ 
b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -42,6 +42,8 @@ save_stats: - sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index e5fbdd9c4..ea91a213f 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -43,6 +43,8 @@ save_stats: - sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 target_ignore_regex: "(?s).*#### " diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 20437d081..743904e1b 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -44,6 +44,8 @@ save_stats: - sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 6cf4fd248..4fd78ca81 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -44,6 +44,8 @@ save_stats: - sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml index 89e058463..2404e8822 100644 --- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml @@ -42,6 +42,8 @@ save_stats: - sample_token_similarity - sample_entropy - 
first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 7b4e95e49..6040bd6e7 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -42,6 +42,8 @@ save_stats: - sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 62fe84c26..58e5cee10 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -42,6 +42,8 @@ save_stats: - sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml index cece970ac..33bae1849 100644 --- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml @@ -42,6 +42,8 @@ save_stats: - sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index 56e70c7cf..1a4d971c6 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -43,6 +43,8 @@ save_stats: - 
sample_token_similarity - sample_entropy - first_sample_texts + - best_sample_texts + - best_normalized_sample_texts entropy_top_k: 50 train_dataset: null From 744b108e32f6e9e0516a472fd39885d7e8b78ee2 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 25 Dec 2024 19:42:40 +0400 Subject: [PATCH 62/97] Small fixes --- src/lm_polygraph/generation_metrics/accuracy.py | 2 +- src/lm_polygraph/generation_metrics/bleu.py | 2 +- src/lm_polygraph/utils/register_stat_calculators.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/generation_metrics/accuracy.py b/src/lm_polygraph/generation_metrics/accuracy.py index e3e4d13ed..7f0f062da 100644 --- a/src/lm_polygraph/generation_metrics/accuracy.py +++ b/src/lm_polygraph/generation_metrics/accuracy.py @@ -16,7 +16,7 @@ class AccuracyMetric(GenerationMetric): """ def __init__( - self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False + self, target_ignore_regex=None, output_ignore_regex=None, normalize=False, sample: bool = False, sample_strategy: str = "First" ): if sample: super().__init__([ diff --git a/src/lm_polygraph/generation_metrics/bleu.py b/src/lm_polygraph/generation_metrics/bleu.py index 91b4b5098..34fee322a 100644 --- a/src/lm_polygraph/generation_metrics/bleu.py +++ b/src/lm_polygraph/generation_metrics/bleu.py @@ -10,7 +10,7 @@ class BLEUMetric(GenerationMetric): Calculates BLEU metric between model-generated texts and ground truth texts. 
""" - def __init__(self, sample: bool = False): + def __init__(self, sample: bool = False, sample_strategy: str = "First"): if sample: super().__init__([ "first_sample_texts", diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index 7c82caf80..c2bae472e 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -68,6 +68,7 @@ def _register(calculator_class: StatCalculator): _register(GreedyLMProbsCalculator()) _register(SamplingGenerationCalculator(n_alternatives=n_ccp_alternatives)) _register(FirstSampleCalculator()) + _register(BestSampleCalculator()) _register(BartScoreCalculator()) _register(ModelScoreCalculator()) _register(EmbeddingsCalculator()) From 65f9513d442b656fb51ab783dd6491c42f15f759 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 26 Dec 2024 11:28:36 +0400 Subject: [PATCH 63/97] Add Comet against best --- scripts/polygraph_eval | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index c21a66afb..c64b9f992 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -512,7 +512,9 @@ def get_generation_metrics(args): ignore_regex = getattr(args, "source_ignore_regex", None) comet_scorer = load("comet") result += [Comet(comet_scorer, source_ignore_regex = ignore_regex), - Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True)] + Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True) + Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"), + Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")] else: result = [] for metric in generation_metrics: From 2d8fa7ebaa049be62bee7ce9a0e3ab08966a5bf1 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 26 Dec 2024 11:29:53 +0400 Subject: [PATCH 64/97] Fix --- scripts/polygraph_eval | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index c64b9f992..0eda6666c 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -512,7 +512,7 @@ def get_generation_metrics(args): ignore_regex = getattr(args, "source_ignore_regex", None) comet_scorer = load("comet") result += [Comet(comet_scorer, source_ignore_regex = ignore_regex), - Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True) + Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True), Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"), Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")] else: From c5509ea479ae22e3756237e4e1c45a0d751fd52b Mon Sep 17 00:00:00 2001 From: silvimica Date: Sun, 29 Dec 2024 09:08:41 +0400 Subject: [PATCH 65/97] Avesimilarity --- src/lm_polygraph/estimators/__init__.py | 1 + .../semantic_average_ue_average_similarity.py | 224 ++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 74ef07953..d810dd6d9 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -85,4 +85,5 @@ from .p_true_empirical import PTrueEmpirical from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE +from .semantic_average_ue_average_similarity import SemanticAveMaxprobAveSimilarity, SemanticAvePPLAveSimilarity, SemanticAveTokenSARAveSimilarity,SemanticAveMTEAveSimilarity from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py 
b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py new file mode 100644 index 000000000..b214554bd --- /dev/null +++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py @@ -0,0 +1,224 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator + + +class SemanticAveMaxprobAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticAveMaxprobAveSimilarityexp" + else: + return "SemanticAveMaxprobAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_metrics = [] # To store enriched metrics for each sample + + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + # Compute probabilities (negative log-probs) + sample_probs = -np.array(sample_log_probs) + if self.exp: + sample_probs = -np.exp(-sample_probs) + + # Compute row-wise average similarity, excluding self-similarity + # Diagonal contains self-similarities + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) + row_averages.append(average_similarity) + + # Enrich each metric by scaling it by 1/row_average + enriched_sample_metrics = [] + for i, (prob, avg_similarity) in enumerate(zip(sample_probs, row_averages)): + if avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero + enriched_metric = prob * (1 / avg_similarity) + enriched_sample_metrics.append(enriched_metric) + + enriched_metrics.append(np.array(enriched_sample_metrics)) + # 
Return only metric for the first sample for prr calculation + first_elements = [metrics[0] for metrics in enriched_metrics] + return np.array(first_elements) + +class SemanticAvePPLAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticAvePPLAveSimilarityexp" + else: + return "SemanticAvePPLAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_ppl = [] # To store enriched PPL for each sample + + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + # get PPL for each sample + ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + if self.exp: + ppl = -np.exp(-ppl) + + # Compute row-wise average similarity, excluding self-similarity + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) # Exclude g_ii + row_averages.append(average_similarity) + + # Enrich each PPL independently by scaling with 1/row_average + enriched_sample_ppl = [] + for i, (ppl_value, avg_similarity) in enumerate(zip(ppl, row_averages)): + if avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero + enriched_value = ppl_value * (1 / avg_similarity) + enriched_sample_ppl.append(enriched_value) + + enriched_ppl.append(np.array(enriched_sample_ppl)) # Collect enriched PPL values + # Return only metric for the first sample for prr calculation + first_elements = [metrics[0] for metrics in enriched_ppl] + return np.array(first_elements) + +class 
SemanticAveTokenSARAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__( + [ + "sample_sentence_similarity", + "sample_log_likelihoods", + "sample_token_similarity", + ], + "sequence", + ) + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticAveTokenSARAveSimilarityexp" + else: + return "SemanticAveTokenSARAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_token_similarity = stats["sample_token_similarity"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_tokenSAR = [] + + for batch_data in zip( + batch_sample_log_likelihoods, + batch_sample_token_similarity, + batch_sample_sentence_similarity, + ): + sample_log_likelihoods = batch_data[0] + sample_token_similarity = batch_data[1] + sample_sentence_similarity = batch_data[2] + + tokenSAR = [] + for log_likelihoods, token_similarity in zip( + sample_log_likelihoods, sample_token_similarity + ): + log_likelihoods = np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + if self.exp: + tokenSAR = -np.exp(-np.array(tokenSAR)) + + # Compute row-wise average similarity excluding self-similarity + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) # Exclude g_ii + row_averages.append(average_similarity) + + # Enrich each tokenSAR value + enriched_sample_tokenSAR = [] + for i, (sar_value, avg_similarity) in enumerate(zip(tokenSAR, row_averages)): + if avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero + enriched_value = sar_value * (1 / avg_similarity) + enriched_sample_tokenSAR.append(enriched_value) + + 
enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR)) + # Return only metric for the first sample for prr calculation + + first_elements = [metrics[0] for metrics in enriched_tokenSAR] + return np.array(first_elements) + + +class SemanticAveMTEAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "SemanticAveMTEAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_entropy = [] + + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + # Compute row-wise average similarity, excluding self-similarity + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) # Exclude g_ii + row_averages.append(average_similarity) + + # Enrich each sample's entropy value + enriched_sample_entropy = [] + for i, (entropy, avg_similarity) in enumerate(zip(sample_entropy, row_averages)): + if avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero + enriched_value = entropy * (1 / avg_similarity) + enriched_sample_entropy.append(enriched_value) + + enriched_entropy.append(np.array(enriched_sample_entropy)) + # Return only metric for the first sample for prr calculation + first_elements = [metrics[0] for metrics in enriched_entropy] + return np.array(first_elements) + + From fb601db6a080d2136c2c5c0d241072f340e24011 Mon Sep 17 00:00:00 2001 From: silvimica Date: Thu, 2 Jan 2025 13:56:57 +0400 Subject: [PATCH 66/97] UE metric enriched with average dissimilarity --- src/lm_polygraph/estimators/__init__.py | 11 +- .../semantic_average_ue_average_similarity.py | 265 
++++++++++++++++++ 2 files changed, 275 insertions(+), 1 deletion(-) diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index d810dd6d9..7b1cfe0c4 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -85,5 +85,14 @@ from .p_true_empirical import PTrueEmpirical from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE -from .semantic_average_ue_average_similarity import SemanticAveMaxprobAveSimilarity, SemanticAvePPLAveSimilarity, SemanticAveTokenSARAveSimilarity,SemanticAveMTEAveSimilarity +from .semantic_average_ue_average_similarity import ( + SemanticAveMaxprobAveSimilarity, + SemanticAvePPLAveSimilarity, + SemanticAveTokenSARAveSimilarity, + SemanticAveMTEAveSimilarity, + SemanticEnrichedPPLAveDissimilarity, + SemanticEnrichedTokenSARAveDissimilarity , + SemanticEnrichedMaxprobAveDissimilarity, + SemanticEnrichedMTEAveDissimilarity, + AveDissimilarity) from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py index b214554bd..f72b6bb13 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py @@ -57,6 +57,62 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: first_elements = [metrics[0] for metrics in enriched_metrics] return np.array(first_elements) +class SemanticEnrichedMaxprobAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if 
self.exp: + return "SemanticEnrichedMaxprobAveDissimilarityexp" + else: + return "SemanticEnrichedMaxprobAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_metrics = [] # To store enriched metrics for each sample + + for sample_log_probs, sample_sentence_similarity in zip( + batch_sample_log_probs, batch_sample_sentence_similarity + ): + # Step 1: Compute probabilities (negative log-probs) + sample_probs = -np.array(sample_log_probs) + if self.exp: + sample_probs = -np.exp(-sample_probs) + + # Step 2: Compute row-wise sum of dissimilarities (1 - g) + row_dissimilarities = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + sum_dissimilarities = np.sum(1 - row) - (1 - row[i]) # Exclude self-similarity + row_dissimilarities.append(sum_dissimilarities) + + # Step 3: Normalize by (M - 1) + normalized_dissimilarities = [ + dissim / (len(sample_sentence_similarity) - 1) + for dissim in row_dissimilarities + ] + + # Step 4: Enrich each metric + enriched_sample_metrics = [] + for prob, dissim in zip(sample_probs, normalized_dissimilarities): + enriched_metric = prob * dissim + enriched_sample_metrics.append(enriched_metric) + + enriched_metrics.append(np.array(enriched_sample_metrics)) + + # Return only metric for the first sample for PRR calculation + first_elements = [metrics[0] for metrics in enriched_metrics] + return np.array(first_elements) + + class SemanticAvePPLAveSimilarity(Estimator): def __init__( self, @@ -107,6 +163,59 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: first_elements = [metrics[0] for metrics in enriched_ppl] return np.array(first_elements) +class SemanticEnrichedPPLAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + 
super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticEnrichedPPLAveDissimilarityexp" + else: + return "SemanticEnrichedPPLAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_ppl = [] # To store enriched PPL for each sample + + for sample_log_likelihoods, sample_sentence_similarity in zip( + batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + # Step 1: Compute PPL for each sample + ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) + if self.exp: + ppl = -np.exp(-ppl) + + # Step 2: Compute row-wise average dissimilarity (1 - g) + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + # Compute average dissimilarity, excluding self-similarity + average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) + row_averages.append(average_dissimilarity) + + # Step 3: Enrich each PPL independently by scaling with the average dissimilarity + enriched_sample_ppl = [] + for i, (ppl_value, avg_dissimilarity) in enumerate(zip(ppl, row_averages)): + if avg_dissimilarity == 0: + avg_dissimilarity = 1e-10 # Avoid division by zero + enriched_value = ppl_value * avg_dissimilarity + enriched_sample_ppl.append(enriched_value) + + enriched_ppl.append(np.array(enriched_sample_ppl)) # Collect enriched PPL values + + # Return only metric for the first sample for PRR calculation + first_elements = [metrics[0] for metrics in enriched_ppl] + return np.array(first_elements) + + class SemanticAveTokenSARAveSimilarity(Estimator): def __init__( self, @@ -181,6 +290,80 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(first_elements) +class 
SemanticEnrichedTokenSARAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False + ): + super().__init__( + [ + "sample_sentence_similarity", + "sample_log_likelihoods", + "sample_token_similarity", + ], + "sequence", + ) + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "SemanticEnrichedTokenSARAveDissimilarityexp" + else: + return "SemanticEnrichedTokenSARAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_token_similarity = stats["sample_token_similarity"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_tokenSAR = [] + + for batch_data in zip( + batch_sample_log_likelihoods, + batch_sample_token_similarity, + batch_sample_sentence_similarity, + ): + sample_log_likelihoods = batch_data[0] + sample_token_similarity = batch_data[1] + sample_sentence_similarity = batch_data[2] + + tokenSAR = [] + for log_likelihoods, token_similarity in zip( + sample_log_likelihoods, sample_token_similarity + ): + log_likelihoods = np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + if self.exp: + tokenSAR = -np.exp(-np.array(tokenSAR)) + + # Compute row-wise average dissimilarity (1 - g), excluding self-similarity + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) + row_averages.append(average_dissimilarity) + + # Enrich each tokenSAR value + enriched_sample_tokenSAR = [] + for i, (sar_value, avg_dissimilarity) in enumerate(zip(tokenSAR, row_averages)): + if avg_dissimilarity == 0: + avg_dissimilarity = 1e-10 # Avoid division by zero + enriched_value = sar_value * avg_dissimilarity + 
enriched_sample_tokenSAR.append(enriched_value) + + enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR)) + # Return only metric for the first sample for PRR calculation + + first_elements = [metrics[0] for metrics in enriched_tokenSAR] + return np.array(first_elements) + + class SemanticAveMTEAveSimilarity(Estimator): def __init__( self, @@ -222,3 +405,85 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(first_elements) + +class SemanticEnrichedMTEAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "SemanticEnrichedMTEAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_entropy = [] + + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + # Compute row-wise average dissimilarity (1 - g), excluding self-similarity + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) + row_averages.append(average_dissimilarity) + + # Enrich each sample's entropy value + enriched_sample_entropy = [] + for i, (entropy, avg_dissimilarity) in enumerate(zip(sample_entropy, row_averages)): + if avg_dissimilarity == 0: + avg_dissimilarity = 1e-10 # Avoid division by zero + enriched_value = entropy * avg_dissimilarity + enriched_sample_entropy.append(enriched_value) + + enriched_entropy.append(np.array(enriched_sample_entropy)) + # Return only metric for the first sample for PRR calculation + first_elements = [metrics[0] for metrics in enriched_entropy] + return np.array(first_elements) + + + +class 
AveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "AveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_entropy = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + + enriched_entropy = [] + + for sample_entropy, sample_sentence_similarity in zip( + batch_sample_entropy, batch_sample_sentence_similarity + ): + # Compute row-wise average dissimilarity (1 - g), excluding self-similarity + row_averages = [] + for i in range(sample_sentence_similarity.shape[0]): + row = sample_sentence_similarity[i] + average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) + row_averages.append(average_dissimilarity) + + # Enrich each sample's entropy value + enriched_sample_entropy = [] + for i, (entropy, avg_dissimilarity) in enumerate(zip(sample_entropy, row_averages)): + if avg_dissimilarity == 0: + avg_dissimilarity = 1e-10 # Avoid division by zero + enriched_value = avg_dissimilarity + enriched_sample_entropy.append(enriched_value) + + enriched_entropy.append(np.array(enriched_sample_entropy)) + # Return only metric for the first sample for PRR calculation + first_elements = [metrics[0] for metrics in enriched_entropy] + return np.array(first_elements) From ab5f055d76c8d67ff1e610e0de94287ec77938bc Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 9 Jan 2025 14:10:32 +0400 Subject: [PATCH 67/97] Set sample selection strategy for sample-focused methods, add greedy-focused semantically-enriched methods --- src/lm_polygraph/estimators/__init__.py | 19 +- src/lm_polygraph/estimators/common.py | 19 + ..._semantic_average_ue_average_similarity.py | 349 ++++++++++++++++++ .../estimators/max_probability.py | 41 +- src/lm_polygraph/estimators/perplexity.py | 31 +- .../estimators/semantic_average_ue.py | 
59 ++- .../semantic_average_ue_average_similarity.py | 162 +++++--- .../estimators/semantic_median_ue.py | 60 +-- src/lm_polygraph/estimators/token_entropy.py | 10 +- src/lm_polygraph/estimators/token_sar.py | 11 +- src/lm_polygraph/stat_calculators/__init__.py | 1 + .../stat_calculators/greedy_similarity.py | 80 ++++ src/lm_polygraph/stat_calculators/sample.py | 7 +- .../utils/register_stat_calculators.py | 1 + 14 files changed, 695 insertions(+), 155 deletions(-) create mode 100644 src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py create mode 100644 src/lm_polygraph/stat_calculators/greedy_similarity.py diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 7b1cfe0c4..9d3009913 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -7,11 +7,11 @@ from .claim.pointwise_mutual_information import PointwiseMutualInformationClaim from .max_probability import ( MaximumSequenceProbability, + SampledMaximumSequenceProbability, MaximumTokenProbability, - MaxSampledMaximumSequenceProbability, ) from .claim_conditioned_probability import ClaimConditionedProbability -from .token_entropy import MeanTokenEntropy, TokenEntropy +from .token_entropy import MeanTokenEntropy, TokenEntropy, SampledMeanTokenEntropy from .pointwise_mutual_information import ( MeanPointwiseMutualInformation, PointwiseMutualInformation, @@ -32,8 +32,7 @@ from .semantic_entropy import SemanticEntropy from .semantic_entropy_token import SemanticEntropyToken from .perplexity import ( - Perplexity, - MaxSampledPerplexity, + Perplexity, SampledPerplexity ) from .mahalanobis_distance import MahalanobisDistanceSeq from .relative_mahalanobis_distance import RelativeMahalanobisDistanceSeq @@ -65,7 +64,7 @@ PESrmi, PESrmiabs, ) -from .token_sar import TokenSAR +from .token_sar import TokenSAR, SampledTokenSAR from .sentence_sar import ( SentenceSAR, # OtherSentenceSAR, @@ -95,4 +94,14 @@ 
SemanticEnrichedMaxprobAveDissimilarity, SemanticEnrichedMTEAveDissimilarity, AveDissimilarity) +from .greedy_semantic_average_ue_average_similarity import ( + GreedySemanticAveMaxprobAveSimilarity, + GreedySemanticAvePPLAveSimilarity, + GreedySemanticAveTokenSARAveSimilarity, + GreedySemanticAveMTEAveSimilarity, + GreedySemanticEnrichedPPLAveDissimilarity, + GreedySemanticEnrichedTokenSARAveDissimilarity , + GreedySemanticEnrichedMaxprobAveDissimilarity, + GreedySemanticEnrichedMTEAveDissimilarity, +) from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE diff --git a/src/lm_polygraph/estimators/common.py b/src/lm_polygraph/estimators/common.py index 0a10c414c..72e2142e7 100644 --- a/src/lm_polygraph/estimators/common.py +++ b/src/lm_polygraph/estimators/common.py @@ -29,3 +29,22 @@ def _compute_Jaccard_score(lst): def compute_sim_score(answers, affinity, similarity_score): return _compute_Jaccard_score(answers) + +def sample_strategy_to_prefix(sample_strategy): + if sample_strategy == "first": + return "" + elif sample_strategy in ["best", "best_normalized"]: + return "".join(list(map(lambda x: x.capitalize(), sample_strategy.split("_")))) + else: + raise ValueError(f"Unknown sample strategy: {sample_strategy}") + +def best_sample_ids(sample_strategy, stats): + batch_size = len(stats["sample_log_probs"]) + if sample_strategy == "first": + return [0] * batch_size + elif sample_strategy == "best": + return stats["best_sample_text_ids"] + elif sample_strategy == "best_normalized": + return stats["best_normalized_sample_text_ids"] + else: + raise ValueError(f"Unknown sample strategy: {sample_strategy}") diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py new file mode 100644 index 000000000..5b7a8ab99 --- /dev/null +++ 
b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py @@ -0,0 +1,349 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids + + +class GreedySemanticAveMaxprobAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "GreedySemanticAveMaxprobAveSimilarityexp" + else: + return "GreedySemanticAveMaxprobAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) + + enriched_metrics = [] # To store enriched metrics for each sample + for greedy_ll, greedy_sentence_similarity in zip( + batch_lls, batch_greedy_sentence_similarity + ): + # Compute probabilities (negative log-probs) + prob = -greedy_ll + if self.exp: + prob = -np.exp(-prob) + + # Compute row-wise average similarity, excluding self-similarity + # Diagonal contains self-similarities + ave_similarity = np.mean(greedy_sentence_similarity) + + # Enrich each metric by scaling it by 1/row_average + if ave_similarity == 0: + ave_similarity = 1e-10 # Avoid division by zero + + enriched_metric = prob * (1 / avg_similarity) + enriched_metrics.append(enriched_metric) + + return np.array(enriched_metrics) + + +class GreedySemanticAveMaxprobAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "GreedySemanticAveMaxprobAveDissimilarityexp" + 
else: + return "GreedySemanticAveMaxprobAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) + + enriched_metrics = [] # To store enriched metrics for each sample + for greedy_ll, greedy_sentence_similarity in zip( + batch_lls, batch_greedy_sentence_similarity + ): + # Compute probabilities (negative log-probs) + prob = -greedy_ll + if self.exp: + prob = -np.exp(-prob) + + # Compute row-wise average similarity, excluding self-similarity + # Diagonal contains self-similarities + ave_dissimilarity = np.mean(1 - greedy_sentence_similarity) + + enriched_metric = prob * avg_dissimilarity + enriched_metrics.append(enriched_metric) + + return np.array(enriched_metrics) + + +class GreedySemanticAvePPLAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "GreedySemanticAvePPLAveSimilarityexp" + else: + return "GreedySemanticAvePPLAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_ppl = [] # To store enriched PPL for each sample + + for greedy_log_likelihoods, greedy_sentence_similarity in zip( + batch_greedy_log_likelihoods, batch_greedy_sentence_similarity + ): + # get PPL for each sample + ppl = -np.mean(greedy_log_likelihoods) + if self.exp: + ppl = -np.exp(-ppl) + + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = np.mean(greedy_sentence_similarity) + + # Enrich each PPL independently by scaling with 1/row_average + if 
avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero + + enriched_value = ppl * (1 / avg_similarity) + enriched_ppl.append(enriched_value) + + return np.array(enriched_ppl) + + +class GreedySemanticAvePPLAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "GreedySemanticAvePPLAveDissimilarityexp" + else: + return "GreedySemanticAvePPLAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_ppl = [] # To store enriched PPL for each sample + + for greedy_log_likelihoods, greedy_sentence_similarity in zip( + batch_greedy_log_likelihoods, batch_greedy_sentence_similarity + ): + # get PPL for each sample + ppl = -np.mean(greedy_log_likelihoods) + if self.exp: + ppl = -np.exp(-ppl) + + # Compute row-wise average similarity, excluding self-similarity + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) + + enriched_value = ppl * avg_dissimilarity + enriched_ppl.append(enriched_value) + + return np.array(enriched_ppl) + + +class GreedySemanticAveTokenSARAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False, + ): + super().__init__( + [ + "greedy_sentence_similarity", + "greedy_log_likelihoods", + ], + "sequence", + ) + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "GreedySemanticAveTokenSARAveSimilarityexp" + else: + return "GreedySemanticAveTokenSARAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] + batch_greedy_token_similarity = stats["token_similarity"] + 
batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_tokenSAR = [] + + for batch_data in zip( + batch_greedy_log_likelihoods, + batch_greedy_token_similarity, + batch_greedy_sentence_similarity, + ): + log_likelihoods = batch_data[0] + token_similarity = batch_data[1] + greedy_sentence_similarity = batch_data[2] + + log_likelihoods = np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + if self.exp: + tokenSAR = -np.exp(-np.array(tokenSAR)) + + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = np.mean(greedy_sentence_similarity) + + # Enrich each PPL independently by scaling with 1/row_average + if avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero + + enriched_value = tokenSAR * (1 / avg_similarity) + enriched_tokenSAR.append(enriched_value) + + return np.array(enriched_tokenSAR) + + +class GreedySemanticAveTokenSARAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + exp: bool = False, + ): + super().__init__( + [ + "greedy_sentence_similarity", + "greedy_log_likelihoods", + ], + "sequence", + ) + self.verbose = verbose + self.exp = exp + + def __str__(self): + if self.exp: + return "GreedySemanticAveTokenSARAveDissimilarityexp" + else: + return "GreedySemanticAveTokenSARAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] + batch_greedy_token_similarity = stats["token_similarity"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_tokenSAR = [] + + for batch_data in zip( + batch_greedy_log_likelihoods, + batch_greedy_token_similarity, + batch_greedy_sentence_similarity, + ): + log_likelihoods = batch_data[0] + token_similarity = batch_data[1] + greedy_sentence_similarity = batch_data[2] + + log_likelihoods = 
np.array(log_likelihoods) + R_t = 1 - token_similarity + R_t_norm = R_t / R_t.sum() + E_t = -log_likelihoods * R_t_norm + tokenSAR.append(E_t.sum()) + + if self.exp: + tokenSAR = -np.exp(-np.array(tokenSAR)) + + # Compute row-wise average similarity, excluding self-similarity + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) + + enriched_value = tokenSAR * avg_dissimilarity + enriched_tokenSAR.append(enriched_value) + + return np.array(enriched_tokenSAR) + + +class GreedySemanticAveMTEAveSimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedySemanticAveMTEAveSimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_entropy = stats["entropy"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_entropy = [] + + for greedy_entropy, greedy_sentence_similarity in zip( + batch_greedy_entropy, batch_greedy_sentence_similarity + ): + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = np.mean(greedy_sentence_similarity) + + # Enrich each PPL independently by scaling with 1/row_average + if avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero + + enriched_value = greedy_entropy * (1 / avg_similarity) + enriched_entropy.append(enriched_value) + + return np.array(enriched_entropy) + + +class GreedySemanticEnrichedMTEAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedySemanticEnrichedMTEAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_entropy = stats["entropy"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_entropy = [] + + for 
greedy_entropy, greedy_sentence_similarity in zip( + batch_greedy_entropy, batch_greedy_sentence_similarity + ): + # Compute row-wise average similarity, excluding self-similarity + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) + + enriched_value = greedy_entropy * avg_dissimilarity + enriched_entropy.append(enriched_value) + + return np.array(enriched_entropy) diff --git a/src/lm_polygraph/estimators/max_probability.py b/src/lm_polygraph/estimators/max_probability.py index cbfc8ed32..406021cd6 100644 --- a/src/lm_polygraph/estimators/max_probability.py +++ b/src/lm_polygraph/estimators/max_probability.py @@ -3,6 +3,7 @@ from typing import Dict from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids class MaximumSequenceProbability(Estimator): @@ -41,41 +42,13 @@ class SampledMaximumSequenceProbability(Estimator): Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel). """ - def __init__(self): + def __init__(self, sample_strategy: str = "first"): super().__init__(["sample_log_probs"], "sequence") + self.sample_strategy = sample_strategy def __str__(self): - return "SampledMaximumSequenceProbability" - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - """ - Estimates the minus log-probability of each sample in input statistics. - - Parameters: - stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes: - * log p(y_i | y_ np.ndarray: """ @@ -88,7 +61,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: np.ndarray: minus log probabilities for each sample. Higher values indicate more uncertain samples. 
""" - mp = [max(lp) for lp in stats["sample_log_probs"]] + sample_ids = best_sample_ids(self.sample_strategy, stats) + + mp = [] + for best_id, sample_log_probs in zip(sample_ids, stats["sample_log_probs"]): + mp.append(sample_log_probs[best_id]) return -np.array(mp) diff --git a/src/lm_polygraph/estimators/perplexity.py b/src/lm_polygraph/estimators/perplexity.py index 44fd9350f..4581b7c39 100644 --- a/src/lm_polygraph/estimators/perplexity.py +++ b/src/lm_polygraph/estimators/perplexity.py @@ -3,6 +3,7 @@ from typing import Dict from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids class Perplexity(Estimator): @@ -17,33 +18,19 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array([-np.mean(ll) for ll in log_likelihoods]) class SampledPerplexity(Estimator): - def __init__(self): + def __init__(self, sample_strategy: str = "first"): super().__init__(["sample_log_likelihoods"], "sequence") + self.sample_strategy = sample_strategy def __str__(self): - return "SampledPerplexity" + return sample_strategy_to_prefix(self.sample_strategy) + "SampledPerplexity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: log_likelihoods = stats["sample_log_likelihoods"] - ppl = [np.mean(sample_log_likelihoods[0]) for sample_log_likelihoods in log_likelihoods] - return -np.array(ppl) + sample_ids = best_sample_ids(self.sample_strategy, stats) -class MaxSampledPerplexity(Estimator): - def init(self): - super().init(["sample_log_likelihoods"], "sequence") + ppl = [] + for best_id, sample_log_likelihoods in zip(sample_ids, log_likelihoods): + ppl.append(np.mean(sample_log_likelihoods[best_id])) - def str(self): - return "MaxSampledPerplexity" - - def call(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - log_likelihoods = stats["sample_log_likelihoods"] - - ppl_per_sample = [ - [-np.mean(sequence) for sequence in sample_log_likelihoods] - for sample_log_likelihoods in log_likelihoods - ] - - # Find 
the maximum perplexity for each set of samples - max_ppl = [max(ppl_sample) for ppl_sample in ppl_per_sample] - - return -np.array(max_ppl) \ No newline at end of file + return -np.array(ppl) diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py index fb361d44d..d58489d99 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue.py +++ b/src/lm_polygraph/estimators/semantic_average_ue.py @@ -4,36 +4,42 @@ from copy import deepcopy from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids class SemanticAveMaxprob(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticAveMaxprobexp" + base = "SemanticAveMaxprobexp" else: - return "SemanticAveMaxprob" + base = "SemanticAveMaxprob" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_probs = stats["sample_log_probs"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] - for sample_log_probs, sample_sentence_similarity in zip( - batch_sample_log_probs, batch_sample_sentence_similarity + for best_id, sample_log_probs, sample_sentence_similarity in zip( + sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity ): sample_probs = -np.array(sample_log_probs) if self.exp: sample_probs = -np.exp(-sample_probs) - weights = sample_sentence_similarity[0, :] + + weights = sample_sentence_similarity[best_id, :] ave.append(np.average(sample_probs, weights=weights)) return np.array(ave) @@ -42,32 +48,36 @@ class SemanticAvePPL(Estimator): def __init__( self, 
verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticAvePPLexp" + base = "SemanticAvePPLexp" else: - return "SemanticAvePPL" + base = "SemanticAvePPL" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] - for sample_log_likelihoods, sample_sentence_similarity in zip( - batch_sample_log_likelihoods, batch_sample_sentence_similarity + for best_id, sample_log_likelihoods, sample_sentence_similarity in zip( + sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity ): ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) if self.exp: ppl = -np.exp(-ppl) - weights = sample_sentence_similarity[0, :] + weights = sample_sentence_similarity[best_id, :] ave.append(np.average(ppl, weights=weights)) @@ -77,7 +87,8 @@ class SemanticAveTokenSAR(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__( [ @@ -89,27 +100,32 @@ def __init__( ) self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticAveTokenSARexp" + base = "SemanticAveTokenSARexp" else: - return "SemanticAveTokenSAR" + base = "SemanticAveTokenSAR" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_token_similarity = 
stats["sample_token_similarity"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] for batch_data in zip( batch_sample_log_likelihoods, batch_sample_token_similarity, batch_sample_sentence_similarity, + sample_ids, ): sample_log_likelihoods = batch_data[0] sample_token_similarity = batch_data[1] sample_sentence_similarity = batch_data[2] + best_id = batch_data[3] tokenSAR = [] for log_likelihoods, token_similarity in zip( @@ -124,7 +140,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: if self.exp: tokenSAR = -np.exp(-np.array(tokenSAR)) - weights = sample_sentence_similarity[0, :] + weights = sample_sentence_similarity[best_id, :] ave.append(np.average(tokenSAR, weights=weights)) @@ -134,22 +150,25 @@ class SemanticAveMTE(Estimator): def __init__( self, verbose: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") self.verbose = verbose + self.sample_strategy = sample_strategy def __str__(self): - return "SemanticAveMTE" + return sample_strategy_to_prefix(self.sample_strategy) + "SemanticAveMTE" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_entropy = stats["sample_entropy"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] - for sample_entropy, sample_sentence_similarity in zip( - batch_sample_entropy, batch_sample_sentence_similarity + for best_id, sample_entropy, sample_sentence_similarity in zip( + sample_ids, batch_sample_entropy, batch_sample_sentence_similarity ): - weights = sample_sentence_similarity[0, :] + weights = sample_sentence_similarity[best_id, :] ave.append(np.average(sample_entropy, weights=weights)) return np.array(ave) diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py 
b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py index f72b6bb13..2a0f0b617 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py @@ -4,27 +4,32 @@ from copy import deepcopy from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids class SemanticAveMaxprobAveSimilarity(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticAveMaxprobAveSimilarityexp" + base = "SemanticAveMaxprobAveSimilarityexp" else: - return "SemanticAveMaxprobAveSimilarity" + base = "SemanticAveMaxprobAveSimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_probs = stats["sample_log_probs"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_metrics = [] # To store enriched metrics for each sample @@ -53,29 +58,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_sample_metrics.append(enriched_metric) enriched_metrics.append(np.array(enriched_sample_metrics)) - # Return only metric for the first sample for prr calculation - first_elements = [metrics[0] for metrics in enriched_metrics] - return np.array(first_elements) + + # Return only metric for the best sample for prr calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_metrics): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) class SemanticEnrichedMaxprobAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, - 
exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticEnrichedMaxprobAveDissimilarityexp" + base = "SemanticEnrichedMaxprobAveDissimilarityexp" else: - return "SemanticEnrichedMaxprobAveDissimilarity" + base = "SemanticEnrichedMaxprobAveDissimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_probs = stats["sample_log_probs"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_metrics = [] # To store enriched metrics for each sample @@ -108,30 +121,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_metrics.append(np.array(enriched_sample_metrics)) - # Return only metric for the first sample for PRR calculation - first_elements = [metrics[0] for metrics in enriched_metrics] - return np.array(first_elements) + # Return only metric for the best sample for PRR calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_metrics): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) class SemanticAvePPLAveSimilarity(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticAvePPLAveSimilarityexp" + base = "SemanticAvePPLAveSimilarityexp" else: - return "SemanticAvePPLAveSimilarity" + base = "SemanticAvePPLAveSimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + base def 
__call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_ppl = [] # To store enriched PPL for each sample @@ -159,29 +179,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_sample_ppl.append(enriched_value) enriched_ppl.append(np.array(enriched_sample_ppl)) # Collect enriched PPL values - # Return only metric for the first sample for prr calculation - first_elements = [metrics[0] for metrics in enriched_ppl] - return np.array(first_elements) + + # Return only metric for the best sample for prr calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_ppl): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) class SemanticEnrichedPPLAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticEnrichedPPLAveDissimilarityexp" + base = "SemanticEnrichedPPLAveDissimilarityexp" else: - return "SemanticEnrichedPPLAveDissimilarity" + base = "SemanticEnrichedPPLAveDissimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_ppl = [] # To store enriched PPL for each sample @@ -211,16 +239,20 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_ppl.append(np.array(enriched_sample_ppl)) # Collect 
enriched PPL values - # Return only metric for the first sample for PRR calculation - first_elements = [metrics[0] for metrics in enriched_ppl] - return np.array(first_elements) + # Return only metric for the best sample for PRR calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_ppl): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) class SemanticAveTokenSARAveSimilarity(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__( [ @@ -232,17 +264,20 @@ def __init__( ) self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticAveTokenSARAveSimilarityexp" + base = "SemanticAveTokenSARAveSimilarityexp" else: - return "SemanticAveTokenSARAveSimilarity" + base = "SemanticAveTokenSARAveSimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_token_similarity = stats["sample_token_similarity"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_tokenSAR = [] @@ -284,17 +319,21 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_sample_tokenSAR.append(enriched_value) enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR)) - # Return only metric for the first sample for prr calculation - first_elements = [metrics[0] for metrics in enriched_tokenSAR] - return np.array(first_elements) + # Return only metric for the best sample for prr calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_tokenSAR): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) class SemanticEnrichedTokenSARAveDissimilarity(Estimator): def __init__( self, 
verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__( [ @@ -306,17 +345,20 @@ def __init__( ) self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticEnrichedTokenSARAveDissimilarityexp" + base = "SemanticEnrichedTokenSARAveDissimilarityexp" else: - return "SemanticEnrichedTokenSARAveDissimilarity" + base = "SemanticEnrichedTokenSARAveDissimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_token_similarity = stats["sample_token_similarity"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_tokenSAR = [] @@ -358,26 +400,32 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_sample_tokenSAR.append(enriched_value) enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR)) - # Return only metric for the first sample for PRR calculation - first_elements = [metrics[0] for metrics in enriched_tokenSAR] - return np.array(first_elements) + # Return only metric for the best sample for PRR calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_tokenSAR): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) class SemanticAveMTEAveSimilarity(Estimator): def __init__( self, verbose: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") self.verbose = verbose + self.sample_strategy = sample_strategy def __str__(self): - return "SemanticAveMTEAveSimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + "SemanticAveMTEAveSimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_entropy = 
stats["sample_entropy"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_entropy = [] @@ -400,9 +448,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_sample_entropy.append(enriched_value) enriched_entropy.append(np.array(enriched_sample_entropy)) - # Return only metric for the first sample for prr calculation - first_elements = [metrics[0] for metrics in enriched_entropy] - return np.array(first_elements) + + # Return only metric for the best sample for prr calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_entropy): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) @@ -410,16 +462,19 @@ class SemanticEnrichedMTEAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") self.verbose = verbose + self.sample_strategy = sample_strategy def __str__(self): - return "SemanticEnrichedMTEAveDissimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTEAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_entropy = stats["sample_entropy"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_entropy = [] @@ -442,9 +497,13 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_sample_entropy.append(enriched_value) enriched_entropy.append(np.array(enriched_sample_entropy)) - # Return only metric for the first sample for PRR calculation - first_elements = [metrics[0] for metrics in enriched_entropy] - return np.array(first_elements) + + # Return only metric for the best sample for PRR calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_entropy): + 
best_elements.append(metrics[best_id]) + + return np.array(best_elements) @@ -452,16 +511,19 @@ class AveDissimilarity(Estimator): def __init__( self, verbose: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") self.verbose = verbose + self.sample_strategy = sample_strategy def __str__(self): - return "AveDissimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + "AveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_entropy = stats["sample_entropy"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) enriched_entropy = [] @@ -484,6 +546,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_sample_entropy.append(enriched_value) enriched_entropy.append(np.array(enriched_sample_entropy)) - # Return only metric for the first sample for PRR calculation - first_elements = [metrics[0] for metrics in enriched_entropy] - return np.array(first_elements) + + # Return only metric for the best sample for PRR calculation + best_elements = [] + for best_id, metrics in zip(sample_ids, enriched_entropy): + best_elements.append(metrics[best_id]) + + return np.array(best_elements) diff --git a/src/lm_polygraph/estimators/semantic_median_ue.py b/src/lm_polygraph/estimators/semantic_median_ue.py index 11b9beaac..0ec7d3274 100644 --- a/src/lm_polygraph/estimators/semantic_median_ue.py +++ b/src/lm_polygraph/estimators/semantic_median_ue.py @@ -4,6 +4,7 @@ from copy import deepcopy from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids from wquantiles import median @@ -12,30 +13,34 @@ class SemanticMedianMaxprob(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_probs"], 
"sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticMedianMaxprobexp" + base = "SemanticMedianMaxprobexp" else: - return "SemanticMedianMaxprob" + base = "SemanticMedianMaxprob" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_probs = stats["sample_log_probs"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] - for sample_log_probs, sample_sentence_similarity in zip( - batch_sample_log_probs, batch_sample_sentence_similarity + for best_id, sample_log_probs, sample_sentence_similarity in zip( + sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity ): sample_probs = -np.array(sample_log_probs) if self.exp: sample_probs = -np.exp(-sample_probs) - weights = sample_sentence_similarity[0, :] + weights = sample_sentence_similarity[best_id, :] ave.append(median(sample_probs, weights)) return np.array(ave) @@ -44,32 +49,36 @@ class SemanticMedianPPL(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticMedianPPLexp" + base = "SemanticMedianPPLexp" else: - return "SemanticMedianPPL" + base = "SemanticMedianPPL" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] - for sample_log_likelihoods, sample_sentence_similarity in zip( - 
batch_sample_log_likelihoods, batch_sample_sentence_similarity + for best_id, sample_log_likelihoods, sample_sentence_similarity in zip( + sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity ): ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) if self.exp: ppl = -np.exp(-ppl) - weights = sample_sentence_similarity[0, :] + weights = sample_sentence_similarity[best_id, :] ave.append(median(ppl, weights)) @@ -79,7 +88,8 @@ class SemanticMedianTokenSAR(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False + exp: bool = False, + sample_strategy: str = "first" ): super().__init__( [ @@ -91,27 +101,32 @@ def __init__( ) self.verbose = verbose self.exp = exp + self.sample_strategy = sample_strategy def __str__(self): if self.exp: - return "SemanticMedianTokenSARexp" + base = "SemanticMedianTokenSARexp" else: - return "SemanticMedianTokenSAR" + base = "SemanticMedianTokenSAR" + return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_token_similarity = stats["sample_token_similarity"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] for batch_data in zip( batch_sample_log_likelihoods, batch_sample_token_similarity, batch_sample_sentence_similarity, + sample_ids, ): sample_log_likelihoods = batch_data[0] sample_token_similarity = batch_data[1] sample_sentence_similarity = batch_data[2] + best_id = batch_data[3] tokenSAR = [] for log_likelihoods, token_similarity in zip( @@ -122,11 +137,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) - + if self.exp: tokenSAR = -np.exp(-np.array(tokenSAR)) - weights = sample_sentence_similarity[0, :] + weights = 
sample_sentence_similarity[best_id, :] ave.append(median(np.array(tokenSAR), weights)) @@ -136,22 +151,25 @@ class SemanticMedianMTE(Estimator): def __init__( self, verbose: bool = False, + sample_strategy: str = "first" ): super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") self.verbose = verbose + self.sample_strategy = sample_strategy def __str__(self): - return "SemanticMedianMTE" + return sample_strategy_to_prefix(self.sample_strategy) + "SemanticMedianMTE" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_entropy = stats["sample_entropy"] batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) ave = [] - for sample_entropy, sample_sentence_similarity in zip( - batch_sample_entropy, batch_sample_sentence_similarity + for best_id, sample_entropy, sample_sentence_similarity in zip( + sample_ids, batch_sample_entropy, batch_sample_sentence_similarity ): - weights = sample_sentence_similarity[0, :] + weights = sample_sentence_similarity[best_id, :] ave.append(median(np.array(sample_entropy), weights)) return np.array(ave) diff --git a/src/lm_polygraph/estimators/token_entropy.py b/src/lm_polygraph/estimators/token_entropy.py index 9e1d080dd..059934c12 100644 --- a/src/lm_polygraph/estimators/token_entropy.py +++ b/src/lm_polygraph/estimators/token_entropy.py @@ -3,6 +3,7 @@ from typing import Dict from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids class MeanTokenEntropy(Estimator): @@ -40,11 +41,12 @@ class SampledMeanTokenEntropy(Estimator): Works only with whitebox models (initialized using lm_polygraph.utils.model.WhiteboxModel). 
""" - def __init__(self): + def __init__(self, sample_strategy: str = "first"): super().__init__(["sample_entropy"], "sequence") + self.sample_strategy = sample_strategy def __str__(self): - return "SampledMeanTokenEntropy" + return sample_strategy_to_prefix(self.sample_strategy) + "SampledMeanTokenEntropy" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -58,7 +60,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: Higher values indicate more uncertain samples. """ entropy = stats["sample_entropy"] - return np.array([e[0] for e in entropy]) + sample_ids = best_sample_ids(self.sample_strategy, stats) + + return np.array([e[best_id] for e, best_id in zip(entropy, sample_ids)]) class TokenEntropy(Estimator): diff --git a/src/lm_polygraph/estimators/token_sar.py b/src/lm_polygraph/estimators/token_sar.py index 1a3e715c6..c2695970d 100644 --- a/src/lm_polygraph/estimators/token_sar.py +++ b/src/lm_polygraph/estimators/token_sar.py @@ -3,6 +3,7 @@ from typing import Dict from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids class TokenSAR(Estimator): @@ -58,12 +59,13 @@ class SampledTokenSAR(Estimator): This method calculates the weighted sum of log_likelihoods with weights computed using token relevance. 
""" - def __init__(self, verbose: bool = False): + def __init__(self, verbose: bool = False, sample_strategy: str = "first"): super().__init__(["sample_token_similarity", "sample_log_likelihoods"], "sequence") self.verbose = verbose + self.sample_strategy = sample_strategy def __str__(self): - return "SampledTokenSAR" + return sample_strategy_to_prefix(self.sample_strategy) + "SampledTokenSAR" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ @@ -79,14 +81,17 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: """ batch_sample_log_likelihoods = stats["sample_log_likelihoods"] batch_sample_token_similarity = stats["sample_token_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) result = [] for batch_data in zip( batch_sample_log_likelihoods, batch_sample_token_similarity, + sample_ids, ): sample_log_likelihoods = batch_data[0] sample_token_similarity = batch_data[1] + best_id = batch_data[2] tokenSAR = [] for log_likelihoods, token_similarity in zip( @@ -97,6 +102,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) - result.append(tokenSAR[0]) + result.append(tokenSAR[best_id]) return np.array(result) diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index 1ba3b4057..f8c3fa554 100644 --- a/src/lm_polygraph/stat_calculators/__init__.py +++ b/src/lm_polygraph/stat_calculators/__init__.py @@ -24,3 +24,4 @@ from .cross_encoder_similarity import CrossEncoderSimilarityMatrixCalculator from .extract_claims import ClaimsExtractor from .semantic_classes import SemanticClassesCalculator +from .greedy_similarity import GreedySimilarityCalculator diff --git a/src/lm_polygraph/stat_calculators/greedy_similarity.py b/src/lm_polygraph/stat_calculators/greedy_similarity.py new file mode 100644 index 000000000..04eae17bf --- /dev/null +++ 
b/src/lm_polygraph/stat_calculators/greedy_similarity.py @@ -0,0 +1,80 @@ +import numpy as np + +import itertools +from typing import Dict, List +from tqdm import tqdm + +from .stat_calculator import StatCalculator +from sentence_transformers import CrossEncoder +from lm_polygraph.utils.model import WhiteboxModel + + +class GreedySimilarityCalculator(StatCalculator): + """ + Calculates the cross-encoder similarity between greedy sequence and sampled sequences. + """ + + def __init__(self, nli_model): + super().__init__( + [ + "greedy_sentence_similarity", + ], + ["input_texts", "sample_tokens", "sample_texts", "greedy_tokens", "greedy_texts"], + ) + + self.crossencoder_setup = False + self.nli_model = nli_model + + def _setup(self, device="cuda"): + self.crossencoder = CrossEncoder( + "cross-encoder/stsb-roberta-large", device=device + ) + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + device = model.device() + tokenizer = model.tokenizer + + if not self.crossencoder_setup: + self._setup(device=device) + self.crossencoder_setup = True + + batch_sample_tokens = dependencies["sample_tokens"] + batch_texts = dependencies["sample_texts"] + deberta_batch_size = ( + self.nli_model.batch_size + ) + batch_input_texts = dependencies["input_texts"] + batch_greedy_tokens = dependencies["greedy_tokens"] + batch_greedy_texts = dependencies["greedy_texts"] + + special_tokens = list(model.tokenizer.added_tokens_decoder.keys()) + + batch_pairs = [] + batch_invs = [] + batch_counts = [] + for texts, greedy_text in zip(batch_texts, batch_greedy_texts): + # Sampling from LLM often produces significant number of identical + # outputs. 
We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product([greedy_text], unique_texts))) + batch_invs.append(inv) + + sim_arrays = [] + for i, pairs in tqdm(enumerate(batch_pairs)): + sim_scores = self.crossencoder.predict(pairs, batch_size=deberta_batch_size) + + inv = batch_invs[i] + + sim_arrays.append(sim_scores[inv]) + + sim_arrays = np.stack(sim_arrays) + + return { + "greedy_sentence_similarity": sim_arrays, + } diff --git a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index b6d6b1553..4217000c6 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -254,18 +254,23 @@ def __call__( max_new_tokens: int = 100, ) -> Dict[str, np.ndarray]: best_sample_texts = [] + best_sample_text_ids = [] best_normalized_sample_texts = [] + best_normalized_sample_text_ids = [] for batch_i, (sample_texts, sample_log_probs, sample_log_likelihoods) in enumerate(zip(dependencies["sample_texts"], dependencies["sample_log_probs"], dependencies["sample_log_likelihoods"])): best_i = np.argmax(sample_log_probs) best_sample_texts.append(sample_texts[best_i]) + best_sample_text_ids.append(best_i) ppls = [np.mean(ll) for ll in sample_log_likelihoods] best_ppl_i = np.argmax(ppls) best_normalized_sample_texts.append(sample_texts[best_ppl_i]) + best_normalized_sample_text_ids.append(best_ppl_i) return { "best_sample_texts": best_sample_texts, + "best_sample_text_ids": best_sample_text_ids, "best_normalized_sample_texts": best_normalized_sample_texts, + "best_normalized_sample_text_ids": best_normalized_sample_text_ids, } - diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index c2bae472e..926c6b164 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -74,6 +74,7 @@ 
def _register(calculator_class: StatCalculator): _register(EmbeddingsCalculator()) _register(EnsembleTokenLevelDataCalculator()) _register(CrossEncoderSimilarityMatrixCalculator(nli_model=nli_model)) + _register(GreedySimilarityCalculator(nli_model=nli_model)) _register(GreedyAlternativesNLICalculator(nli_model=nli_model)) _register(SampleAlternativesNLICalculator(nli_model=nli_model)) _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model)) From f973da89bfce1e6bdc918b5377a839d6f7a4144c Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 9 Jan 2025 22:12:28 +0400 Subject: [PATCH 68/97] Fix class names --- src/lm_polygraph/estimators/__init__.py | 3 ++- .../greedy_semantic_average_ue_average_similarity.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 9d3009913..357816c71 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -93,7 +93,8 @@ SemanticEnrichedTokenSARAveDissimilarity , SemanticEnrichedMaxprobAveDissimilarity, SemanticEnrichedMTEAveDissimilarity, - AveDissimilarity) + AveDissimilarity +) from .greedy_semantic_average_ue_average_similarity import ( GreedySemanticAveMaxprobAveSimilarity, GreedySemanticAvePPLAveSimilarity, diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py index 5b7a8ab99..7b23c52de 100644 --- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py @@ -50,7 +50,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(enriched_metrics) -class GreedySemanticAveMaxprobAveDissimilarity(Estimator): +class GreedySemanticEnrichedMaxprobAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -132,7 
+132,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(enriched_ppl) -class GreedySemanticAvePPLAveDissimilarity(Estimator): +class GreedySemanticEnrichedPPLAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -231,7 +231,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(enriched_tokenSAR) -class GreedySemanticAveTokenSARAveDissimilarity(Estimator): +class GreedySemanticEnrichedTokenSARAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, From 5e7df32486d9a174747dabbe8702280b2d64bee4 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 10 Jan 2025 15:15:38 +0400 Subject: [PATCH 69/97] Fix naming --- ..._semantic_average_ue_average_similarity.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py index 7b23c52de..9cf501ffe 100644 --- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py @@ -38,11 +38,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Compute row-wise average similarity, excluding self-similarity # Diagonal contains self-similarities - ave_similarity = np.mean(greedy_sentence_similarity) + avg_similarity = np.mean(greedy_sentence_similarity) # Enrich each metric by scaling it by 1/row_average - if ave_similarity == 0: - ave_similarity = 1e-10 # Avoid division by zero + if avg_similarity == 0: + avg_similarity = 1e-10 # Avoid division by zero enriched_metric = prob * (1 / avg_similarity) enriched_metrics.append(enriched_metric) @@ -62,9 +62,9 @@ def __init__( def __str__(self): if self.exp: - return "GreedySemanticAveMaxprobAveDissimilarityexp" + return "GreedySemanticEnrichedMaxprobAveDissimilarityexp" else: - return 
"GreedySemanticAveMaxprobAveDissimilarity" + return "GreedySemanticEnrichedMaxprobAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] @@ -81,7 +81,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Compute row-wise average similarity, excluding self-similarity # Diagonal contains self-similarities - ave_dissimilarity = np.mean(1 - greedy_sentence_similarity) + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) enriched_metric = prob * avg_dissimilarity enriched_metrics.append(enriched_metric) @@ -144,9 +144,9 @@ def __init__( def __str__(self): if self.exp: - return "GreedySemanticAvePPLAveDissimilarityexp" + return "GreedySemanticEnrichedPPLAveDissimilarityexp" else: - return "GreedySemanticAvePPLAveDissimilarity" + return "GreedySemanticEnrichedPPLAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] @@ -213,7 +213,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: R_t = 1 - token_similarity R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm - tokenSAR.append(E_t.sum()) + tokenSAR = E_t.sum() if self.exp: tokenSAR = -np.exp(-np.array(tokenSAR)) @@ -249,9 +249,9 @@ def __init__( def __str__(self): if self.exp: - return "GreedySemanticAveTokenSARAveDissimilarityexp" + return "GreedySemanticEnrichedTokenSARAveDissimilarityexp" else: - return "GreedySemanticAveTokenSARAveDissimilarity" + return "GreedySemanticEnrichedTokenSARAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] @@ -273,7 +273,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: R_t = 1 - token_similarity R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm - tokenSAR.append(E_t.sum()) + tokenSAR = E_t.sum() if self.exp: tokenSAR = 
-np.exp(-np.array(tokenSAR)) @@ -313,8 +313,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Enrich each PPL independently by scaling with 1/row_average if avg_similarity == 0: avg_similarity = 1e-10 # Avoid division by zero - - enriched_value = greedy_entropy * (1 / avg_similarity) + + entropy = np.mean(greedy_entropy) + enriched_value = entropy * (1 / avg_similarity) enriched_entropy.append(enriched_value) return np.array(enriched_entropy) @@ -342,8 +343,9 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): # Compute row-wise average similarity, excluding self-similarity avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) - - enriched_value = greedy_entropy * avg_dissimilarity + + entropy = np.mean(greedy_entropy) + enriched_value = entropy * avg_dissimilarity enriched_entropy.append(enriched_value) return np.array(enriched_entropy) From 7cf569e6dc597e7aa793fa2d55b85c67ade628dd Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 10 Jan 2025 15:32:25 +0400 Subject: [PATCH 70/97] Add missing stats to experimental configs --- .../configs/polygraph_eval_coqa_sentsar.yaml | 3 + .../polygraph_eval_gsm8k_sentsar_cot.yaml | 3 + .../configs/polygraph_eval_mmlu_sentsar.yaml | 3 + .../polygraph_eval_triviaqa_sentsar.yaml | 3 + .../polygraph_eval_wmt14_enfr_sentsar.yaml | 153 ------------------ .../polygraph_eval_wmt14_fren_sentsar.yaml | 3 + .../polygraph_eval_wmt19_deen_sentsar.yaml | 3 + .../polygraph_eval_wmt19_ende_sentsar.yaml | 152 ----------------- 8 files changed, 18 insertions(+), 305 deletions(-) delete mode 100644 examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml delete mode 100644 examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 828e2327a..09d8e58d4 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -31,6 
+31,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - greedy_sentence_similarity - token_similarity - entropy - sample_tokens @@ -43,7 +44,9 @@ save_stats: - sample_entropy - first_sample_texts - best_sample_texts + - best_sample_text_ids - best_normalized_sample_texts + - best_normalized_sample_texts_ids entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index ea91a213f..3164160b2 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -32,6 +32,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - greedy_sentence_similarity - token_similarity - entropy - sample_tokens @@ -44,7 +45,9 @@ save_stats: - sample_entropy - first_sample_texts - best_sample_texts + - best_sample_text_ids - best_normalized_sample_texts + - best_normalized_sample_texts_ids entropy_top_k: 50 target_ignore_regex: "(?s).*#### " diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 743904e1b..639fe90f6 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -33,6 +33,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - greedy_sentence_similarity - token_similarity - entropy - sample_tokens @@ -45,7 +46,9 @@ save_stats: - sample_entropy - first_sample_texts - best_sample_texts + - best_sample_text_ids - best_normalized_sample_texts + - best_normalized_sample_texts_ids entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 4fd78ca81..12f322bf5 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ 
b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -33,6 +33,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - greedy_sentence_similarity - token_similarity - entropy - sample_tokens @@ -45,7 +46,9 @@ save_stats: - sample_entropy - first_sample_texts - best_sample_texts + - best_sample_text_ids - best_normalized_sample_texts + - best_normalized_sample_texts_ids entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml deleted file mode 100644 index 2404e8822..000000000 --- a/examples/configs/polygraph_eval_wmt14_enfr_sentsar.yaml +++ /dev/null @@ -1,153 +0,0 @@ -hydra: - run: - dir: ${cache_path}/wmt14_enfr/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} - -defaults: - - model: bloomz-560m - - _self_ - -cache_path: ./workdir/output -save_path: '${hydra:run.dir}' - -device: cpu - -task: nmt - -base_manager: null -overwrite_base_estimations: false - -dataset: [wmt14, fr-en] -text_column: en -label_column: fr -prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n" -train_split: train -eval_split: test -max_new_tokens: 182 -load_from_disk: false -generation_params: - generate_until: - - "\n" -save_stats: - - greedy_tokens - - greedy_log_likelihoods - - greedy_tokens_alternatives - - token_similarity - - entropy - - sample_tokens - - sample_tokens_alternatives - - sample_texts - - sample_log_probs - - sample_log_likelihoods - - sample_sentence_similarity - - sample_token_similarity - - sample_entropy - - first_sample_texts - - best_sample_texts - - best_normalized_sample_texts -entropy_top_k: 50 - -source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" - -train_dataset: null -train_test_split: false -test_split_size: 1 - -background_train_dataset: allenai/c4 -background_train_dataset_text_column: text 
-background_train_dataset_label_column: url -background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz -background_load_from_disk: false - -subsample_background_train_dataset: 1000 -subsample_train_dataset: 1000 -subsample_eval_dataset: -1 - -use_density_based_ue: false -use_ens_ue: false -use_seq_ue: false -use_tok_ue: false -generation_metrics: null - -additional_estimators: - - module: lm_polygraph.estimators.monte_carlo_sequence_entropy - class_name: MonteCarloSequenceEntropy - kwargs: {} - - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy - class_name: MonteCarloNormalizedSequenceEntropy - kwargs: {} - - module: lm_polygraph.estimators.semantic_entropy - class_name: SemanticEntropy - kwargs: {} - - - module: lm_polygraph.estimators.max_probability - class_name: MaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.max_probability - class_name: SampledMaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: SentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: {} - - - module: lm_polygraph.estimators.token_sar - class_name: TokenSAR - kwargs: {} - - module: lm_polygraph.estimators.token_sar - class_name: SampledTokenSAR - kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: {} - - - module: lm_polygraph.estimators.perplexity - class_name: Perplexity - kwargs: {} - - module: lm_polygraph.estimators.perplexity - class_name: SampledPerplexity - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: {} - - - module: lm_polygraph.estimators.token_entropy - class_name: MeanTokenEntropy - kwargs: {} - - module: lm_polygraph.estimators.token_entropy - class_name: SampledMeanTokenEntropy - kwargs: {} - - 
module: lm_polygraph.estimators.sentence_sar - class_name: MTESAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: {} - - - module: lm_polygraph.estimators.average_ue - class_name: AveMaxprob - kwargs: {} - - module: lm_polygraph.estimators.average_ue - class_name: AvePPL - kwargs: {} - - module: lm_polygraph.estimators.average_ue - class_name: AveTokenSAR - kwargs: {} - - module: lm_polygraph.estimators.average_ue - class_name: AveMTE - kwargs: {} - -ignore_exceptions: false - -batch_size: 1 -deberta_batch_size: 1 - -seed: - - 1 diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 6040bd6e7..d5b5932b4 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -31,6 +31,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - greedy_sentence_similarity - token_similarity - entropy - sample_tokens @@ -43,7 +44,9 @@ save_stats: - sample_entropy - first_sample_texts - best_sample_texts + - best_sample_text_ids - best_normalized_sample_texts + - best_normalized_sample_texts_ids entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 58e5cee10..0dbe3de3e 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -31,6 +31,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - greedy_sentence_similarity - token_similarity - entropy - sample_tokens @@ -43,7 +44,9 @@ save_stats: - sample_entropy - first_sample_texts - best_sample_texts + - best_sample_text_ids - best_normalized_sample_texts + - best_normalized_sample_texts_ids entropy_top_k: 50 source_ignore_regex: 
"(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml deleted file mode 100644 index 33bae1849..000000000 --- a/examples/configs/polygraph_eval_wmt19_ende_sentsar.yaml +++ /dev/null @@ -1,152 +0,0 @@ -hydra: - run: - dir: ${cache_path}/wmt19_ende/${model.path}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S} - -defaults: - - model: bloomz-560m - - _self_ - -cache_path: ./workdir/output -save_path: '${hydra:run.dir}' - -device: cpu - -task: nmt - -base_manager: null -overwrite_base_estimations: false - -dataset: [wmt19, de-en] -text_column: en -label_column: de -prompt: "Here is a sentence in {source_lang} language and its translation in {target_lang} language.\n\nOriginal:\n{text}\nTranslation:\n" -train_split: train -eval_split: validation -max_new_tokens: 200 -load_from_disk: false -generation_params: - generate_until: - - "\n" -save_stats: - - greedy_tokens - - greedy_log_likelihoods - - greedy_tokens_alternatives - - token_similarity - - entropy - - sample_tokens - - sample_tokens_alternatives - - sample_texts - - sample_log_probs - - sample_log_likelihoods - - sample_sentence_similarity - - sample_token_similarity - - sample_entropy - - first_sample_texts - - best_sample_texts - - best_normalized_sample_texts -entropy_top_k: 50 - -source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" - -train_dataset: null -train_test_split: false -test_split_size: 1 - -background_train_dataset: allenai/c4 -background_train_dataset_text_column: text -background_train_dataset_label_column: url -background_train_dataset_data_files: en/c4-train.00000-of-01024.json.gz -background_load_from_disk: false - -subsample_background_train_dataset: 1000 -subsample_train_dataset: 1000 -subsample_eval_dataset: -1 - -use_density_based_ue: false -use_ens_ue: false -use_seq_ue: false -use_tok_ue: false - -additional_estimators: - - module: 
lm_polygraph.estimators.monte_carlo_sequence_entropy - class_name: MonteCarloSequenceEntropy - kwargs: {} - - module: lm_polygraph.estimators.monte_carlo_normalized_sequence_entropy - class_name: MonteCarloNormalizedSequenceEntropy - kwargs: {} - - module: lm_polygraph.estimators.semantic_entropy - class_name: SemanticEntropy - kwargs: {} - - - module: lm_polygraph.estimators.max_probability - class_name: MaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.max_probability - class_name: SampledMaximumSequenceProbability - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: SentenceSAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MaxprobGSU - kwargs: {} - - - module: lm_polygraph.estimators.token_sar - class_name: TokenSAR - kwargs: {} - - module: lm_polygraph.estimators.token_sar - class_name: SampledTokenSAR - kwargs: {} - - module: lm_polygraph.estimators.sar - class_name: SAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: TokenSARGSU - kwargs: {} - - - module: lm_polygraph.estimators.perplexity - class_name: Perplexity - kwargs: {} - - module: lm_polygraph.estimators.perplexity - class_name: SampledPerplexity - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: PPLSAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: PPLGSU - kwargs: {} - - - module: lm_polygraph.estimators.token_entropy - class_name: MeanTokenEntropy - kwargs: {} - - module: lm_polygraph.estimators.token_entropy - class_name: SampledMeanTokenEntropy - kwargs: {} - - module: lm_polygraph.estimators.sentence_sar - class_name: MTESAR - kwargs: {} - - module: lm_polygraph.estimators.gsu - class_name: MTEGSU - kwargs: {} - - - module: lm_polygraph.estimators.average_ue - class_name: AveMaxprob - kwargs: {} - - module: lm_polygraph.estimators.average_ue - class_name: AvePPL - kwargs: {} - - module: lm_polygraph.estimators.average_ue - class_name: AveTokenSAR - kwargs: {} - - 
module: lm_polygraph.estimators.average_ue - class_name: AveMTE - kwargs: {} - -ignore_exceptions: false - -batch_size: 1 -deberta_batch_size: 1 - -seed: - - 1 From faf3e14511ff918de5e5b423b9d8a21d5d6b53d5 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 10 Jan 2025 16:34:29 +0400 Subject: [PATCH 71/97] Make experiments work --- .../configs/polygraph_eval_coqa_sentsar.yaml | 4 ++ .../polygraph_eval_gsm8k_sentsar_cot.yaml | 4 ++ .../configs/polygraph_eval_mmlu_sentsar.yaml | 4 ++ .../polygraph_eval_triviaqa_sentsar.yaml | 4 ++ .../polygraph_eval_wmt14_fren_sentsar.yaml | 4 ++ .../polygraph_eval_wmt19_deen_sentsar.yaml | 4 ++ .../configs/polygraph_eval_xsum_sentsar.yaml | 7 ++++ scripts/polygraph_eval | 2 +- src/lm_polygraph/estimators/common.py | 3 ++ .../semantic_average_ue_average_similarity.py | 41 ++++++++++++++----- src/lm_polygraph/stat_calculators/sample.py | 2 + 11 files changed, 68 insertions(+), 11 deletions(-) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 09d8e58d4..9963feca3 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -145,6 +145,10 @@ additional_estimators: class_name: AveMTE kwargs: {} + - module: lm_polygraph.estimators.semantic_average_ue_average_similarity + class_name: SemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index 3164160b2..f282769b4 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -149,6 +149,10 @@ additional_estimators: class_name: AveMTE kwargs: {} + - module: lm_polygraph.estimators.semantic_average_ue_average_similarity + class_name: SemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git 
a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 639fe90f6..5c88ba80e 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -147,6 +147,10 @@ additional_estimators: class_name: AveMTE kwargs: {} + - module: lm_polygraph.estimators.semantic_average_ue_average_similarity + class_name: SemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 12f322bf5..b639b9cfa 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -148,6 +148,10 @@ additional_estimators: class_name: AveMTE kwargs: {} + - module: lm_polygraph.estimators.semantic_average_ue_average_similarity + class_name: SemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index d5b5932b4..f2def77bf 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -147,6 +147,10 @@ additional_estimators: class_name: AveMTE kwargs: {} + - module: lm_polygraph.estimators.semantic_average_ue_average_similarity + class_name: SemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 0dbe3de3e..fbe97acfe 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -146,6 +146,10 @@ additional_estimators: class_name: AveMTE kwargs: {} + - module: lm_polygraph.estimators.semantic_average_ue_average_similarity + 
class_name: SemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index 1a4d971c6..cce8d376d 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -32,6 +32,7 @@ save_stats: - greedy_tokens - greedy_log_likelihoods - greedy_tokens_alternatives + - greedy_sentence_similarity - token_similarity - entropy - sample_tokens @@ -44,7 +45,9 @@ save_stats: - sample_entropy - first_sample_texts - best_sample_texts + - best_sample_text_ids - best_normalized_sample_texts + - best_normalized_sample_texts_ids entropy_top_k: 50 train_dataset: null @@ -142,6 +145,10 @@ additional_estimators: class_name: AveMTE kwargs: {} + - module: lm_polygraph.estimators.semantic_average_ue_average_similarity + class_name: SemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 0eda6666c..2440332b0 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -426,7 +426,7 @@ def get_ue_methods(args, model): estimator = estimator_class(**estimator_args.kwargs) # Additional estimator filtering only works correctly for sequence-level estimators if overwrite or ('sequence', str(estimator)) not in existing_estimators: - estimators.append(estimator_class(**estimator_args.kwargs)) + estimators.append(estimator) return estimators diff --git a/src/lm_polygraph/estimators/common.py b/src/lm_polygraph/estimators/common.py index 72e2142e7..7942e2c41 100644 --- a/src/lm_polygraph/estimators/common.py +++ b/src/lm_polygraph/estimators/common.py @@ -1,5 +1,6 @@ import numpy as np +SAMPLE_SELECTION_STAT_KEYS = ["best_sample_text_ids", "best_normalized_sample_text_ids"] def _get_pairs(lst): pairs = [] @@ -30,6 +31,7 @@ def _compute_Jaccard_score(lst): def compute_sim_score(answers, affinity, 
similarity_score): return _compute_Jaccard_score(answers) + def sample_strategy_to_prefix(sample_strategy): if sample_strategy == "first": return "" @@ -38,6 +40,7 @@ def sample_strategy_to_prefix(sample_strategy): else: raise ValueError(f"Unknown sample strategy: {sample_strategy}") + def best_sample_ids(sample_strategy, stats): batch_size = len(stats["sample_log_probs"]) if sample_strategy == "first": diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py index 2a0f0b617..f33849e98 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py @@ -4,7 +4,7 @@ from copy import deepcopy from .estimator import Estimator -from .common import sample_strategy_to_prefix, best_sample_ids +from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS class SemanticAveMaxprobAveSimilarity(Estimator): @@ -14,7 +14,10 @@ def __init__( exp: bool = False, sample_strategy: str = "first" ): - super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + super().__init__( + ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) self.verbose = verbose self.exp = exp self.sample_strategy = sample_strategy @@ -73,7 +76,10 @@ def __init__( exp: bool = False, sample_strategy: str = "first" ): - super().__init__(["sample_sentence_similarity", "sample_log_probs"], "sequence") + super().__init__( + ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) self.verbose = verbose self.exp = exp self.sample_strategy = sample_strategy @@ -136,7 +142,10 @@ def __init__( exp: bool = False, sample_strategy: str = "first" ): - super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + super().__init( + ["sample_sentence_similarity", 
"sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) self.verbose = verbose self.exp = exp self.sample_strategy = sample_strategy @@ -194,7 +203,10 @@ def __init__( exp: bool = False, sample_strategy: str = "first" ): - super().__init__(["sample_sentence_similarity", "sample_log_likelihoods"], "sequence") + super().__init( + ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) self.verbose = verbose self.exp = exp self.sample_strategy = sample_strategy @@ -259,7 +271,7 @@ def __init__( "sample_sentence_similarity", "sample_log_likelihoods", "sample_token_similarity", - ], + ] + SAMPLE_SELECTION_STAT_KEYS, "sequence", ) self.verbose = verbose @@ -340,7 +352,7 @@ def __init__( "sample_sentence_similarity", "sample_log_likelihoods", "sample_token_similarity", - ], + ] + SAMPLE_SELECTION_STAT_KEYS, "sequence", ) self.verbose = verbose @@ -415,7 +427,10 @@ def __init__( verbose: bool = False, sample_strategy: str = "first" ): - super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + super().__init( + ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) self.verbose = verbose self.sample_strategy = sample_strategy @@ -464,7 +479,10 @@ def __init__( verbose: bool = False, sample_strategy: str = "first" ): - super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + super().__init( + ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) self.verbose = verbose self.sample_strategy = sample_strategy @@ -513,7 +531,10 @@ def __init__( verbose: bool = False, sample_strategy: str = "first" ): - super().__init__(["sample_sentence_similarity", "sample_entropy"], "sequence") + super().__init( + ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) self.verbose = verbose self.sample_strategy = sample_strategy diff --git 
a/src/lm_polygraph/stat_calculators/sample.py b/src/lm_polygraph/stat_calculators/sample.py index 4217000c6..f05d9901e 100644 --- a/src/lm_polygraph/stat_calculators/sample.py +++ b/src/lm_polygraph/stat_calculators/sample.py @@ -237,7 +237,9 @@ def __init__(self): super().__init__( [ "best_sample_texts", + "best_sample_text_ids", "best_normalized_sample_texts", + "best_normalized_sample_text_ids", ], [ "sample_texts", From 3074b6a2331317d6d110be420598f0924322e563 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 10 Jan 2025 16:48:16 +0400 Subject: [PATCH 72/97] Add falcon model --- examples/configs/model/falcon3.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 examples/configs/model/falcon3.yaml diff --git a/examples/configs/model/falcon3.yaml b/examples/configs/model/falcon3.yaml new file mode 100644 index 000000000..3a8243339 --- /dev/null +++ b/examples/configs/model/falcon3.yaml @@ -0,0 +1,11 @@ +defaults: + - default + +path: tiiuae/Falcon3-7B-Base +type: CausalLM +path_to_load_script: model/default_causal.py + +load_model_args: + device_map: balanced_low_0 + dtype: bfloat16 +load_tokenizer_args: {} From edf9ee1701679a21ee36981c0a03ed5ce9423408 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 10 Jan 2025 19:06:23 +0400 Subject: [PATCH 73/97] Prevent tokenizer outputting token type ids --- src/lm_polygraph/utils/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_polygraph/utils/model.py b/src/lm_polygraph/utils/model.py index f22901919..db381107f 100644 --- a/src/lm_polygraph/utils/model.py +++ b/src/lm_polygraph/utils/model.py @@ -564,7 +564,7 @@ def tokenize( formatted_texts.append(formatted_chat) texts = formatted_texts - return self.tokenizer(texts, padding=True, return_tensors="pt") + return self.tokenizer(texts, padding=True, return_tensors="pt", return_token_type_ids=False) def create_ensemble( From e07c62ac30cbe219b548f5f203e1db06d770b046 Mon Sep 17 00:00:00 2001 From: Roman 
Vashurin Date: Mon, 13 Jan 2025 12:42:29 +0400 Subject: [PATCH 74/97] Save manager state between evaluation steps --- scripts/polygraph_eval | 4 +--- src/lm_polygraph/utils/manager.py | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 2440332b0..a748e8d10 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -228,13 +228,11 @@ def main(args): language=getattr(args, 'language', 'en'), save_stats=getattr(args, 'save_stats', []), entropy_top_k=getattr(args, 'entropy_top_k', None), + save_path=save_path + f"/ue_manager_seed{seed}", ) man() - man.save(save_path + f"/ue_manager_seed{seed}") - - def get_ue_metrics(args): ue_metrics = [ #ReversedPairsProportion(), diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index db3c4a0ec..905ff3818 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -260,6 +260,8 @@ def __init__( cache_path=os.path.expanduser("~") + "/.cache", save_stats: List[str] = [], entropy_top_k: Optional[int] = None, + state: str = 'init', + save_path: Optional[str] = None, ): """ Parameters: @@ -315,6 +317,8 @@ def __init__( self.deberta_batch_size = deberta_batch_size self.deberta_device = deberta_device self.language = language + self.state = state + self.save_path = save_path def prepare_calculators(self): @@ -508,11 +512,17 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: torch.cuda.empty_cache() gc.collect() + self.state = 'post_inference' + self.save() + self.eval_ue() + self.state = 'post_eval' for processor in self.processors: processor.on_eval(self.metrics, self.total_bad_estimators) + self.save() + return self.metrics def eval_ue(self): @@ -680,7 +690,7 @@ def _extract_train_embeddings( return result_train_stat - def save(self, save_path: str): + def save(self): """ Saves the run results in the provided path. 
Will raise exception, if no results are calculated yet. To load the saved manager, see UEManager.load(). @@ -690,14 +700,18 @@ def save(self, save_path: str): """ if len(self.metrics) == 0: raise Exception("Nothing to save. Consider calling manager() first.") + if self.save_path is None: + raise Exception("No save path provided.") + torch.save( { "metrics": self.metrics, "gen_metrics": self.gen_metrics, "estimations": self.estimations, "stats": self.stats, + "state": self.state, }, - save_path, + self.save_path, ) @staticmethod @@ -723,4 +737,6 @@ def load(load_path: str, **kwargs) -> "UEManager": man.gen_metrics = res_dict.get("gen_metrics", None) man.estimations = res_dict.get("estimations", None) man.stats = res_dict.get("stats", None) + man.state = res_dict.get("state", 'init') + return man From c3c63ad8528b2d3c60082a3d50d55d5c572ff796 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 13 Jan 2025 14:26:26 +0400 Subject: [PATCH 75/97] Fix saving --- scripts/polygraph_eval | 2 +- src/lm_polygraph/utils/manager.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index a748e8d10..8421863a5 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -207,6 +207,7 @@ def main(args): language=getattr(args, 'language', 'en'), save_stats=getattr(args, 'save_stats', []), entropy_top_k=getattr(args, 'entropy_top_k', None), + save_path=save_path + f"/ue_manager_seed{seed}", ) else: man = UEManager.load( @@ -228,7 +229,6 @@ def main(args): language=getattr(args, 'language', 'en'), save_stats=getattr(args, 'save_stats', []), entropy_top_k=getattr(args, 'entropy_top_k', None), - save_path=save_path + f"/ue_manager_seed{seed}", ) man() diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 905ff3818..e44a260df 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -698,8 +698,6 @@ def save(self): Parameters: save_path (str): 
Path to file to save benchmark results to. """ - if len(self.metrics) == 0: - raise Exception("Nothing to save. Consider calling manager() first.") if self.save_path is None: raise Exception("No save path provided.") From 6f9fee1c3aa83cfcddfa00d66c2695386314d817 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 14 Jan 2025 13:53:26 +0400 Subject: [PATCH 76/97] Fix stat name issue --- examples/configs/polygraph_eval_coqa_sentsar.yaml | 2 +- examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml | 2 +- examples/configs/polygraph_eval_mmlu_sentsar.yaml | 2 +- examples/configs/polygraph_eval_triviaqa_sentsar.yaml | 2 +- examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml | 2 +- examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml | 2 +- examples/configs/polygraph_eval_xsum_sentsar.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 9963feca3..2d2a80727 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -46,7 +46,7 @@ save_stats: - best_sample_texts - best_sample_text_ids - best_normalized_sample_texts - - best_normalized_sample_texts_ids + - best_normalized_sample_text_ids entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index f282769b4..7357f6153 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -47,7 +47,7 @@ save_stats: - best_sample_texts - best_sample_text_ids - best_normalized_sample_texts - - best_normalized_sample_texts_ids + - best_normalized_sample_text_ids entropy_top_k: 50 target_ignore_regex: "(?s).*#### " diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 5c88ba80e..211dbd4a0 100644 
--- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -48,7 +48,7 @@ save_stats: - best_sample_texts - best_sample_text_ids - best_normalized_sample_texts - - best_normalized_sample_texts_ids + - best_normalized_sample_text_ids entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index b639b9cfa..532a29a5e 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -48,7 +48,7 @@ save_stats: - best_sample_texts - best_sample_text_ids - best_normalized_sample_texts - - best_normalized_sample_texts_ids + - best_normalized_sample_text_ids entropy_top_k: 50 train_dataset: null diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index f2def77bf..8ab1e65bb 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -46,7 +46,7 @@ save_stats: - best_sample_texts - best_sample_text_ids - best_normalized_sample_texts - - best_normalized_sample_texts_ids + - best_normalized_sample_text_ids entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index fbe97acfe..8fca6ce1c 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -46,7 +46,7 @@ save_stats: - best_sample_texts - best_sample_text_ids - best_normalized_sample_texts - - best_normalized_sample_texts_ids + - best_normalized_sample_text_ids entropy_top_k: 50 source_ignore_regex: "(?s).*Original:\n(.*?)\nTranslation:\n" diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml 
b/examples/configs/polygraph_eval_xsum_sentsar.yaml index cce8d376d..f3efb2356 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -47,7 +47,7 @@ save_stats: - best_sample_texts - best_sample_text_ids - best_normalized_sample_texts - - best_normalized_sample_texts_ids + - best_normalized_sample_text_ids entropy_top_k: 50 train_dataset: null From 3e7b953e7e20d94f8eb9e8c6ffc28bccb0197c86 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 15 Jan 2025 15:32:40 +0400 Subject: [PATCH 77/97] Fix tokensars, add greedy-based method to save stat --- .../configs/polygraph_eval_coqa_sentsar.yaml | 4 ++++ .../polygraph_eval_gsm8k_sentsar_cot.yaml | 4 ++++ .../configs/polygraph_eval_mmlu_sentsar.yaml | 4 ++++ .../polygraph_eval_triviaqa_sentsar.yaml | 4 ++++ .../polygraph_eval_wmt14_fren_sentsar.yaml | 4 ++++ .../polygraph_eval_wmt19_deen_sentsar.yaml | 4 ++++ .../configs/polygraph_eval_xsum_sentsar.yaml | 4 ++++ src/lm_polygraph/estimators/average_ue.py | 11 +++++----- src/lm_polygraph/estimators/gsu.py | 5 ++++- src/lm_polygraph/estimators/sar.py | 5 ++++- .../estimators/semantic_average_ue.py | 5 ++++- .../semantic_average_ue_average_similarity.py | 20 ++++++++++++------- .../estimators/semantic_median_ue.py | 5 ++++- src/lm_polygraph/estimators/token_sar.py | 5 ++++- src/lm_polygraph/utils/manager.py | 1 + 15 files changed, 68 insertions(+), 17 deletions(-) diff --git a/examples/configs/polygraph_eval_coqa_sentsar.yaml b/examples/configs/polygraph_eval_coqa_sentsar.yaml index 2d2a80727..9c710a207 100644 --- a/examples/configs/polygraph_eval_coqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_coqa_sentsar.yaml @@ -149,6 +149,10 @@ additional_estimators: class_name: SemanticAveMaxprobAveSimilarity kwargs: {} + - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity + class_name: GreedySemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff 
--git a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml index 7357f6153..dead8a64e 100644 --- a/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml +++ b/examples/configs/polygraph_eval_gsm8k_sentsar_cot.yaml @@ -153,6 +153,10 @@ additional_estimators: class_name: SemanticAveMaxprobAveSimilarity kwargs: {} + - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity + class_name: GreedySemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_mmlu_sentsar.yaml b/examples/configs/polygraph_eval_mmlu_sentsar.yaml index 211dbd4a0..4be9fc43c 100644 --- a/examples/configs/polygraph_eval_mmlu_sentsar.yaml +++ b/examples/configs/polygraph_eval_mmlu_sentsar.yaml @@ -151,6 +151,10 @@ additional_estimators: class_name: SemanticAveMaxprobAveSimilarity kwargs: {} + - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity + class_name: GreedySemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml index 532a29a5e..81e594904 100644 --- a/examples/configs/polygraph_eval_triviaqa_sentsar.yaml +++ b/examples/configs/polygraph_eval_triviaqa_sentsar.yaml @@ -152,6 +152,10 @@ additional_estimators: class_name: SemanticAveMaxprobAveSimilarity kwargs: {} + - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity + class_name: GreedySemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml index 8ab1e65bb..67449b720 100644 --- a/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt14_fren_sentsar.yaml @@ -151,6 +151,10 @@ 
additional_estimators: class_name: SemanticAveMaxprobAveSimilarity kwargs: {} + - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity + class_name: GreedySemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml index 8fca6ce1c..cf9b58fe0 100644 --- a/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml +++ b/examples/configs/polygraph_eval_wmt19_deen_sentsar.yaml @@ -150,6 +150,10 @@ additional_estimators: class_name: SemanticAveMaxprobAveSimilarity kwargs: {} + - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity + class_name: GreedySemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/examples/configs/polygraph_eval_xsum_sentsar.yaml b/examples/configs/polygraph_eval_xsum_sentsar.yaml index f3efb2356..c9dd41a86 100644 --- a/examples/configs/polygraph_eval_xsum_sentsar.yaml +++ b/examples/configs/polygraph_eval_xsum_sentsar.yaml @@ -149,6 +149,10 @@ additional_estimators: class_name: SemanticAveMaxprobAveSimilarity kwargs: {} + - module: lm_polygraph.estimators.greedy_semantic_average_ue_average_similarity + class_name: GreedySemanticAveMaxprobAveSimilarity + kwargs: {} + ignore_exceptions: false batch_size: 1 diff --git a/src/lm_polygraph/estimators/average_ue.py b/src/lm_polygraph/estimators/average_ue.py index a7748e9e6..4ecf9e541 100644 --- a/src/lm_polygraph/estimators/average_ue.py +++ b/src/lm_polygraph/estimators/average_ue.py @@ -5,7 +5,6 @@ from .estimator import Estimator - class AveMaxprob(Estimator): def __init__( self, @@ -80,11 +79,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_sentence_similarity = stats["sample_sentence_similarity"] ave = [] - for batch_data in zip( + for i, batch_data in enumerate(zip( batch_sample_log_likelihoods, 
batch_sample_token_similarity, batch_sample_sentence_similarity, - ): + )): sample_log_likelihoods = batch_data[0] sample_token_similarity = batch_data[1] sample_sentence_similarity = batch_data[2] @@ -95,10 +94,12 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) - ave.append(np.mean(tokenSAR)) return np.array(ave) diff --git a/src/lm_polygraph/estimators/gsu.py b/src/lm_polygraph/estimators/gsu.py index 2969dca95..bdb8e5de4 100644 --- a/src/lm_polygraph/estimators/gsu.py +++ b/src/lm_polygraph/estimators/gsu.py @@ -167,7 +167,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) diff --git a/src/lm_polygraph/estimators/sar.py b/src/lm_polygraph/estimators/sar.py index 57e9c2902..2d7559db1 100644 --- a/src/lm_polygraph/estimators/sar.py +++ b/src/lm_polygraph/estimators/sar.py @@ -63,7 +63,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) diff --git a/src/lm_polygraph/estimators/semantic_average_ue.py b/src/lm_polygraph/estimators/semantic_average_ue.py index d58489d99..161ccec37 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue.py +++ b/src/lm_polygraph/estimators/semantic_average_ue.py @@ -133,7 +133,10 @@ def __call__(self, stats: Dict[str, 
np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py index f33849e98..40fc0a004 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py @@ -142,7 +142,7 @@ def __init__( exp: bool = False, sample_strategy: str = "first" ): - super().__init( + super().__init__( ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS, "sequence" ) @@ -203,7 +203,7 @@ def __init__( exp: bool = False, sample_strategy: str = "first" ): - super().__init( + super().__init__( ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS, "sequence" ) @@ -308,7 +308,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) @@ -389,7 +392,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) @@ -427,7 +433,7 @@ def __init__( verbose: bool = False, sample_strategy: str = "first" ): - super().__init( + super().__init__( ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, "sequence" ) @@ -479,7 +485,7 @@ def 
__init__( verbose: bool = False, sample_strategy: str = "first" ): - super().__init( + super().__init__( ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, "sequence" ) @@ -531,7 +537,7 @@ def __init__( verbose: bool = False, sample_strategy: str = "first" ): - super().__init( + super().__init__( ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, "sequence" ) diff --git a/src/lm_polygraph/estimators/semantic_median_ue.py b/src/lm_polygraph/estimators/semantic_median_ue.py index 0ec7d3274..5c6687608 100644 --- a/src/lm_polygraph/estimators/semantic_median_ue.py +++ b/src/lm_polygraph/estimators/semantic_median_ue.py @@ -134,7 +134,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) diff --git a/src/lm_polygraph/estimators/token_sar.py b/src/lm_polygraph/estimators/token_sar.py index c2695970d..d31c7651b 100644 --- a/src/lm_polygraph/estimators/token_sar.py +++ b/src/lm_polygraph/estimators/token_sar.py @@ -99,7 +99,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) result.append(tokenSAR[best_id]) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index e44a260df..987177e34 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -540,6 +540,7 @@ def eval_ue(self): f"Got different number of metrics for {e_name} and {gen_name}: " f"{len(estimator_values)} and {len(generation_metric)}" ) + # TODO: Report how many 
nans! # This is important to know for a user ue, metric = _delete_nans(estimator_values, generation_metric) From 0bba562922497216ac1d814444c727de9b80aa81 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Mon, 20 Jan 2025 14:55:15 +0400 Subject: [PATCH 78/97] Add rouge sim matrix calculator, some fixes --- src/lm_polygraph/estimators/__init__.py | 2 + ..._semantic_average_ue_average_similarity.py | 10 ++- .../greedy_sum_semantic_entropies.py | 70 +++++++++++++++ .../estimators/sum_semantic_entropies.py | 79 +++++++++++++++++ src/lm_polygraph/estimators/token_sar.py | 5 +- src/lm_polygraph/stat_calculators/__init__.py | 3 + .../stat_calculators/greedy_rouge_matrix.py | 58 ++++++++++++ .../greedy_semantic_matrix.py | 88 +++++++++++++++++++ .../stat_calculators/greedy_similarity.py | 5 +- .../stat_calculators/rouge_matrix.py | 67 ++++++++++++++ .../utils/register_stat_calculators.py | 2 + 11 files changed, 382 insertions(+), 7 deletions(-) create mode 100644 src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py create mode 100644 src/lm_polygraph/estimators/sum_semantic_entropies.py create mode 100644 src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py create mode 100644 src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py create mode 100644 src/lm_polygraph/stat_calculators/rouge_matrix.py diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 357816c71..648d5a1ab 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -106,3 +106,5 @@ GreedySemanticEnrichedMTEAveDissimilarity, ) from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE +from .sum_semantic_entropies import SumSemanticMaxprob, SumSemanticPPL +from .greedy_sum_semantic_entropies import GreedySumSemanticMaxprob, GreedySumSemanticPPL diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py 
b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py index 9cf501ffe..d2b182741 100644 --- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py @@ -211,7 +211,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR = E_t.sum() @@ -271,7 +274,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR = E_t.sum() diff --git a/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py b/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py new file mode 100644 index 000000000..1280dadc0 --- /dev/null +++ b/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py @@ -0,0 +1,70 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids + + +class GreedySumSemanticMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedySumSemanticMaxprob" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) + + enriched_metrics = [] # To store enriched metrics for each sample + for 
class SumSemanticMaxprob(Estimator):
    """
    Sequence-level estimator combining the likelihood of the selected sample
    with its average semantic similarity to the other samples.

    For every input the score is
    ``-sum(sample_log_probs[best_id]) - log(mean(similarity_row))``,
    where ``similarity_row`` is row ``best_id`` of the sample similarity
    matrix with the self-similarity entry forced to 1.
    """

    def __init__(
        self,
        verbose: bool = False,
        sample_strategy: str = "first"
    ):
        """
        Parameters:
            verbose (bool): stored on the instance; not read by this class.
            sample_strategy (str): forwarded to `sample_strategy_to_prefix`
                (estimator name) and `best_sample_ids` (sample selection).
        """
        super().__init__(
            ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS,
            "sequence"
        )
        self.verbose = verbose
        self.sample_strategy = sample_strategy

    def __str__(self):
        base = "SumSemanticMaxprob"
        return sample_strategy_to_prefix(self.sample_strategy) + base

    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
        """
        Compute one score per input in the batch.

        Parameters:
            stats (Dict[str, np.ndarray]): must contain "sample_log_probs" and
                "sample_sentence_similarity", plus whatever `best_sample_ids`
                reads for the configured strategy.
        Returns:
            np.ndarray: array with one value per batch element.
        """
        batch_sample_log_probs = stats["sample_log_probs"]
        batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
        sample_ids = best_sample_ids(self.sample_strategy, stats)

        enriched_metrics = []  # One enriched score per batch element

        for best_id, sample_log_probs, sample_sentence_similarity in zip(
            sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity
        ):
            # Copy the row: plain indexing returns a view, and overwriting the
            # self-similarity entry through it would silently modify the
            # matrix stored in `stats`, which other estimators also read.
            sim = np.array(sample_sentence_similarity[best_id, :])
            sim[best_id] = 1
            avg_similarity = np.mean(sim)
            # NOTE(review): assumes similarities are non-negative so that the
            # mean is positive and the logarithm is finite -- confirm upstream.
            res = -np.sum(sample_log_probs[best_id]) - np.log(avg_similarity)
            enriched_metrics.append(res)

        return np.array(enriched_metrics)
+ res = -np.mean(sample_log_likelihoods[best_id]) - np.log(avg_similarity) + enriched_ppl.append(res) + + return np.array(enriched_ppl) diff --git a/src/lm_polygraph/estimators/token_sar.py b/src/lm_polygraph/estimators/token_sar.py index d31c7651b..0997c110b 100644 --- a/src/lm_polygraph/estimators/token_sar.py +++ b/src/lm_polygraph/estimators/token_sar.py @@ -43,7 +43,10 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ): log_likelihoods = np.array(log_likelihoods) R_t = 1 - token_similarity - R_t_norm = R_t / R_t.sum() + if R_t.sum() == 0: + R_t_norm = np.zeros_like(R_t) + else: + R_t_norm = R_t / R_t.sum() E_t = -log_likelihoods * R_t_norm tokenSAR.append(E_t.sum()) diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index f8c3fa554..b01f9c541 100644 --- a/src/lm_polygraph/stat_calculators/__init__.py +++ b/src/lm_polygraph/stat_calculators/__init__.py @@ -25,3 +25,6 @@ from .extract_claims import ClaimsExtractor from .semantic_classes import SemanticClassesCalculator from .greedy_similarity import GreedySimilarityCalculator +from .greedy_semantic_matrix import GreedySemanticMatrixCalculator +from .rouge_matrix import RougeLSemanticMatrixCalculator +from .greedy_rouge_matrix import GreedyRougeLSemanticMatrixCalculator diff --git a/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py new file mode 100644 index 000000000..ebcc14373 --- /dev/null +++ b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py @@ -0,0 +1,58 @@ +import numpy as np + +import itertools +from typing import Dict, List + +from .stat_calculator import StatCalculator +from lm_polygraph.utils.model import WhiteboxModel +import torch.nn as nn +import torch +from rouge_score import rouge_scorer + +class GreedyRougeLSemanticMatrixCalculator(StatCalculator): + def __init__(self): + super().__init__( + [ + "greedy_semantic_matrix", + ], + ["greedy_texts", 
"sample_texts"], + ) + self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + batch_texts = dependencies["sample_texts"] + batch_greedy_texts = dependencies["greedy_texts"] + + batch_pairs = [] + batch_invs = [] + for texts, greedy_text in zip(batch_texts, batch_greedy_texts): + # Sampling from LLM often produces significant number of identical + # outputs. We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product([greedy_text], unique_texts))) + batch_invs.append(inv) + + + E = [] + + for i, pairs in enumerate(batch_pairs): + sim_mat = [] + for first_texts, second_texts in pairs: + sim_mat.append(self.scorer.score(first_texts, second_texts)['rougeL'].fmeasure) + + sim_mat = np.array(sim_mat) + + inv = batch_invs[i] + E.append(sim_mat[inv]) + + E = np.stack(E) + + return { + "greedy_rouge_semantic_matrix": E, + } diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py new file mode 100644 index 000000000..6185f2d9f --- /dev/null +++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py @@ -0,0 +1,88 @@ +import numpy as np + +import itertools +from typing import Dict, List + +from .stat_calculator import StatCalculator +from lm_polygraph.utils.model import WhiteboxModel +import torch.nn as nn +import torch + +softmax = nn.Softmax(dim=1) + + +class GreedySemanticMatrixCalculator(StatCalculator): + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. 
+ """ + + def __init__(self, nli_model): + super().__init__( + [ + "greedy_semantic_matrix_entail", + "greedy_semantic_matrix_contra", + ], + ["greedy_texts", "sample_texts"], + ) + self.is_deberta_setup = False + self.nli_model = nli_model + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + deberta = self.nli_model + deberta_batch_size = deberta.batch_size + + batch_texts = dependencies["sample_texts"] + batch_greedy_texts = dependencies["greedy_texts"] + + batch_pairs = [] + batch_invs = [] + for texts, greedy_text in zip(batch_texts, batch_greedy_texts): + # Sampling from LLM often produces significant number of identical + # outputs. We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product([greedy_text], unique_texts))) + batch_invs.append(inv) + + device = deberta.device + ent_id = deberta.deberta.config.label2id["ENTAILMENT"] + contra_id = deberta.deberta.config.label2id["CONTRADICTION"] + + softmax = nn.Softmax(dim=1) + tokenizer = deberta.deberta_tokenizer + + E = [] + C = [] + + for i, pairs in enumerate(batch_pairs): + dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size) + probs = [] + for first_texts, second_texts in dl: + batch = list(zip(first_texts, second_texts)) + encoded = tokenizer.batch_encode_plus( + batch, padding=True, return_tensors="pt" + ).to(device) + logits = deberta.deberta(**encoded).logits.detach().to(device) + probs.append(softmax(logits).cpu().detach()) + probs = torch.cat(probs, dim=0) + + inv = batch_invs[i] + + entail_probs = probs[:, ent_id] + contra_probs = probs[:, contra_id] + + E.append(entail_probs[inv].numpy()) + C.append(contra_probs[inv].numpy()) + + E = np.stack(E) + C = np.stack(C) + + return { + "greedy_semantic_matrix_entail": E, + "greedy_semantic_matrix_contra": C, + } diff --git 
a/src/lm_polygraph/stat_calculators/greedy_similarity.py b/src/lm_polygraph/stat_calculators/greedy_similarity.py index 04eae17bf..c3da31778 100644 --- a/src/lm_polygraph/stat_calculators/greedy_similarity.py +++ b/src/lm_polygraph/stat_calculators/greedy_similarity.py @@ -19,7 +19,7 @@ def __init__(self, nli_model): [ "greedy_sentence_similarity", ], - ["input_texts", "sample_tokens", "sample_texts", "greedy_tokens", "greedy_texts"], + ["input_texts", "sample_texts", "greedy_texts"], ) self.crossencoder_setup = False @@ -44,16 +44,13 @@ def __call__( self._setup(device=device) self.crossencoder_setup = True - batch_sample_tokens = dependencies["sample_tokens"] batch_texts = dependencies["sample_texts"] deberta_batch_size = ( self.nli_model.batch_size ) batch_input_texts = dependencies["input_texts"] - batch_greedy_tokens = dependencies["greedy_tokens"] batch_greedy_texts = dependencies["greedy_texts"] - special_tokens = list(model.tokenizer.added_tokens_decoder.keys()) batch_pairs = [] batch_invs = [] diff --git a/src/lm_polygraph/stat_calculators/rouge_matrix.py b/src/lm_polygraph/stat_calculators/rouge_matrix.py new file mode 100644 index 000000000..f99c819ae --- /dev/null +++ b/src/lm_polygraph/stat_calculators/rouge_matrix.py @@ -0,0 +1,67 @@ +import numpy as np + +import itertools +from typing import Dict, List + +from .stat_calculator import StatCalculator +from lm_polygraph.utils.model import WhiteboxModel +import torch.nn as nn +import torch +from rouge_score import rouge_scorer + + +class RougeLSemanticMatrixCalculator(StatCalculator): + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. 
+ """ + + def __init__(self): + super().__init__( + [ + "rouge_semantic_matrix", + ], + ["sample_texts"], + ) + self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + + batch_texts = dependencies["sample_texts"] + + batch_pairs = [] + batch_invs = [] + batch_counts = [] + for texts in batch_texts: + # Sampling from LLM often produces significant number of identical + # outputs. We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product(unique_texts, unique_texts))) + batch_invs.append(inv) + batch_counts.append(len(unique_texts)) + + E = [] + + for i, pairs in enumerate(batch_pairs): + sim_mat = [] + for first_texts, second_texts in pairs: + sim_mat.append(self.scorer.score(first_texts, second_texts)['rougeL'].fmeasure) + + sim_mat = np.array(sim_mat) + unique_mat_shape = (batch_counts[i], batch_counts[i]) + sim_mat = sim_mat.reshape(unique_mat_shape) + + inv = batch_invs[i] + + E.append(sim_mat[inv, :][:, inv]) + + E = np.stack(E) + + return { + "rouge_semantic_matrix": E, + } diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index 926c6b164..fa96605e1 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -75,6 +75,8 @@ def _register(calculator_class: StatCalculator): _register(EnsembleTokenLevelDataCalculator()) _register(CrossEncoderSimilarityMatrixCalculator(nli_model=nli_model)) _register(GreedySimilarityCalculator(nli_model=nli_model)) + _register(RougeLSemanticMatrixCalculator()) + _register(GreedyRougeLSemanticMatrixCalculator()) _register(GreedyAlternativesNLICalculator(nli_model=nli_model)) 
_register(SampleAlternativesNLICalculator(nli_model=nli_model)) _register(GreedyAlternativesFactPrefNLICalculator(nli_model=nli_model)) From 5701a477a4d9c96149d57b74fea8e2490be990c0 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 22 Jan 2025 01:24:01 +0400 Subject: [PATCH 79/97] Add align matrix --- src/lm_polygraph/stat_calculators/__init__.py | 2 + .../stat_calculators/align_matrix.py | 83 +++++++++++++++++++ .../stat_calculators/greedy_align_matrix.py | 73 ++++++++++++++++ .../stat_calculators/greedy_rouge_matrix.py | 2 +- .../greedy_semantic_matrix.py | 43 ++++++---- .../stat_calculators/greedy_similarity.py | 16 +++- 6 files changed, 202 insertions(+), 17 deletions(-) create mode 100644 src/lm_polygraph/stat_calculators/align_matrix.py create mode 100644 src/lm_polygraph/stat_calculators/greedy_align_matrix.py diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index b01f9c541..7f7e37aef 100644 --- a/src/lm_polygraph/stat_calculators/__init__.py +++ b/src/lm_polygraph/stat_calculators/__init__.py @@ -28,3 +28,5 @@ from .greedy_semantic_matrix import GreedySemanticMatrixCalculator from .rouge_matrix import RougeLSemanticMatrixCalculator from .greedy_rouge_matrix import GreedyRougeLSemanticMatrixCalculator +from .align_matrix import AlignMatrixCalculator +from .greedy_align_matrix import GreedyAlignMatrixCalculator diff --git a/src/lm_polygraph/stat_calculators/align_matrix.py b/src/lm_polygraph/stat_calculators/align_matrix.py new file mode 100644 index 000000000..9be56ddd3 --- /dev/null +++ b/src/lm_polygraph/stat_calculators/align_matrix.py @@ -0,0 +1,83 @@ +import numpy as np + +import itertools +from typing import Dict, List + +from .stat_calculator import StatCalculator +from lm_polygraph.utils.model import WhiteboxModel +import torch.nn as nn +import torch + + +class AlignMatrixCalculator(StatCalculator): + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. 
+ """ + + def __init__(self, scorer): + super().__init__( + [ + "align_semantic_matrix", + ], + ["sample_texts"], + ) + self.scorer = scorer + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. + + Parameters: + dependencies (Dict[str, np.ndarray]): input statistics, containing: + - 'sample_texts' (List[List[str]]): several sampling generations + for each input text in the batch. + texts (List[str]): Input texts batch used for model generation. + model (Model): Model used for generation. + max_new_tokens (int): Maximum number of new tokens at model generation. Default: 100. + Returns: + Dict[str, np.ndarray]: dictionary with the following items: + - 'semantic_matrix_entail' (List[np.array]): for each input text: quadratic matrix of size + n_samples x n_samples, with probabilities of 'ENTAILMENT' output of DeBERTa. + - 'semantic_matrix_contra' (List[np.array]): for each input text: quadratic matrix of size + n_samples x n_samples, with probabilities of 'CONTRADICTION' output of DeBERTa. + - 'semantic_matrix_classes' (List[np.array]): for each input text: quadratic matrix of size + n_samples x n_samples, with the NLI label id corresponding to the DeBERTa prediction. + """ + batch_texts = dependencies["sample_texts"] + + batch_pairs = [] + batch_invs = [] + batch_counts = [] + for texts in batch_texts: + # Sampling from LLM often produces significant number of identical + # outputs. 
We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product(unique_texts, unique_texts))) + batch_invs.append(inv) + batch_counts.append(len(unique_texts)) + + E = [] + + for i, pairs in enumerate(batch_pairs): + first_texts, second_texts = zip(*pairs) + sim_mat = np.array(self.scorer.score(claims=first_texts, contexts=second_texts)) + + unique_mat_shape = (batch_counts[i], batch_counts[i]) + + sim_mat = sim_mat.reshape(unique_mat_shape) + + inv = batch_invs[i] + + E.append(sim_mat[inv, :][:, inv]) + + E = np.stack(E) + + return { + "align_semantic_matrix": E, + } diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py new file mode 100644 index 000000000..c7a6be6ee --- /dev/null +++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py @@ -0,0 +1,73 @@ +import numpy as np + +import itertools +from typing import Dict, List + +from .stat_calculator import StatCalculator +from lm_polygraph.utils.model import WhiteboxModel +import torch.nn as nn +import torch + + +class GreedyAlignMatrixCalculator(StatCalculator): + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. + """ + + def __init__(self, scorer): + super().__init__( + [ + "greedy_align_semantic_matrix_forward", + "greedy_align_semantic_matrix_backward", + "greedy_align_semantic_matrix", + ], + ["greedy_texts", "sample_texts"], + ) + self.scorer = scorer + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + batch_texts = dependencies["sample_texts"] + batch_greedy_texts = dependencies["greedy_texts"] + + batch_pairs = [] + batch_invs = [] + for texts, greedy_text in zip(batch_texts, batch_greedy_texts): + # Sampling from LLM often produces significant number of identical + # outputs. 
We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product([greedy_text], unique_texts))) + batch_invs.append(inv) + + E_f = [] + E_b = [] + E = [] + + for i, pairs in enumerate(batch_pairs): + sim_mat_f = [] + sim_mat_b = [] + first_texts, second_texts = zip(*pairs) + sim_mat_f = np.array(self.scorer.score(claims=first_texts, contexts=second_texts)) + sim_mat_b = np.array(self.scorer.score(claims=second_texts, contexts=first_texts)) + + inv = batch_invs[i] + + E_f.append(sim_mat_f[inv]) + E_b.append(sim_mat_b[inv]) + E.append((sim_mat_f[inv] + sim_mat_b[inv]) / 2) + + + E_f = np.stack(E_f) + E_b = np.stack(E_b) + E = np.stack(E) + + return { + "greedy_align_semantic_matrix_forward": E_f, + "greedy_align_semantic_matrix_backward": E_b, + "greedy_align_semantic_matrix": E, + } diff --git a/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py index ebcc14373..c863b3a43 100644 --- a/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_rouge_matrix.py @@ -13,7 +13,7 @@ class GreedyRougeLSemanticMatrixCalculator(StatCalculator): def __init__(self): super().__init__( [ - "greedy_semantic_matrix", + "greedy_rouge_semantic_matrix", ], ["greedy_texts", "sample_texts"], ) diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py index 6185f2d9f..d4ae280ef 100644 --- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py @@ -19,8 +19,9 @@ class GreedySemanticMatrixCalculator(StatCalculator): def __init__(self, nli_model): super().__init__( [ - "greedy_semantic_matrix_entail", - "greedy_semantic_matrix_contra", + "greedy_semantic_matrix_forward", + "greedy_semantic_matrix_backward", + "greedy_semantic_matrix", ], 
["greedy_texts", "sample_texts"], ) @@ -51,38 +52,52 @@ def __call__( device = deberta.device ent_id = deberta.deberta.config.label2id["ENTAILMENT"] - contra_id = deberta.deberta.config.label2id["CONTRADICTION"] softmax = nn.Softmax(dim=1) tokenizer = deberta.deberta_tokenizer + E_f = [] + E_b = [] E = [] - C = [] for i, pairs in enumerate(batch_pairs): dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size) - probs = [] + probs_f = [] + probs_b = [] + for first_texts, second_texts in dl: batch = list(zip(first_texts, second_texts)) encoded = tokenizer.batch_encode_plus( batch, padding=True, return_tensors="pt" ).to(device) logits = deberta.deberta(**encoded).logits.detach().to(device) - probs.append(softmax(logits).cpu().detach()) - probs = torch.cat(probs, dim=0) + probs_f.append(softmax(logits).cpu().detach()) + + batch = list(zip(second_texts, first_texts)) + encoded = tokenizer.batch_encode_plus( + batch, padding=True, return_tensors="pt" + ).to(device) + logits = deberta.deberta(**encoded).logits.detach().to(device) + probs_b.append(softmax(logits).cpu().detach()) + + probs_f = torch.cat(probs_f, dim=0) + probs_b = torch.cat(probs_b, dim=0) inv = batch_invs[i] - entail_probs = probs[:, ent_id] - contra_probs = probs[:, contra_id] + entail_probs_f = probs_f[:, ent_id] + entail_probs_b = probs_b[:, ent_id] - E.append(entail_probs[inv].numpy()) - C.append(contra_probs[inv].numpy()) + E_f.append(entail_probs_f[inv].numpy()) + E_b.append(entail_probs_b[inv].numpy()) + E.append((entail_probs_f[inv].numpy() + entail_probs_b[inv].numpy()) / 2) + E_f = np.stack(E_f) + E_b = np.stack(E_b) E = np.stack(E) - C = np.stack(C) return { - "greedy_semantic_matrix_entail": E, - "greedy_semantic_matrix_contra": C, + "greedy_semantic_matrix_forward": E_f, + "greedy_semantic_matrix_backward": E_b, + "greedy_semantic_matrix": E, } diff --git a/src/lm_polygraph/stat_calculators/greedy_similarity.py b/src/lm_polygraph/stat_calculators/greedy_similarity.py index 
c3da31778..cf2435985 100644 --- a/src/lm_polygraph/stat_calculators/greedy_similarity.py +++ b/src/lm_polygraph/stat_calculators/greedy_similarity.py @@ -17,6 +17,8 @@ class GreedySimilarityCalculator(StatCalculator): def __init__(self, nli_model): super().__init__( [ + "greedy_sentence_similarity_forward", + "greedy_sentence_similarity_backward", "greedy_sentence_similarity", ], ["input_texts", "sample_texts", "greedy_texts"], @@ -62,16 +64,26 @@ def __call__( batch_pairs.append(list(itertools.product([greedy_text], unique_texts))) batch_invs.append(inv) + sim_arrays_f = [] + sim_arrays_b = [] sim_arrays = [] for i, pairs in tqdm(enumerate(batch_pairs)): - sim_scores = self.crossencoder.predict(pairs, batch_size=deberta_batch_size) + pairs_b = [(b, a) for a, b in pairs] + sim_scores_f = self.crossencoder.predict(pairs, batch_size=deberta_batch_size) + sim_scores_b = self.crossencoder.predict(pairs_b, batch_size=deberta_batch_size) inv = batch_invs[i] - sim_arrays.append(sim_scores[inv]) + sim_arrays_f.append(sim_scores_f[inv]) + sim_arrays_b.append(sim_scores_b[inv]) + sim_arrays.append((sim_scores_f[inv] + sim_scores_b[inv]) / 2) + sim_arrays_f = np.stack(sim_arrays_f) + sim_arrays_b = np.stack(sim_arrays_b) sim_arrays = np.stack(sim_arrays) return { + "greedy_sentence_similarity_forward": sim_arrays_f, + "greedy_sentence_similarity_backward": sim_arrays_b, "greedy_sentence_similarity": sim_arrays, } From 7ab3691e2dd2951c4ea1a8282870683ea5fde3f4 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 22 Jan 2025 01:34:09 +0400 Subject: [PATCH 80/97] add tqdm --- src/lm_polygraph/stat_calculators/align_matrix.py | 3 ++- src/lm_polygraph/stat_calculators/greedy_align_matrix.py | 3 ++- src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py | 3 ++- src/lm_polygraph/stat_calculators/semantic_matrix.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/align_matrix.py 
b/src/lm_polygraph/stat_calculators/align_matrix.py index 9be56ddd3..4df7fa1f8 100644 --- a/src/lm_polygraph/stat_calculators/align_matrix.py +++ b/src/lm_polygraph/stat_calculators/align_matrix.py @@ -7,6 +7,7 @@ from lm_polygraph.utils.model import WhiteboxModel import torch.nn as nn import torch +from tqdm import tqdm class AlignMatrixCalculator(StatCalculator): @@ -64,7 +65,7 @@ def __call__( E = [] - for i, pairs in enumerate(batch_pairs): + for i, pairs in tqdm(enumerate(batch_pairs)): first_texts, second_texts = zip(*pairs) sim_mat = np.array(self.scorer.score(claims=first_texts, contexts=second_texts)) diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py index c7a6be6ee..eb767f3ac 100644 --- a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py @@ -7,6 +7,7 @@ from lm_polygraph.utils.model import WhiteboxModel import torch.nn as nn import torch +from tqdm import tqdm class GreedyAlignMatrixCalculator(StatCalculator): @@ -48,7 +49,7 @@ def __call__( E_b = [] E = [] - for i, pairs in enumerate(batch_pairs): + for i, pairs in tqdm(enumerate(batch_pairs)): sim_mat_f = [] sim_mat_b = [] first_texts, second_texts = zip(*pairs) diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py index d4ae280ef..a5e5cc9df 100644 --- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py @@ -7,6 +7,7 @@ from lm_polygraph.utils.model import WhiteboxModel import torch.nn as nn import torch +from tqdm import tqdm softmax = nn.Softmax(dim=1) @@ -65,7 +66,7 @@ def __call__( probs_f = [] probs_b = [] - for first_texts, second_texts in dl: + for first_texts, second_texts in tqdm(dl): batch = list(zip(first_texts, second_texts)) encoded = tokenizer.batch_encode_plus( batch, padding=True, 
return_tensors="pt" diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index 8a6862f9d..c8ede60a6 100644 --- a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -7,6 +7,7 @@ from lm_polygraph.utils.model import WhiteboxModel import torch.nn as nn import torch +from tqdm import tqdm softmax = nn.Softmax(dim=1) @@ -82,7 +83,7 @@ def __call__( C = [] P = [] - for i, pairs in enumerate(batch_pairs): + for i, pairs in tqdm(enumerate(batch_pairs)): dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size) probs = [] for first_texts, second_texts in dl: From 90eacd1d80be916d279f5e9837a4bb8ebda533ef Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 22 Jan 2025 01:57:51 +0400 Subject: [PATCH 81/97] Fix issue with empty samples --- src/lm_polygraph/stat_calculators/align_matrix.py | 1 + src/lm_polygraph/stat_calculators/greedy_align_matrix.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/lm_polygraph/stat_calculators/align_matrix.py b/src/lm_polygraph/stat_calculators/align_matrix.py index 4df7fa1f8..a65bfad76 100644 --- a/src/lm_polygraph/stat_calculators/align_matrix.py +++ b/src/lm_polygraph/stat_calculators/align_matrix.py @@ -58,6 +58,7 @@ def __call__( for texts in batch_texts: # Sampling from LLM often produces significant number of identical # outputs. 
We only need to score pairs of unqiue outputs + texts = [text if text.strip() != "" else "" for text in texts] unique_texts, inv = np.unique(texts, return_inverse=True) batch_pairs.append(list(itertools.product(unique_texts, unique_texts))) batch_invs.append(inv) diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py index eb767f3ac..a467e92e6 100644 --- a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py @@ -41,6 +41,7 @@ def __call__( for texts, greedy_text in zip(batch_texts, batch_greedy_texts): # Sampling from LLM often produces significant number of identical # outputs. We only need to score pairs of unqiue outputs + texts = [text if text.strip() != "" else "" for text in texts] unique_texts, inv = np.unique(texts, return_inverse=True) batch_pairs.append(list(itertools.product([greedy_text], unique_texts))) batch_invs.append(inv) From 823ba26b9c46af4deacdb321d59363417149483b Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Wed, 29 Jan 2025 19:22:27 +0400 Subject: [PATCH 82/97] Add ablation-related methods --- src/lm_polygraph/estimators/__init__.py | 10 +- .../estimators/adj_sum_semantic_entropies.py | 213 ++++++++++++++++++ ..._semantic_average_ue_average_similarity.py | 27 +++ .../greedy_sum_semantic_entropies.py | 70 ------ src/lm_polygraph/estimators/prob_cocoa.py | 144 ++++++++++++ .../semantic_average_ue_average_similarity.py | 1 - .../estimators/sum_semantic_entropies.py | 142 +++++++++++- .../supervised_sum_semantic_entropies.py | 179 +++++++++++++++ .../stat_calculators/semantic_matrix.py | 12 +- .../utils/register_stat_calculators.py | 19 +- 10 files changed, 725 insertions(+), 92 deletions(-) create mode 100644 src/lm_polygraph/estimators/adj_sum_semantic_entropies.py delete mode 100644 src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py create mode 100644 
src/lm_polygraph/estimators/prob_cocoa.py create mode 100644 src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 648d5a1ab..ea76573dc 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -104,7 +104,13 @@ GreedySemanticEnrichedTokenSARAveDissimilarity , GreedySemanticEnrichedMaxprobAveDissimilarity, GreedySemanticEnrichedMTEAveDissimilarity, + GreedyAveDissimilarity ) from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE -from .sum_semantic_entropies import SumSemanticMaxprob, SumSemanticPPL -from .greedy_sum_semantic_entropies import GreedySumSemanticMaxprob, GreedySumSemanticPPL + +from .sum_semantic_entropies import SumSemanticMaxprob, SumSemanticPPL, SumSemanticMTE, GreedySumSemanticMaxprob, GreedySumSemanticPPL, GreedySumSemanticMTE +from .adj_sum_semantic_entropies import AdjustedSumSemanticMaxprob, AdjustedSumSemanticPPL, AdjustedSumSemanticMTE, GreedyAdjustedSumSemanticMaxprob, GreedyAdjustedSumSemanticPPL, GreedyAdjustedSumSemanticMTE + +from .prob_cocoa import ProbCocoaMaxprob, ProbCocoaPPL, GreedyProbCocoaMaxprob, GreedyProbCocoaPPL + +from .supervised_sum_semantic_entropies import SupSumSemanticMaxprob, SupSumSemanticPPL, SupSumSemanticMTE, GreedySupSumSemanticMaxprob, GreedySupSumSemanticPPL, GreedySupSumSemanticMTE diff --git a/src/lm_polygraph/estimators/adj_sum_semantic_entropies.py b/src/lm_polygraph/estimators/adj_sum_semantic_entropies.py new file mode 100644 index 000000000..1d780ca1c --- /dev/null +++ b/src/lm_polygraph/estimators/adj_sum_semantic_entropies.py @@ -0,0 +1,213 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS + + +class 
AdjustedSumSemanticMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first" + ): + super().__init__( + ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + + def __str__(self): + base = "AdjustedSumSemanticMaxprob" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) + + enriched_metrics = [] # To store enriched metrics for each sample + + for best_id, sample_log_probs, sample_sentence_similarity in zip( + sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity + ): + sim = 1 - sample_sentence_similarity[best_id, :] + sim[best_id] = 1 + avg_similarity = np.mean(sim) + mp = -np.sum(sample_log_probs[best_id]) + res = mp + avg_similarity * mp + enriched_metrics.append(res) + + return np.array(enriched_metrics) + + +class AdjustedSumSemanticPPL(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first" + ): + super().__init__( + ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + + def __str__(self): + base = "AdjustedSumSemanticPPL" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) + + enriched_ppl = [] # To store enriched PPL for each sample + + for best_id, sample_log_likelihoods, sample_sentence_similarity in 
zip( + sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + sim = 1 - sample_sentence_similarity[best_id, :] + sim[best_id] = 1 + avg_similarity = np.mean(sim) + ppl = -np.mean(sample_log_likelihoods[best_id]) + res = ppl + avg_similarity * ppl + enriched_ppl.append(res) + + return np.array(enriched_ppl) + + +class AdjustedSumSemanticMTE(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first" + ): + super().__init__( + ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + + def __str__(self): + base = "AdjustedSumSemanticMTE" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_entropies = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) + + enriched_mte = [] + + for best_id, sample_entropies, sample_sentence_similarity in zip( + sample_ids, batch_entropies, batch_sample_sentence_similarity + ): + sim = 1 - sample_sentence_similarity[best_id, :] + sim[best_id] = 1 + avg_similarity = np.mean(sim) + mte = sample_entropies[best_id] + res = mte + avg_similarity * mte + enriched_mte.append(res) + + return np.array(enriched_mte) + + +class GreedyAdjustedSumSemanticMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedyAdjustedSumSemanticMaxprob" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) + + enriched_metrics = [] # To store enriched metrics 
for each sample + for greedy_ll, greedy_sentence_similarity in zip( + batch_lls, batch_greedy_sentence_similarity + ): + # Compute probabilities (negative log-probs) + prob = -greedy_ll + + # Compute row-wise average similarity, excluding self-similarity + # Diagonal contains self-similarities + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_metrics.append(prob + avg_similarity * prob) + + return np.array(enriched_metrics) + + +class GreedyAdjustedSumSemanticPPL(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedyAdjustedSumSemanticPPL" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_ppl = [] # To store enriched PPL for each sample + + for greedy_log_likelihoods, greedy_sentence_similarity in zip( + batch_greedy_log_likelihoods, batch_greedy_sentence_similarity + ): + # get PPL for each sample + ppl = -np.mean(greedy_log_likelihoods) + + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_ppl.append(ppl + avg_similarity * ppl) + + + return np.array(enriched_ppl) + + +class GreedyAdjustedSumSemanticMTE(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedyAdjustedSumSemanticMTE" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_entropies = stats["greedy_log_likelihoods"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_mte = [] # To store enriched PPL for each sample + + for greedy_entropies, 
greedy_sentence_similarity in zip( + batch_greedy_entropies, batch_greedy_sentence_similarity + ): + # get PPL for each sample + mte = np.mean(greedy_entropies) + + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_mte.append(mte + avg_similarity * mte) + + + return np.array(enriched_mte) diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py index d2b182741..69611957b 100644 --- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py @@ -355,3 +355,30 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: enriched_entropy.append(enriched_value) return np.array(enriched_entropy) + + +class GreedyAveDissimilarity(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedyAveDissimilarity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_entropy = stats["entropy"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + res = [] + + for greedy_entropy, greedy_sentence_similarity in zip( + batch_greedy_entropy, batch_greedy_sentence_similarity + ): + # Compute row-wise average similarity, excluding self-similarity + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) + res.append(avg_dissimilarity) + + return np.array(res) diff --git a/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py b/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py deleted file mode 100644 index 1280dadc0..000000000 --- a/src/lm_polygraph/estimators/greedy_sum_semantic_entropies.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np - -from typing import Dict -from copy 
import deepcopy - -from .estimator import Estimator -from .common import sample_strategy_to_prefix, best_sample_ids - - -class GreedySumSemanticMaxprob(Estimator): - def __init__( - self, - verbose: bool = False, - ): - super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") - self.verbose = verbose - - def __str__(self): - return "GreedySumSemanticMaxprob" - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] - batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) - - enriched_metrics = [] # To store enriched metrics for each sample - for greedy_ll, greedy_sentence_similarity in zip( - batch_lls, batch_greedy_sentence_similarity - ): - # Compute probabilities (negative log-probs) - prob = -greedy_ll - - # Compute row-wise average similarity, excluding self-similarity - # Diagonal contains self-similarities - avg_similarity = np.mean(greedy_sentence_similarity) - - enriched_metrics.append(prob - np.log(avg_similarity)) - - return np.array(enriched_metrics) - - -class GreedySumSemanticPPL(Estimator): - def __init__( - self, - verbose: bool = False, - ): - super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") - self.verbose = verbose - - def __str__(self): - return "GreedySumSemanticPPL" - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] - batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] - - enriched_ppl = [] # To store enriched PPL for each sample - - for greedy_log_likelihoods, greedy_sentence_similarity in zip( - batch_greedy_log_likelihoods, batch_greedy_sentence_similarity - ): - # get PPL for each sample - ppl = -np.mean(greedy_log_likelihoods) - - # Compute row-wise average similarity, excluding self-similarity - avg_similarity = 
np.mean(greedy_sentence_similarity) - - enriched_ppl.append(ppl - np.log(avg_similarity)) - - - return np.array(enriched_ppl) diff --git a/src/lm_polygraph/estimators/prob_cocoa.py b/src/lm_polygraph/estimators/prob_cocoa.py new file mode 100644 index 000000000..cf3483fac --- /dev/null +++ b/src/lm_polygraph/estimators/prob_cocoa.py @@ -0,0 +1,144 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS + + +class ProbCocoaMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first" + ): + super().__init__( + ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + + def __str__(self): + base = "ProbCocoaMaxprob" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) + + enriched_metrics = [] # To store enriched metrics for each sample + + for best_id, sample_log_probs, sample_sentence_similarity in zip( + sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity + ): + sim = 1 - sample_sentence_similarity[best_id, :] + sim[best_id] = 1 + avg_similarity = np.mean(sim) + mp = 1 - np.exp(np.sum(sample_log_probs[best_id])) + res = mp * avg_similarity + enriched_metrics.append(res) + + return np.array(enriched_metrics) + + +class ProbCocoaPPL(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first" + ): + super().__init__( + ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + 
self.sample_strategy = sample_strategy + + def __str__(self): + base = "ProbCocoaPPL" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_likelihoods = stats["sample_log_likelihoods"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) + + enriched_ppl = [] # To store enriched PPL for each sample + + for best_id, sample_log_likelihoods, sample_sentence_similarity in zip( + sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity + ): + sim = 1 - sample_sentence_similarity[best_id, :] + sim[best_id] = 1 + avg_similarity = np.mean(sim) + ppl = 1 - np.exp(np.mean(sample_log_likelihoods[best_id])) + res = ppl * avg_similarity + enriched_ppl.append(res) + + return np.array(enriched_ppl) + + +class GreedyProbCocoaMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedyProbCocoaMaxprob" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) + + enriched_metrics = [] # To store enriched metrics for each sample + for greedy_ll, greedy_sentence_similarity in zip( + batch_lls, batch_greedy_sentence_similarity + ): + # Compute probabilities (negative log-probs) + prob = 1 - np.exp(greedy_ll) + + # Compute row-wise average similarity, excluding self-similarity + # Diagonal contains self-similarities + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_metrics.append(prob * avg_similarity) + + return np.array(enriched_metrics) + + +class GreedyProbCocoaPPL(Estimator): + def __init__( + self, + verbose: bool = 
False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedyProbCocoaPPL" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_ppl = [] # To store enriched PPL for each sample + + for greedy_log_likelihoods, greedy_sentence_similarity in zip( + batch_greedy_log_likelihoods, batch_greedy_sentence_similarity + ): + # get PPL for each sample + ppl = 1 - np.exp(np.mean(greedy_log_likelihoods)) + + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_ppl.append(ppl * avg_similarity) + + + return np.array(enriched_ppl) diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py index 40fc0a004..f10f02925 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py @@ -530,7 +530,6 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(best_elements) - class AveDissimilarity(Estimator): def __init__( self, diff --git a/src/lm_polygraph/estimators/sum_semantic_entropies.py b/src/lm_polygraph/estimators/sum_semantic_entropies.py index 10b19a03b..47f9aad25 100644 --- a/src/lm_polygraph/estimators/sum_semantic_entropies.py +++ b/src/lm_polygraph/estimators/sum_semantic_entropies.py @@ -34,10 +34,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for best_id, sample_log_probs, sample_sentence_similarity in zip( sample_ids, batch_sample_log_probs, batch_sample_sentence_similarity ): - sim = sample_sentence_similarity[best_id, :] + sim = 1 - sample_sentence_similarity[best_id, :] sim[best_id] = 
1 avg_similarity = np.mean(sim) - res = -np.sum(sample_log_probs[best_id]) - np.log(avg_similarity) + mp = -np.sum(sample_log_probs[best_id]) + res = mp + avg_similarity enriched_metrics.append(res) return np.array(enriched_metrics) @@ -70,10 +71,143 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for best_id, sample_log_likelihoods, sample_sentence_similarity in zip( sample_ids, batch_sample_log_likelihoods, batch_sample_sentence_similarity ): - sim = sample_sentence_similarity[best_id, :] + sim = 1 - sample_sentence_similarity[best_id, :] sim[best_id] = 1 avg_similarity = np.mean(sim) - res = -np.mean(sample_log_likelihoods[best_id]) - np.log(avg_similarity) + ppl = -np.mean(sample_log_likelihoods[best_id]) + res = ppl + avg_similarity enriched_ppl.append(res) return np.array(enriched_ppl) + + +class SumSemanticMTE(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first" + ): + super().__init__( + ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + + def __str__(self): + base = "SumSemanticMTE" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_entropies = stats["sample_entropy"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + sample_ids = best_sample_ids(self.sample_strategy, stats) + + enriched_mte = [] + + for best_id, sample_entropies, sample_sentence_similarity in zip( + sample_ids, batch_entropies, batch_sample_sentence_similarity + ): + sim = 1 - sample_sentence_similarity[best_id, :] + sim[best_id] = 1 + avg_similarity = np.mean(sim) + mte = sample_entropies[best_id] + res = mte + avg_similarity + enriched_mte.append(res) + + return np.array(enriched_mte) + + +class GreedySumSemanticMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + ): + 
super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedySumSemanticMaxprob" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) + + enriched_metrics = [] # To store enriched metrics for each sample + for greedy_ll, greedy_sentence_similarity in zip( + batch_lls, batch_greedy_sentence_similarity + ): + # Compute probabilities (negative log-probs) + prob = -greedy_ll + + # Compute row-wise average similarity, excluding self-similarity + # Diagonal contains self-similarities + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_metrics.append(prob + avg_similarity) + + return np.array(enriched_metrics) + + +class GreedySumSemanticPPL(Estimator): + def __init__( + self, + verbose: bool = False, + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedySumSemanticPPL" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_ppl = [] # To store enriched PPL for each sample + + for greedy_log_likelihoods, greedy_sentence_similarity in zip( + batch_greedy_log_likelihoods, batch_greedy_sentence_similarity + ): + # get PPL for each sample + ppl = -np.mean(greedy_log_likelihoods) + + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_ppl.append(ppl + avg_similarity) + + + return np.array(enriched_ppl) + + +class GreedySumSemanticMTE(Estimator): + def __init__( + self, + verbose: bool = False, + ): + 
super().__init__(["greedy_sentence_similarity", "entropy"], "sequence") + self.verbose = verbose + + def __str__(self): + return "GreedySumSemanticMTE" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_greedy_entropies = stats["greedy_log_likelihoods"] + batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + + enriched_mte = [] # To store enriched PPL for each sample + + for greedy_entropies, greedy_sentence_similarity in zip( + batch_greedy_entropies, batch_greedy_sentence_similarity + ): + # get PPL for each sample + mte = np.mean(greedy_entropies) + + # Compute row-wise average similarity, excluding self-similarity + avg_similarity = 1 - np.mean(greedy_sentence_similarity) + + enriched_mte.append(mte + avg_similarity) + + + return np.array(enriched_mte) diff --git a/src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py b/src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py new file mode 100644 index 000000000..06fcd5cd5 --- /dev/null +++ b/src/lm_polygraph/estimators/supervised_sum_semantic_entropies.py @@ -0,0 +1,179 @@ +import numpy as np + +from typing import Dict +from copy import deepcopy + +from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS +from sklearn.preprocessing import MinMaxScaler + + +def get_avg_dissim(sample_sentence_similarity, sample_ids): + batch_avg_similarity = [] + for best_id, sentence_similarity in zip(sample_ids, sample_sentence_similarity): + batch_avg_similarity.append(np.mean(1 - sentence_similarity[best_id, :])) + return batch_avg_similarity + +def normalize_and_enrich(batch_metrics, batch_avg_dissimilarity, alpha): + batch_metrics = MinMaxScaler().fit_transform(np.array(batch_metrics).reshape(-1, 1)).flatten() + batch_avg_dissimilarity = MinMaxScaler().fit_transform(np.array(batch_avg_dissimilarity).reshape(-1, 1)).flatten() + enriched_metrics = [metric + avg_dissimilarity * alpha for metric, 
avg_dissimilarity in zip(batch_metrics, batch_avg_dissimilarity)] + return enriched_metrics + + +class SupSumSemanticMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first", + alpha: int = 1 + ): + super().__init__( + ["sample_sentence_similarity", "sample_log_probs"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + self.alpha = alpha + + def __str__(self): + base = f"SupSumSemanticMaxprob_{self.alpha}" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + sample_ids = best_sample_ids(self.sample_strategy, stats) + + batch_mps = [-np.sum(log_probs[best_id]) for best_id, log_probs in zip(sample_ids, stats["sample_log_probs"])] + batch_avg_dissim = get_avg_dissim(stats["sample_sentence_similarity"], sample_ids) + + enriched_metrics = normalize_and_enrich(batch_mps, batch_avg_dissim, self.alpha) + + return np.array(enriched_metrics) + + +class SupSumSemanticPPL(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first", + alpha: int = 1 + ): + super().__init__( + ["sample_sentence_similarity", "sample_log_likelihoods"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + self.alpha = alpha + + def __str__(self): + base = f"SupSumSemanticPPL_{self.alpha}" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + sample_ids = best_sample_ids(self.sample_strategy, stats) + + batch_ppls = [-np.mean(sample_log_likelihoods[best_id]) for best_id, sample_log_likelihoods in zip(sample_ids, stats["sample_log_likelihoods"])] + batch_avg_dissim = get_avg_dissim(stats["sample_sentence_similarity"], sample_ids) + + enriched_metrics = normalize_and_enrich(batch_ppls, batch_avg_dissim, self.alpha) + + return 
np.array(enriched_metrics) + + +class SupSumSemanticMTE(Estimator): + def __init__( + self, + verbose: bool = False, + sample_strategy: str = "first", + alpha: int = 1 + ): + super().__init__( + ["sample_sentence_similarity", "sample_entropy"] + SAMPLE_SELECTION_STAT_KEYS, + "sequence" + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + self.alpha = alpha + + def __str__(self): + base = f"SupSumSemanticMTE_{self.alpha}" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + sample_ids = best_sample_ids(self.sample_strategy, stats) + + batch_mtes = [entropies[best_id] for best_id, entropies in zip(sample_ids, stats["sample_entropy"])] + batch_avg_dissim = get_avg_dissim(stats["sample_sentence_similarity"], sample_ids) + + enriched_metrics = normalize_and_enrich(batch_mtes, batch_avg_dissim, self.alpha) + + return np.array(enriched_metrics) + + +class GreedySupSumSemanticMaxprob(Estimator): + def __init__( + self, + verbose: bool = False, + alpha: int = 1 + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + self.alpha = alpha + + def __str__(self): + return f"GreedySupSumSemanticMaxprob_{self.alpha}" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) + batch_avg_dissim = [np.mean(1 - sentence_similarity) for sentence_similarity in stats["greedy_sentence_similarity"]] + + enriched_metrics = normalize_and_enrich(batch_lls, batch_avg_dissim, self.alpha) + + return np.array(enriched_metrics) + + +class GreedySupSumSemanticPPL(Estimator): + def __init__( + self, + verbose: bool = False, + alpha: int = 1 + ): + super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + self.verbose = verbose + self.alpha = alpha + + def __str__(self): + return 
f"GreedySupSumSemanticPPL_{self.alpha}" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_ppls = [-np.mean(greedy_log_likelihoods) for greedy_log_likelihoods in stats["greedy_log_likelihoods"]] + batch_avg_dissim = [np.mean(1 - sentence_similarity) for sentence_similarity in stats["greedy_sentence_similarity"]] + + enriched_metrics = normalize_and_enrich(batch_ppls, batch_avg_dissim, self.alpha) + + return np.array(enriched_metrics) + + +class GreedySupSumSemanticMTE(Estimator): + def __init__( + self, + verbose: bool = False, + alpha: int = 1 + ): + super().__init__(["greedy_sentence_similarity", "entropy"], "sequence") + self.verbose = verbose + self.alpha = alpha + + def __str__(self): + return f"GreedySupSumSemanticMTE_{self.alpha}" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + #batch_greedy_entropies = stats["greedy_log_likelihoods"] + #batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_mtes = [np.mean(greedy_entropies) for greedy_entropies in stats["greedy_log_likelihoods"]] + batch_avg_dissim = [np.mean(1 - sentence_similarity) for sentence_similarity in stats["greedy_sentence_similarity"]] + + enriched_metrics = normalize_and_enrich(batch_mtes, batch_avg_dissim, self.alpha) + + return np.array(enriched_metrics) diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index c8ede60a6..57499408a 100644 --- a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -101,17 +101,17 @@ def __call__( unique_mat_shape = (batch_counts[i], batch_counts[i]) - unique_E = entail_probs.view(unique_mat_shape).numpy() - unique_C = contra_probs.view(unique_mat_shape).numpy() - unique_P = class_preds.view(unique_mat_shape).numpy() + unique_E = entail_probs.view(unique_mat_shape) + unique_C = contra_probs.view(unique_mat_shape) + unique_P = class_preds.view(unique_mat_shape) inv 
= batch_invs[i] # Recover full matrices from unques by gathering along both axes # using inverse index - E.append(unique_E[inv, :][:, inv]) - C.append(unique_C[inv, :][:, inv]) - P.append(unique_P[inv, :][:, inv]) + E.append(unique_E.cpu().numpy()[inv, :][:, inv]) + C.append(unique_C.cpu().numpy()[inv, :][:, inv]) + P.append(unique_P.cpu().numpy()[inv, :][:, inv]) E = np.stack(E) C = np.stack(C) diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index fa96605e1..b453c89fb 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -30,15 +30,16 @@ def register_stat_calculators( log.info("=" * 100) log.info("Loading NLI model...") - if language == "en": - nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device) - elif language in ["zh", "ar", "ru"]: - nli_model = MultilingualDeberta( - batch_size=deberta_batch_size, - device=deberta_device, - ) - else: - raise Exception(f"Unsupported language: {language}") + #if language == "en": + # nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device) + #elif language in ["zh", "ar", "ru"]: + # nli_model = MultilingualDeberta( + # batch_size=deberta_batch_size, + # device=deberta_device, + # ) + #else: + # raise Exception(f"Unsupported language: {language}") + nli_model = None log.info("=" * 100) log.info("Initializing stat calculators...") From 59062663dbb8d1d2ab0c1f0af1aa9236007f1b93 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Fri, 31 Jan 2025 12:19:59 +0400 Subject: [PATCH 83/97] Final fixes before submit --- src/lm_polygraph/estimators/__init__.py | 20 +- ..._semantic_average_ue_average_similarity.py | 216 +++----------- .../semantic_average_ue_average_similarity.py | 281 ++++-------------- .../utils/register_stat_calculators.py | 18 +- 4 files changed, 120 insertions(+), 415 deletions(-) diff --git a/src/lm_polygraph/estimators/__init__.py 
b/src/lm_polygraph/estimators/__init__.py index ea76573dc..254788487 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -85,25 +85,21 @@ from .average_ue import AveMaxprob, AvePPL, AveTokenSAR, AveMTE from .semantic_average_ue import SemanticAveMaxprob, SemanticAvePPL, SemanticAveTokenSAR, SemanticAveMTE from .semantic_average_ue_average_similarity import ( - SemanticAveMaxprobAveSimilarity, - SemanticAvePPLAveSimilarity, - SemanticAveTokenSARAveSimilarity, - SemanticAveMTEAveSimilarity, - SemanticEnrichedPPLAveDissimilarity, - SemanticEnrichedTokenSARAveDissimilarity , SemanticEnrichedMaxprobAveDissimilarity, + SemanticEnrichedPPLAveDissimilarity, SemanticEnrichedMTEAveDissimilarity, + SemanticEnrichedMaxprobTotalDissimilarity, + SemanticEnrichedPPLTotalDissimilarity, + SemanticEnrichedMTETotalDissimilarity, AveDissimilarity ) from .greedy_semantic_average_ue_average_similarity import ( - GreedySemanticAveMaxprobAveSimilarity, - GreedySemanticAvePPLAveSimilarity, - GreedySemanticAveTokenSARAveSimilarity, - GreedySemanticAveMTEAveSimilarity, - GreedySemanticEnrichedPPLAveDissimilarity, - GreedySemanticEnrichedTokenSARAveDissimilarity , GreedySemanticEnrichedMaxprobAveDissimilarity, + GreedySemanticEnrichedPPLAveDissimilarity, GreedySemanticEnrichedMTEAveDissimilarity, + GreedySemanticEnrichedMaxprobTotalDissimilarity, + GreedySemanticEnrichedPPLTotalDissimilarity, + GreedySemanticEnrichedMTETotalDissimilarity, GreedyAveDissimilarity ) from .semantic_median_ue import SemanticMedianMaxprob, SemanticMedianPPL, SemanticMedianTokenSAR, SemanticMedianMTE diff --git a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py index 69611957b..18dbd8ddb 100644 --- a/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/greedy_semantic_average_ue_average_similarity.py @@ 
-7,7 +7,7 @@ from .common import sample_strategy_to_prefix, best_sample_ids -class GreedySemanticAveMaxprobAveSimilarity(Estimator): +class GreedySemanticEnrichedMaxprobAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -19,9 +19,9 @@ def __init__( def __str__(self): if self.exp: - return "GreedySemanticAveMaxprobAveSimilarityexp" + return "GreedySemanticEnrichedMaxprobAveDissimilarityexp" else: - return "GreedySemanticAveMaxprobAveSimilarity" + return "GreedySemanticEnrichedMaxprobAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] @@ -38,41 +38,37 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Compute row-wise average similarity, excluding self-similarity # Diagonal contains self-similarities - avg_similarity = np.mean(greedy_sentence_similarity) - - # Enrich each metric by scaling it by 1/row_average - if avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) - enriched_metric = prob * (1 / avg_similarity) + enriched_metric = prob * avg_dissimilarity enriched_metrics.append(enriched_metric) return np.array(enriched_metrics) -class GreedySemanticEnrichedMaxprobAveDissimilarity(Estimator): +class GreedySemanticEnrichedMaxprobTotalDissimilarity(Estimator): def __init__( self, verbose: bool = False, exp: bool = False, ): - super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + super().__init__(["sample_sentence_similarity", "greedy_log_likelihoods"], "sequence") self.verbose = verbose self.exp = exp def __str__(self): if self.exp: - return "GreedySemanticEnrichedMaxprobAveDissimilarityexp" + return "GreedySemanticEnrichedMaxprobTotalDissimilarityexp" else: - return "GreedySemanticEnrichedMaxprobAveDissimilarity" + return "GreedySemanticEnrichedMaxprobTotalDissimilarity" def __call__(self, stats: Dict[str, 
np.ndarray]) -> np.ndarray: - batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] batch_lls = np.array([np.sum(log_likelihood) for log_likelihood in stats["greedy_log_likelihoods"]]) enriched_metrics = [] # To store enriched metrics for each sample - for greedy_ll, greedy_sentence_similarity in zip( - batch_lls, batch_greedy_sentence_similarity + for greedy_ll, sample_sentence_similarity in zip( + batch_lls, batch_sample_sentence_similarity ): # Compute probabilities (negative log-probs) prob = -greedy_ll @@ -81,7 +77,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: # Compute row-wise average similarity, excluding self-similarity # Diagonal contains self-similarities - avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) + avg_dissimilarity = np.mean(1 - np.array(sample_sentence_similarity)) enriched_metric = prob * avg_dissimilarity enriched_metrics.append(enriched_metric) @@ -89,7 +85,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(enriched_metrics) -class GreedySemanticAvePPLAveSimilarity(Estimator): +class GreedySemanticEnrichedPPLAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -101,9 +97,9 @@ def __init__( def __str__(self): if self.exp: - return "GreedySemanticAvePPLAveSimilarityexp" + return "GreedySemanticEnrichedPPLAveDissimilarityexp" else: - return "GreedySemanticAvePPLAveSimilarity" + return "GreedySemanticEnrichedPPLAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] @@ -119,43 +115,39 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: if self.exp: ppl = -np.exp(-ppl) - # Compute row-wise average similarity, excluding self-similarity - avg_similarity = np.mean(greedy_sentence_similarity) - - # Enrich each PPL independently by scaling with 1/row_average - if 
avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero + # Compute row-wise average similarity, excluding self-similarity + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) - enriched_value = ppl * (1 / avg_similarity) + enriched_value = ppl * avg_dissimilarity enriched_ppl.append(enriched_value) return np.array(enriched_ppl) -class GreedySemanticEnrichedPPLAveDissimilarity(Estimator): +class GreedySemanticEnrichedPPLTotalDissimilarity(Estimator): def __init__( self, verbose: bool = False, exp: bool = False, ): - super().__init__(["greedy_sentence_similarity", "greedy_log_likelihoods"], "sequence") + super().__init__(["sample_sentence_similarity", "greedy_log_likelihoods"], "sequence") self.verbose = verbose self.exp = exp def __str__(self): if self.exp: - return "GreedySemanticEnrichedPPLAveDissimilarityexp" + return "GreedySemanticEnrichedPPLTotalDissimilarityexp" else: - return "GreedySemanticEnrichedPPLAveDissimilarity" + return "GreedySemanticEnrichedPPLTotalDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] - batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] enriched_ppl = [] # To store enriched PPL for each sample - for greedy_log_likelihoods, greedy_sentence_similarity in zip( - batch_greedy_log_likelihoods, batch_greedy_sentence_similarity + for greedy_log_likelihoods, sample_sentence_similarity in zip( + batch_greedy_log_likelihoods, batch_sample_sentence_similarity ): # get PPL for each sample ppl = -np.mean(greedy_log_likelihoods) @@ -163,7 +155,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: ppl = -np.exp(-ppl) # Compute row-wise average similarity, excluding self-similarity - avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) + avg_dissimilarity = np.mean(1 - np.array(sample_sentence_similarity)) enriched_value = 
ppl * avg_dissimilarity enriched_ppl.append(enriched_value) @@ -171,129 +163,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(enriched_ppl) -class GreedySemanticAveTokenSARAveSimilarity(Estimator): - def __init__( - self, - verbose: bool = False, - exp: bool = False, - ): - super().__init__( - [ - "greedy_sentence_similarity", - "greedy_log_likelihoods", - ], - "sequence", - ) - self.verbose = verbose - self.exp = exp - - def __str__(self): - if self.exp: - return "GreedySemanticAveTokenSARAveSimilarityexp" - else: - return "GreedySemanticAveTokenSARAveSimilarity" - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] - batch_greedy_token_similarity = stats["token_similarity"] - batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] - - enriched_tokenSAR = [] - - for batch_data in zip( - batch_greedy_log_likelihoods, - batch_greedy_token_similarity, - batch_greedy_sentence_similarity, - ): - log_likelihoods = batch_data[0] - token_similarity = batch_data[1] - greedy_sentence_similarity = batch_data[2] - - log_likelihoods = np.array(log_likelihoods) - R_t = 1 - token_similarity - if R_t.sum() == 0: - R_t_norm = np.zeros_like(R_t) - else: - R_t_norm = R_t / R_t.sum() - E_t = -log_likelihoods * R_t_norm - tokenSAR = E_t.sum() - - if self.exp: - tokenSAR = -np.exp(-np.array(tokenSAR)) - - # Compute row-wise average similarity, excluding self-similarity - avg_similarity = np.mean(greedy_sentence_similarity) - - # Enrich each PPL independently by scaling with 1/row_average - if avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero - - enriched_value = tokenSAR * (1 / avg_similarity) - enriched_tokenSAR.append(enriched_value) - - return np.array(enriched_tokenSAR) - - -class GreedySemanticEnrichedTokenSARAveDissimilarity(Estimator): - def __init__( - self, - verbose: bool = False, - exp: bool = False, - ): - super().__init__( - 
[ - "greedy_sentence_similarity", - "greedy_log_likelihoods", - ], - "sequence", - ) - self.verbose = verbose - self.exp = exp - - def __str__(self): - if self.exp: - return "GreedySemanticEnrichedTokenSARAveDissimilarityexp" - else: - return "GreedySemanticEnrichedTokenSARAveDissimilarity" - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] - batch_greedy_token_similarity = stats["token_similarity"] - batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] - - enriched_tokenSAR = [] - - for batch_data in zip( - batch_greedy_log_likelihoods, - batch_greedy_token_similarity, - batch_greedy_sentence_similarity, - ): - log_likelihoods = batch_data[0] - token_similarity = batch_data[1] - greedy_sentence_similarity = batch_data[2] - - log_likelihoods = np.array(log_likelihoods) - R_t = 1 - token_similarity - if R_t.sum() == 0: - R_t_norm = np.zeros_like(R_t) - else: - R_t_norm = R_t / R_t.sum() - E_t = -log_likelihoods * R_t_norm - tokenSAR = E_t.sum() - - if self.exp: - tokenSAR = -np.exp(-np.array(tokenSAR)) - - # Compute row-wise average similarity, excluding self-similarity - avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) - - enriched_value = tokenSAR * avg_dissimilarity - enriched_tokenSAR.append(enriched_value) - - return np.array(enriched_tokenSAR) - - -class GreedySemanticAveMTEAveSimilarity(Estimator): +class GreedySemanticEnrichedMTEAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -302,7 +172,7 @@ def __init__( self.verbose = verbose def __str__(self): - return "GreedySemanticAveMTEAveSimilarity" + return "GreedySemanticEnrichedMTEAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_greedy_entropy = stats["entropy"] @@ -314,43 +184,39 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_greedy_entropy, batch_greedy_sentence_similarity ): # Compute row-wise average 
similarity, excluding self-similarity - avg_similarity = np.mean(greedy_sentence_similarity) - - # Enrich each PPL independently by scaling with 1/row_average - if avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero + avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) entropy = np.mean(greedy_entropy) - enriched_value = entropy * (1 / avg_similarity) + enriched_value = entropy * avg_dissimilarity enriched_entropy.append(enriched_value) return np.array(enriched_entropy) -class GreedySemanticEnrichedMTEAveDissimilarity(Estimator): +class GreedySemanticEnrichedMTETotalDissimilarity(Estimator): def __init__( self, verbose: bool = False, ): - super().__init__(["greedy_sentence_similarity", "entropy"], "sequence") + super().__init__(["sample_sentence_similarity", "entropy"], "sequence") self.verbose = verbose def __str__(self): - return "GreedySemanticEnrichedMTEAveDissimilarity" + return "GreedySemanticEnrichedMTETotalDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_greedy_entropy = stats["entropy"] - batch_greedy_sentence_similarity = stats["greedy_sentence_similarity"] + batch_sample_sentence_similarity = stats["sample_sentence_similarity"] + batch_entropy = stats["entropy"] enriched_entropy = [] - for greedy_entropy, greedy_sentence_similarity in zip( - batch_greedy_entropy, batch_greedy_sentence_similarity + for entropy, sample_sentence_similarity in zip( + batch_entropy, batch_sample_sentence_similarity ): # Compute row-wise average similarity, excluding self-similarity - avg_dissimilarity = np.mean(1 - greedy_sentence_similarity) - - entropy = np.mean(greedy_entropy) + avg_dissimilarity = np.mean(1 - np.array(sample_sentence_similarity)) + + entropy = np.mean(entropy) enriched_value = entropy * avg_dissimilarity enriched_entropy.append(enriched_value) diff --git a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py 
b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py index f10f02925..03956cacb 100644 --- a/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py +++ b/src/lm_polygraph/estimators/semantic_average_ue_average_similarity.py @@ -7,7 +7,7 @@ from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS -class SemanticAveMaxprobAveSimilarity(Estimator): +class SemanticEnrichedMaxprobAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -24,9 +24,9 @@ def __init__( def __str__(self): if self.exp: - base = "SemanticAveMaxprobAveSimilarityexp" + base = "SemanticEnrichedMaxprobAveDissimilarityexp" else: - base = "SemanticAveMaxprobAveSimilarity" + base = "SemanticEnrichedMaxprobAveDissimilarity" return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: @@ -39,37 +39,41 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_probs, sample_sentence_similarity in zip( batch_sample_log_probs, batch_sample_sentence_similarity ): - # Compute probabilities (negative log-probs) + # Step 1: Compute probabilities (negative log-probs) sample_probs = -np.array(sample_log_probs) if self.exp: sample_probs = -np.exp(-sample_probs) - # Compute row-wise average similarity, excluding self-similarity - # Diagonal contains self-similarities - row_averages = [] + # Step 2: Compute row-wise sum of dissimilarities (1 - g) + row_dissimilarities = [] for i in range(sample_sentence_similarity.shape[0]): row = sample_sentence_similarity[i] - average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) - row_averages.append(average_similarity) + sum_dissimilarities = np.sum(1 - row) - (1 - row[i]) # Exclude self-similarity + row_dissimilarities.append(sum_dissimilarities) - # Enrich each metric by scaling it by 1/row_average + # Step 3: Normalize by (M - 1) + normalized_dissimilarities = [ + dissim / 
(len(sample_sentence_similarity) - 1) + for dissim in row_dissimilarities + ] + + # Step 4: Enrich each metric enriched_sample_metrics = [] - for i, (prob, avg_similarity) in enumerate(zip(sample_probs, row_averages)): - if avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero - enriched_metric = prob * (1 / avg_similarity) + for prob, dissim in zip(sample_probs, normalized_dissimilarities): + enriched_metric = prob * dissim enriched_sample_metrics.append(enriched_metric) enriched_metrics.append(np.array(enriched_sample_metrics)) - # Return only metric for the best sample for prr calculation + # Return only metric for the best sample for PRR calculation best_elements = [] for best_id, metrics in zip(sample_ids, enriched_metrics): best_elements.append(metrics[best_id]) return np.array(best_elements) -class SemanticEnrichedMaxprobAveDissimilarity(Estimator): + +class SemanticEnrichedMaxprobTotalDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -86,9 +90,9 @@ def __init__( def __str__(self): if self.exp: - base = "SemanticEnrichedMaxprobAveDissimilarityexp" + base = "SemanticEnrichedMaxprobTotalDissimilarityexp" else: - base = "SemanticEnrichedMaxprobAveDissimilarity" + base = "SemanticEnrichedMaxprobTotalDissimilarity" return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: @@ -119,9 +123,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for dissim in row_dissimilarities ] + dissim = np.mean(normalized_dissimilarities) + # Step 4: Enrich each metric enriched_sample_metrics = [] - for prob, dissim in zip(sample_probs, normalized_dissimilarities): + for prob in sample_probs: enriched_metric = prob * dissim enriched_sample_metrics.append(enriched_metric) @@ -135,11 +141,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(best_elements) -class SemanticAvePPLAveSimilarity(Estimator): +class 
SemanticEnrichedPPLAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, - exp: bool = False, + exp: bool = False, sample_strategy: str = "first" ): super().__init__( @@ -152,9 +158,9 @@ def __init__( def __str__(self): if self.exp: - base = "SemanticAvePPLAveSimilarityexp" + base = "SemanticEnrichedPPLAveDissimilarityexp" else: - base = "SemanticAvePPLAveSimilarity" + base = "SemanticEnrichedPPLAveDissimilarity" return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: @@ -167,36 +173,38 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_log_likelihoods, sample_sentence_similarity in zip( batch_sample_log_likelihoods, batch_sample_sentence_similarity ): - # get PPL for each sample + # Step 1: Compute PPL for each sample ppl = -np.array([np.mean(token_ll) for token_ll in sample_log_likelihoods]) if self.exp: ppl = -np.exp(-ppl) - # Compute row-wise average similarity, excluding self-similarity + # Step 2: Compute row-wise average dissimilarity (1 - g) row_averages = [] for i in range(sample_sentence_similarity.shape[0]): row = sample_sentence_similarity[i] - average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) # Exclude g_ii - row_averages.append(average_similarity) + # Compute average dissimilarity, excluding self-similarity + average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) + row_averages.append(average_dissimilarity) - # Enrich each PPL independently by scaling with 1/row_average + # Step 3: Enrich each PPL independently by scaling with the average dissimilarity enriched_sample_ppl = [] - for i, (ppl_value, avg_similarity) in enumerate(zip(ppl, row_averages)): - if avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero - enriched_value = ppl_value * (1 / avg_similarity) + for i, (ppl_value, avg_dissimilarity) in enumerate(zip(ppl, row_averages)): + if avg_dissimilarity == 0: + avg_dissimilarity = 1e-10 
# Avoid division by zero + enriched_value = ppl_value * avg_dissimilarity enriched_sample_ppl.append(enriched_value) enriched_ppl.append(np.array(enriched_sample_ppl)) # Collect enriched PPL values - # Return only metric for the best sample for prr calculation + # Return only metric for the best sample for PRR calculation best_elements = [] for best_id, metrics in zip(sample_ids, enriched_ppl): best_elements.append(metrics[best_id]) return np.array(best_elements) -class SemanticEnrichedPPLAveDissimilarity(Estimator): + +class SemanticEnrichedPPLTotalDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -213,9 +221,9 @@ def __init__( def __str__(self): if self.exp: - base = "SemanticEnrichedPPLAveDissimilarityexp" + base = "SemanticEnrichedPPLTotalDissimilarityexp" else: - base = "SemanticEnrichedPPLAveDissimilarity" + base = "SemanticEnrichedPPLTotalDissimilarity" return sample_strategy_to_prefix(self.sample_strategy) + base def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: @@ -241,9 +249,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) row_averages.append(average_dissimilarity) + avg_dissimilarity = np.mean(row_averages) + # Step 3: Enrich each PPL independently by scaling with the average dissimilarity enriched_sample_ppl = [] - for i, (ppl_value, avg_dissimilarity) in enumerate(zip(ppl, row_averages)): + for ppl_value in ppl: if avg_dissimilarity == 0: avg_dissimilarity = 1e-10 # Avoid division by zero enriched_value = ppl_value * avg_dissimilarity @@ -259,175 +269,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(best_elements) -class SemanticAveTokenSARAveSimilarity(Estimator): - def __init__( - self, - verbose: bool = False, - exp: bool = False, - sample_strategy: str = "first" - ): - super().__init__( - [ - "sample_sentence_similarity", - "sample_log_likelihoods", - "sample_token_similarity", - ] + 
SAMPLE_SELECTION_STAT_KEYS, - "sequence", - ) - self.verbose = verbose - self.exp = exp - self.sample_strategy = sample_strategy - - def __str__(self): - if self.exp: - base = "SemanticAveTokenSARAveSimilarityexp" - else: - base = "SemanticAveTokenSARAveSimilarity" - return sample_strategy_to_prefix(self.sample_strategy) + base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_sample_log_likelihoods = stats["sample_log_likelihoods"] - batch_sample_token_similarity = stats["sample_token_similarity"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - sample_ids = best_sample_ids(self.sample_strategy, stats) - - enriched_tokenSAR = [] - - for batch_data in zip( - batch_sample_log_likelihoods, - batch_sample_token_similarity, - batch_sample_sentence_similarity, - ): - sample_log_likelihoods = batch_data[0] - sample_token_similarity = batch_data[1] - sample_sentence_similarity = batch_data[2] - - tokenSAR = [] - for log_likelihoods, token_similarity in zip( - sample_log_likelihoods, sample_token_similarity - ): - log_likelihoods = np.array(log_likelihoods) - R_t = 1 - token_similarity - if R_t.sum() == 0: - R_t_norm = np.zeros_like(R_t) - else: - R_t_norm = R_t / R_t.sum() - E_t = -log_likelihoods * R_t_norm - tokenSAR.append(E_t.sum()) - - if self.exp: - tokenSAR = -np.exp(-np.array(tokenSAR)) - - # Compute row-wise average similarity excluding self-similarity - row_averages = [] - for i in range(sample_sentence_similarity.shape[0]): - row = sample_sentence_similarity[i] - average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) # Exclude g_ii - row_averages.append(average_similarity) - - # Enrich each tokenSAR value - enriched_sample_tokenSAR = [] - for i, (sar_value, avg_similarity) in enumerate(zip(tokenSAR, row_averages)): - if avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero - enriched_value = sar_value * (1 / avg_similarity) - enriched_sample_tokenSAR.append(enriched_value) - - 
enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR)) - - # Return only metric for the best sample for prr calculation - best_elements = [] - for best_id, metrics in zip(sample_ids, enriched_tokenSAR): - best_elements.append(metrics[best_id]) - - return np.array(best_elements) - - -class SemanticEnrichedTokenSARAveDissimilarity(Estimator): - def __init__( - self, - verbose: bool = False, - exp: bool = False, - sample_strategy: str = "first" - ): - super().__init__( - [ - "sample_sentence_similarity", - "sample_log_likelihoods", - "sample_token_similarity", - ] + SAMPLE_SELECTION_STAT_KEYS, - "sequence", - ) - self.verbose = verbose - self.exp = exp - self.sample_strategy = sample_strategy - - def __str__(self): - if self.exp: - base = "SemanticEnrichedTokenSARAveDissimilarityexp" - else: - base = "SemanticEnrichedTokenSARAveDissimilarity" - return sample_strategy_to_prefix(self.sample_strategy) + base - - def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: - batch_sample_log_likelihoods = stats["sample_log_likelihoods"] - batch_sample_token_similarity = stats["sample_token_similarity"] - batch_sample_sentence_similarity = stats["sample_sentence_similarity"] - sample_ids = best_sample_ids(self.sample_strategy, stats) - - enriched_tokenSAR = [] - - for batch_data in zip( - batch_sample_log_likelihoods, - batch_sample_token_similarity, - batch_sample_sentence_similarity, - ): - sample_log_likelihoods = batch_data[0] - sample_token_similarity = batch_data[1] - sample_sentence_similarity = batch_data[2] - - tokenSAR = [] - for log_likelihoods, token_similarity in zip( - sample_log_likelihoods, sample_token_similarity - ): - log_likelihoods = np.array(log_likelihoods) - R_t = 1 - token_similarity - if R_t.sum() == 0: - R_t_norm = np.zeros_like(R_t) - else: - R_t_norm = R_t / R_t.sum() - E_t = -log_likelihoods * R_t_norm - tokenSAR.append(E_t.sum()) - - if self.exp: - tokenSAR = -np.exp(-np.array(tokenSAR)) - - # Compute row-wise average dissimilarity 
(1 - g), excluding self-similarity - row_averages = [] - for i in range(sample_sentence_similarity.shape[0]): - row = sample_sentence_similarity[i] - average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) - row_averages.append(average_dissimilarity) - - # Enrich each tokenSAR value - enriched_sample_tokenSAR = [] - for i, (sar_value, avg_dissimilarity) in enumerate(zip(tokenSAR, row_averages)): - if avg_dissimilarity == 0: - avg_dissimilarity = 1e-10 # Avoid division by zero - enriched_value = sar_value * avg_dissimilarity - enriched_sample_tokenSAR.append(enriched_value) - - enriched_tokenSAR.append(np.array(enriched_sample_tokenSAR)) - - # Return only metric for the best sample for PRR calculation - best_elements = [] - for best_id, metrics in zip(sample_ids, enriched_tokenSAR): - best_elements.append(metrics[best_id]) - - return np.array(best_elements) - - -class SemanticAveMTEAveSimilarity(Estimator): +class SemanticEnrichedMTEAveDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -441,7 +283,7 @@ def __init__( self.sample_strategy = sample_strategy def __str__(self): - return sample_strategy_to_prefix(self.sample_strategy) + "SemanticAveMTEAveSimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTEAveDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_entropy = stats["sample_entropy"] @@ -453,24 +295,24 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: for sample_entropy, sample_sentence_similarity in zip( batch_sample_entropy, batch_sample_sentence_similarity ): - # Compute row-wise average similarity, excluding self-similarity + # Compute row-wise average dissimilarity (1 - g), excluding self-similarity row_averages = [] for i in range(sample_sentence_similarity.shape[0]): row = sample_sentence_similarity[i] - average_similarity = (np.sum(row) - row[i]) / (len(row) - 1) # Exclude g_ii - row_averages.append(average_similarity) + 
average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) + row_averages.append(average_dissimilarity) # Enrich each sample's entropy value enriched_sample_entropy = [] - for i, (entropy, avg_similarity) in enumerate(zip(sample_entropy, row_averages)): - if avg_similarity == 0: - avg_similarity = 1e-10 # Avoid division by zero - enriched_value = entropy * (1 / avg_similarity) + for i, (entropy, avg_dissimilarity) in enumerate(zip(sample_entropy, row_averages)): + if avg_dissimilarity == 0: + avg_dissimilarity = 1e-10 # Avoid division by zero + enriched_value = entropy * avg_dissimilarity enriched_sample_entropy.append(enriched_value) enriched_entropy.append(np.array(enriched_sample_entropy)) - # Return only metric for the best sample for prr calculation + # Return only metric for the best sample for PRR calculation best_elements = [] for best_id, metrics in zip(sample_ids, enriched_entropy): best_elements.append(metrics[best_id]) @@ -478,8 +320,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: return np.array(best_elements) - -class SemanticEnrichedMTEAveDissimilarity(Estimator): +class SemanticEnrichedMTETotalDissimilarity(Estimator): def __init__( self, verbose: bool = False, @@ -493,7 +334,7 @@ def __init__( self.sample_strategy = sample_strategy def __str__(self): - return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTEAveDissimilarity" + return sample_strategy_to_prefix(self.sample_strategy) + "SemanticEnrichedMTETotalDissimilarity" def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_entropy = stats["sample_entropy"] @@ -512,9 +353,11 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: average_dissimilarity = (np.sum(1 - row) - (1 - row[i])) / (len(row) - 1) row_averages.append(average_dissimilarity) + avg_dissimilarity = np.mean(row_averages) + # Enrich each sample's entropy value enriched_sample_entropy = [] - for i, (entropy, avg_dissimilarity) in 
enumerate(zip(sample_entropy, row_averages)): + for entropy in sample_entropy: if avg_dissimilarity == 0: avg_dissimilarity = 1e-10 # Avoid division by zero enriched_value = entropy * avg_dissimilarity diff --git a/src/lm_polygraph/utils/register_stat_calculators.py b/src/lm_polygraph/utils/register_stat_calculators.py index b453c89fb..cf01905be 100644 --- a/src/lm_polygraph/utils/register_stat_calculators.py +++ b/src/lm_polygraph/utils/register_stat_calculators.py @@ -30,15 +30,15 @@ def register_stat_calculators( log.info("=" * 100) log.info("Loading NLI model...") - #if language == "en": - # nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device) - #elif language in ["zh", "ar", "ru"]: - # nli_model = MultilingualDeberta( - # batch_size=deberta_batch_size, - # device=deberta_device, - # ) - #else: - # raise Exception(f"Unsupported language: {language}") + if language == "en": + nli_model = Deberta(batch_size=deberta_batch_size, device=deberta_device) + elif language in ["zh", "ar", "ru"]: + nli_model = MultilingualDeberta( + batch_size=deberta_batch_size, + device=deberta_device, + ) + else: + raise Exception(f"Unsupported language: {language}") nli_model = None log.info("=" * 100) From 0ab5eaf3546e5451fb649180c5238660e50dad2a Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Tue, 18 Feb 2025 15:55:37 +0400 Subject: [PATCH 84/97] Uncommited changes from cluster --- src/lm_polygraph/stat_calculators/greedy_align_matrix.py | 1 + src/lm_polygraph/stat_calculators/semantic_matrix.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py index a467e92e6..497118726 100644 --- a/src/lm_polygraph/stat_calculators/greedy_align_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_align_matrix.py @@ -42,6 +42,7 @@ def __call__( # Sampling from LLM often produces significant number of identical # outputs. 
We only need to score pairs of unqiue outputs texts = [text if text.strip() != "" else "" for text in texts] + greedy_text = greedy_text if greedy_text.strip() != "" else "" unique_texts, inv = np.unique(texts, return_inverse=True) batch_pairs.append(list(itertools.product([greedy_text], unique_texts))) batch_invs.append(inv) diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index 57499408a..8fe738056 100644 --- a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -83,7 +83,7 @@ def __call__( C = [] P = [] - for i, pairs in tqdm(enumerate(batch_pairs)): + for i, pairs in enumerate(tqdm(batch_pairs)): dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size) probs = [] for first_texts, second_texts in dl: @@ -92,7 +92,7 @@ def __call__( batch, padding=True, return_tensors="pt" ).to(device) logits = deberta.deberta(**encoded).logits.detach().to(device) - probs.append(softmax(logits).cpu().detach()) + probs.append(softmax(logits).detach()) probs = torch.cat(probs, dim=0) entail_probs = probs[:, ent_id] From d03d080087609a3a0f1a139a3f605fe2f65e2bf9 Mon Sep 17 00:00:00 2001 From: silvimica Date: Thu, 27 Mar 2025 17:24:58 +0400 Subject: [PATCH 85/97] Fixed x metric for samples --- scripts/polygraph_eval | 17 ++ .../generation_metrics/__init__.py | 1 + .../generation_metrics/x_metric.py | 145 ++++++++++++++ .../generation_metrics/x_metric_utils.py | 182 ++++++++++++++++++ 4 files changed, 345 insertions(+) create mode 100644 src/lm_polygraph/generation_metrics/x_metric.py create mode 100644 src/lm_polygraph/generation_metrics/x_metric_utils.py diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 8421863a5..8435cb2cd 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -28,6 +28,8 @@ from lm_polygraph.estimators.ensemble_token_measures import * from lm_polygraph.ue_metrics import * from 
lm_polygraph.utils.common import load_external_module from lm_polygraph.utils.generation_parameters import GenerationParameters +from lm_polygraph.generation_metrics.x_metric_utils import MT5ForRegression +from transformers import AutoTokenizer, AutoModelForCausalLM hydra_config = Path(os.environ["HYDRA_CONFIG"]) @@ -513,6 +515,21 @@ def get_generation_metrics(args): Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True), Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"), Comet(comet_scorer, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")] + model_name_or_path="google/metricx-24-hybrid-large-v2p6" + tokenizer_name="google/mt5-large" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model_xmetric = MT5ForRegression.from_pretrained(model_name_or_path) + model_xmetric.to(device) + model_xmetric.eval() + + tokenizer_xmetric = AutoTokenizer.from_pretrained( + tokenizer_name if tokenizer_name else model_name_or_path + ) + + result += [XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex), + XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex, sample=True), + XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex, sample=True, sample_strategy="Best"), + XMetric(model=model_xmetric, tokenizer=tokenizer_xmetric, source_ignore_regex = ignore_regex, sample=True, sample_strategy="BestNormalized")] else: result = [] for metric in generation_metrics: diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py index d9d66c958..f5e702401 100644 --- a/src/lm_polygraph/generation_metrics/__init__.py +++ b/src/lm_polygraph/generation_metrics/__init__.py @@ -10,3 +10,4 @@ from .sbert import SbertMetric from .aggregated_metric import AggregatedMetric from .preprocess_output_target import 
PreprocessOutputTarget +from .x_metric import XMetric diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py new file mode 100644 index 000000000..bc7c7e483 --- /dev/null +++ b/src/lm_polygraph/generation_metrics/x_metric.py @@ -0,0 +1,145 @@ +import re +import numpy as np + +from typing import List, Dict +from .generation_metric import GenerationMetric +from transformers import AutoTokenizer +from .x_metric_utils import MT5ForRegression +import torch +import datasets +from transformers import TrainingArguments, Trainer + +class XMetric(GenerationMetric): + """ + Calculates X-MERTIC (https://aclanthology.org/2023.wmt-1.63/) + between model-generated texts and ground truth texts. + """ + + def __init__(self, model ,tokenizer, + source_ignore_regex=None, translation_ignore_regex=None, sample: bool = False, sample_strategy: str = "First"): + if sample: + super().__init__([ + "first_sample_texts", + "best_sample_texts", + "best_normalized_sample_texts", + "input_texts"], + "sequence") + else: + super().__init__(["greedy_texts", "input_texts"], "sequence") + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = model + + self.tokenizer = tokenizer + self.source_ignore_regex = ( + re.compile(source_ignore_regex) if source_ignore_regex else None + ) + self.translation_ignore_regex = ( + re.compile(translation_ignore_regex) if translation_ignore_regex else None + ) + self.training_args = TrainingArguments( + output_dir=".", + per_device_eval_batch_size=1, + dataloader_pin_memory=False, + ) + + self.trainer = Trainer( + model=self.model, + args=self.training_args, + ) + self.sample = sample + self.sample_strategy=sample_strategy + + + def __str__(self): + if self.sample: + if self.sample_strategy == "First": + return f"Samplexmetric" + else: + return f"{self.sample_strategy}Samplexmetric" + return "xmetric" + + def _filter_source(self, text: str, ignore_regex: re.Pattern) -> str: + 
if ignore_regex is not None: + try: + return ignore_regex.findall(text)[-1] + except IndexError: + raise ValueError( + f"Source text '{text}' does not match the ignore regex '{ignore_regex}'" + ) + return text + + def _filter_translation(self, text: str, ignore_regex: re.Pattern) -> str: + return ignore_regex.sub("", text).strip() if ignore_regex else text.strip() + + + def _prepare_inputs(self, translations: List[str], references: List[str]): + """Prepares the input data for X-MERTIC scoring.""" + inputs = [ + f"candidate: {hyp} reference: {ref}" + for hyp, ref in zip(translations, references) + ] + tokenized = self.tokenizer( + inputs, + max_length=512, + truncation=True, + padding=False + ) + + # Convert to Hugging Face Dataset + dataset = datasets.Dataset.from_dict({ + "input_ids": tokenized["input_ids"], + "attention_mask": tokenized["attention_mask"], + "input":inputs + }).with_format("torch") + + def remove_eos(example): + example["input_ids"] = example["input_ids"][:-1] + example["attention_mask"] = example["attention_mask"][:-1] + return example + + dataset = dataset.map(remove_eos) + return dataset + + def __call__( + self, + stats: Dict[str, np.ndarray], + target_texts: List[str], + ) -> np.ndarray: + """ + Calculates X-MERTIC between stats['greedy_texts'] and target_texts. + + Parameters: + stats (Dict[str, np.ndarray]): input statistics, including: + * model-generated texts in 'greedy_texts' + target_texts (List[str]): ground-truth texts + input_texts (List[str]): input texts before translation + + Returns: + np.ndarray: list of X-MERTIC scores for each sample. 
+ """ + references = [ + src + for src in stats["target_texts"] + ] + if self.sample: + if self.sample_strategy == "First": + gen_texts = stats["first_sample_texts"] + elif self.sample_strategy == "Best": + gen_texts = stats["best_sample_texts"] + elif self.sample_strategy == "BestNormalized": + gen_texts = stats["best_normalized_sample_texts"] + else: + raise ValueError(f"Invalid sample strategy: {self.sample_strategy}") + else: + gen_texts = stats["greedy_texts"] + + translations = [ + self._filter_translation(tr, self.source_ignore_regex) + for tr in gen_texts + ] + + inputs = self._prepare_inputs(translations, references) + scores, _, _ = self.trainer.predict(test_dataset=inputs) + for i, score in enumerate(scores): + scores[i] = (25 - score) / 25 + return scores \ No newline at end of file diff --git a/src/lm_polygraph/generation_metrics/x_metric_utils.py b/src/lm_polygraph/generation_metrics/x_metric_utils.py new file mode 100644 index 000000000..ce1314cd1 --- /dev/null +++ b/src/lm_polygraph/generation_metrics/x_metric_utils.py @@ -0,0 +1,182 @@ +import copy +import dataclasses +from typing import Optional, Tuple, Union +import warnings + +import torch +from torch import nn +import transformers +import transformers.modeling_outputs + +BaseModelOutput = transformers.modeling_outputs.BaseModelOutput +ModelOutput = transformers.modeling_outputs.ModelOutput + +MT5Config = transformers.models.mt5.modeling_mt5.MT5Config +MT5PreTrainedModel = transformers.models.mt5.modeling_mt5.MT5PreTrainedModel +MT5Stack = transformers.models.mt5.modeling_mt5.MT5Stack + +__HEAD_MASK_WARNING_MSG = ( + transformers.models.mt5.modeling_mt5.__HEAD_MASK_WARNING_MSG # pylint: disable=protected-access +) + + +@dataclasses.dataclass +class MT5ForRegressionOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + predictions: torch.FloatTensor = None + + +class MT5ForRegression(MT5PreTrainedModel): + """MT5 model for regression.""" + + def __init__(self, config: MT5Config): + 
super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = MT5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = MT5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], MT5ForRegressionOutput]: + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # FutureWarning: head_mask was separated into two input args - head_mask, + # decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if 
self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + # Convert encoder inputs in embeddings if needed + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] + if len(encoder_outputs) > 1 + else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + + # Create 1 step of dummy input for the decoder. 
+ batch_size = input_ids.size(0) + decoder_input_ids = torch.LongTensor([0]).repeat(batch_size).reshape(-1, 1) + if torch.cuda.is_available(): + decoder_input_ids = decoder_input_ids.to(torch.device("cuda")) + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device + ) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + # 250089 = + predictions = lm_logits[:, 0, 250089] + + # Clip to 0 to 25 + predictions = torch.clamp(predictions, 0, 25) + + loss = None + if labels is not None: + loss_fct = 
nn.MSELoss() + # move labels to correct device to enable PP + labels = labels.to(predictions.device) + loss = loss_fct(predictions.view(-1), labels.view(-1)) + + return MT5ForRegressionOutput( + loss=loss, + predictions=predictions, + ) \ No newline at end of file From fc90a1ea13bb9d7ffe130c39254d9daae604dca8 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 27 Mar 2025 17:47:14 +0400 Subject: [PATCH 86/97] Add semantic density --- src/lm_polygraph/estimators/__init__.py | 2 + .../estimators/semantic_density.py | 137 ++++++++++++++++ src/lm_polygraph/stat_calculators/__init__.py | 4 +- .../greedy_semantic_matrix.py | 155 ++++++++++++++++++ .../stat_calculators/semantic_matrix.py | 130 +++++++++++++++ 5 files changed, 426 insertions(+), 2 deletions(-) create mode 100644 src/lm_polygraph/estimators/semantic_density.py diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py index 254788487..26c3d0739 100644 --- a/src/lm_polygraph/estimators/__init__.py +++ b/src/lm_polygraph/estimators/__init__.py @@ -110,3 +110,5 @@ from .prob_cocoa import ProbCocoaMaxprob, ProbCocoaPPL, GreedyProbCocoaMaxprob, GreedyProbCocoaPPL from .supervised_sum_semantic_entropies import SupSumSemanticMaxprob, SupSumSemanticPPL, SupSumSemanticMTE, GreedySupSumSemanticMaxprob, GreedySupSumSemanticPPL, GreedySupSumSemanticMTE + +from .semantic_density import SemanticDensity, GreedySemanticDensity diff --git a/src/lm_polygraph/estimators/semantic_density.py b/src/lm_polygraph/estimators/semantic_density.py new file mode 100644 index 000000000..1c09250b2 --- /dev/null +++ b/src/lm_polygraph/estimators/semantic_density.py @@ -0,0 +1,137 @@ +import numpy as np + +from typing import Dict + +from .estimator import Estimator +from .common import sample_strategy_to_prefix, best_sample_ids, SAMPLE_SELECTION_STAT_KEYS + + +class SemanticDensity(Estimator): + def __init__(self, verbose: bool = False, sample_strategy: str = "first"): + super().__init__( + [ + 
"sample_log_probs", + "sample_tokens", + "sample_texts", + "concat_semantic_matrix_contra", + "concat_semantic_matrix_neutral", + ] + SAMPLE_SELECTION_STAT_KEYS, + "sequence", + ) + self.verbose = verbose + self.sample_strategy = sample_strategy + + def __str__(self): + base = "SemanticDensity" + return sample_strategy_to_prefix(self.sample_strategy) + base + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_ids = best_sample_ids(self.sample_strategy, stats) + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_tokens = stats["sample_tokens"] + batch_sample_texts = stats["sample_texts"] + batch_semantic_matrix_contra = stats["concat_semantic_matrix_contra"] + batch_semantic_matrix_neutral = stats["concat_semantic_matrix_neutral"] + + semantic_density = [] + for batch_data in zip( + batch_sample_ids, + batch_sample_log_probs, + batch_sample_tokens, + batch_sample_texts, + batch_semantic_matrix_contra, + batch_semantic_matrix_neutral, + ): + sample_id = batch_data[0] + sample_probs = np.exp(batch_data[1]) + sample_tokens = batch_data[2] + sample_texts = batch_data[3] + semantic_matrix_contra = batch_data[4] + semantic_matrix_neutral = batch_data[5] + + _, unique_sample_indices = np.unique(sample_texts, return_index=True) + + numerator, denominator = [], [] + + for _id in unique_sample_indices: + normed_prob = sample_probs[_id] ** (1 / len(sample_tokens[_id])) + distance = semantic_matrix_contra[sample_id, _id] + (semantic_matrix_neutral[sample_id, _id] / 2) + + if distance <= 1: + kernel_value = 1 - distance + else: + kernel_value = 0 + + numerator.append(normed_prob * kernel_value) + denominator.append(normed_prob) + + semantic_density.append(np.sum(numerator) / np.sum(denominator)) + + return np.array(semantic_density) + + +class GreedySemanticDensity(Estimator): + def __init__(self, verbose: bool = False): + super().__init__( + [ + "greedy_log_probs", + "sample_log_probs", + "sample_tokens", + "sample_texts", + 
"concat_greedy_semantic_matrix_contra_forward", + "concat_greedy_semantic_matrix_neutral_forward", + ], + "sequence", + ) + self.verbose = verbose + + def __str__(self): + return "GreedySemanticDensity" + + def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: + batch_sample_log_probs = stats["sample_log_probs"] + batch_sample_tokens = stats["sample_tokens"] + batch_sample_texts = stats["sample_texts"] + batch_semantic_matrix_contra = stats["concat_greedy_semantic_matrix_contra_forward"] + batch_semantic_matrix_neutral = stats["concat_greedy_semantic_matrix_neutral_forward"] + batch_greedy_log_likelihoods = stats["concat_greedy_log_likelihoods"] + + semantic_density = [] + for batch_data in zip( + batch_greedy_log_likelihoods, + batch_sample_log_probs, + batch_sample_tokens, + batch_sample_texts, + batch_semantic_matrix_contra, + batch_semantic_matrix_neutral, + ): + greedy_log_probs = batch_data[0] + sample_probs = np.exp(batch_data[1]) + sample_tokens = batch_data[2] + sample_texts = batch_data[3] + semantic_matrix_contra = batch_data[4] + semantic_matrix_neutral = batch_data[5] + + _, unique_sample_indices = np.unique(sample_texts, return_index=True) + + numerator, denominator = [], [] + + for _id in unique_sample_indices: + normed_prob = sample_probs[_id] ** (1 / len(sample_tokens[_id])) + distance = semantic_matrix_contra[_id] + (semantic_matrix_neutral[_id] / 2) + + if distance <= 1: + kernel_value = 1 - distance + else: + kernel_value = 0 + + numerator.append(normed_prob * kernel_value) + denominator.append(normed_prob) + + greedy_normed_prob = np.exp(np.sum(greedy_log_probs)) ** (1 / len(greedy_log_probs)) + numerator.append(greedy_normed_prob) + denominator.append(greedy_normed_prob) + + semantic_density.append(np.sum(numerator) / np.sum(denominator)) + + return np.array(semantic_density) diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py index 7f7e37aef..f88615976 100644 --- 
a/src/lm_polygraph/stat_calculators/__init__.py +++ b/src/lm_polygraph/stat_calculators/__init__.py @@ -20,12 +20,12 @@ from .model_score import ModelScoreCalculator from .embeddings import EmbeddingsCalculator from .ensemble_token_data import EnsembleTokenLevelDataCalculator -from .semantic_matrix import SemanticMatrixCalculator +from .semantic_matrix import SemanticMatrixCalculator, ConcatSemanticMatrixCalculator from .cross_encoder_similarity import CrossEncoderSimilarityMatrixCalculator from .extract_claims import ClaimsExtractor from .semantic_classes import SemanticClassesCalculator from .greedy_similarity import GreedySimilarityCalculator -from .greedy_semantic_matrix import GreedySemanticMatrixCalculator +from .greedy_semantic_matrix import GreedySemanticMatrixCalculator, ConcatGreedySemanticMatrixCalculator from .rouge_matrix import RougeLSemanticMatrixCalculator from .greedy_rouge_matrix import GreedyRougeLSemanticMatrixCalculator from .align_matrix import AlignMatrixCalculator diff --git a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py index a5e5cc9df..07c14e805 100644 --- a/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/greedy_semantic_matrix.py @@ -53,6 +53,8 @@ def __call__( device = deberta.device ent_id = deberta.deberta.config.label2id["ENTAILMENT"] + contra_id = deberta.deberta.config.label2id["CONTRADICTION"] + neutral_id = deberta.deberta.config.label2id["NEUTRAL"] softmax = nn.Softmax(dim=1) tokenizer = deberta.deberta_tokenizer @@ -60,6 +62,12 @@ def __call__( E_f = [] E_b = [] E = [] + N_f = [] + N_b = [] + N = [] + C_f = [] + C_b = [] + C = [] for i, pairs in enumerate(batch_pairs): dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size) @@ -88,17 +96,164 @@ def __call__( entail_probs_f = probs_f[:, ent_id] entail_probs_b = probs_b[:, ent_id] + contra_probs_f = probs_f[:, contra_id] + contra_probs_b = 
probs_b[:, contra_id] + neutral_probs_f = probs_f[:, neutral_id] + neutral_probs_b = probs_b[:, neutral_id] E_f.append(entail_probs_f[inv].numpy()) E_b.append(entail_probs_b[inv].numpy()) E.append((entail_probs_f[inv].numpy() + entail_probs_b[inv].numpy()) / 2) + N_f.append(neutral_probs_f[inv].numpy()) + N_b.append(neutral_probs_b[inv].numpy()) + N.append((neutral_probs_f[inv].numpy() + neutral_probs_b[inv].numpy()) / 2) + C_f.append(contra_probs_f[inv].numpy()) + C_b.append(contra_probs_b[inv].numpy()) + C.append((contra_probs_f[inv].numpy() + contra_probs_b[inv].numpy()) / 2) E_f = np.stack(E_f) E_b = np.stack(E_b) E = np.stack(E) + N_f = np.stack(N_f) + N_b = np.stack(N_b) + N = np.stack(N) + C_f = np.stack(C_f) + C_b = np.stack(C_b) + C = np.stack(C) return { "greedy_semantic_matrix_forward": E_f, "greedy_semantic_matrix_backward": E_b, "greedy_semantic_matrix": E, + "greedy_semantic_matrix_neutral_forward": N_f, + "greedy_semantic_matrix_neutral_backward": N_b, + "greedy_semantic_matrix_neutral": N, + "greedy_semantic_matrix_contra_forward": C_f, + "greedy_semantic_matrix_contra_backward": C_b, + "greedy_semantic_matrix_contra": C, + } + + +class ConcatGreedySemanticMatrixCalculator(StatCalculator): + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. 
+ """ + + def __init__(self, nli_model): + super().__init__( + [ + "concat_greedy_semantic_matrix_forward", + "concat_greedy_semantic_matrix_backward", + "concat_greedy_semantic_matrix", + ], + ["greedy_texts", "no_fewshot_input_texts", "sample_texts"], + ) + self.is_deberta_setup = False + self.nli_model = nli_model + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + deberta = self.nli_model + deberta_batch_size = deberta.batch_size + + batch_texts = dependencies["sample_texts"] + batch_greedy_texts = dependencies["greedy_texts"] + input_texts = dependencies["no_fewshot_input_texts"] + + + batch_pairs = [] + batch_invs = [] + for texts, greedy_text, input_text in zip(batch_texts, batch_greedy_texts, input_texts): + texts = [input_text + text for text in texts] + # Sampling from LLM often produces significant number of identical + # outputs. We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product([input_text + greedy_text], unique_texts))) + batch_invs.append(inv) + + device = deberta.device + ent_id = deberta.deberta.config.label2id["ENTAILMENT"] + contra_id = deberta.deberta.config.label2id["CONTRADICTION"] + neutral_id = deberta.deberta.config.label2id["NEUTRAL"] + + softmax = nn.Softmax(dim=1) + tokenizer = deberta.deberta_tokenizer + + E_f = [] + E_b = [] + E = [] + N_f = [] + N_b = [] + N = [] + C_f = [] + C_b = [] + C = [] + + for i, pairs in enumerate(batch_pairs): + dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size) + probs_f = [] + probs_b = [] + + for first_texts, second_texts in tqdm(dl): + batch = list(zip(first_texts, second_texts)) + encoded = tokenizer.batch_encode_plus( + batch, padding=True, return_tensors="pt" + ).to(device) + logits = deberta.deberta(**encoded).logits.detach().to(device) + 
probs_f.append(softmax(logits).cpu().detach()) + + batch = list(zip(second_texts, first_texts)) + encoded = tokenizer.batch_encode_plus( + batch, padding=True, return_tensors="pt" + ).to(device) + logits = deberta.deberta(**encoded).logits.detach().to(device) + probs_b.append(softmax(logits).cpu().detach()) + + probs_f = torch.cat(probs_f, dim=0) + probs_b = torch.cat(probs_b, dim=0) + + inv = batch_invs[i] + + entail_probs_f = probs_f[:, ent_id] + entail_probs_b = probs_b[:, ent_id] + contra_probs_f = probs_f[:, contra_id] + contra_probs_b = probs_b[:, contra_id] + neutral_probs_f = probs_f[:, neutral_id] + neutral_probs_b = probs_b[:, neutral_id] + + E_f.append(entail_probs_f[inv].numpy()) + E_b.append(entail_probs_b[inv].numpy()) + E.append((entail_probs_f[inv].numpy() + entail_probs_b[inv].numpy()) / 2) + N_f.append(neutral_probs_f[inv].numpy()) + N_b.append(neutral_probs_b[inv].numpy()) + N.append((neutral_probs_f[inv].numpy() + neutral_probs_b[inv].numpy()) / 2) + C_f.append(contra_probs_f[inv].numpy()) + C_b.append(contra_probs_b[inv].numpy()) + C.append((contra_probs_f[inv].numpy() + contra_probs_b[inv].numpy()) / 2) + + E_f = np.stack(E_f) + E_b = np.stack(E_b) + E = np.stack(E) + N_f = np.stack(N_f) + N_b = np.stack(N_b) + N = np.stack(N) + C_f = np.stack(C_f) + C_b = np.stack(C_b) + C = np.stack(C) + + return { + "concat_greedy_semantic_matrix_forward": E_f, + "concat_greedy_semantic_matrix_backward": E_b, + "concat_greedy_semantic_matrix": E, + "concat_greedy_semantic_matrix_neutral_forward": N_f, + "concat_greedy_semantic_matrix_neutral_backward": N_b, + "concat_greedy_semantic_matrix_neutral": N, + "concat_greedy_semantic_matrix_contra_forward": C_f, + "concat_greedy_semantic_matrix_contra_backward": C_b, + "concat_greedy_semantic_matrix_contra": C, } diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index 8fe738056..cb9dd4f9d 100644 --- 
a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -75,12 +75,14 @@ def __call__( device = deberta.device ent_id = deberta.deberta.config.label2id["ENTAILMENT"] contra_id = deberta.deberta.config.label2id["CONTRADICTION"] + neutral_id = deberta.deberta.config.label2id["NEUTRAL"] softmax = nn.Softmax(dim=1) tokenizer = deberta.deberta_tokenizer E = [] C = [] + N = [] P = [] for i, pairs in enumerate(tqdm(batch_pairs)): @@ -97,12 +99,14 @@ def __call__( entail_probs = probs[:, ent_id] contra_probs = probs[:, contra_id] + neutral_probs = probs[:, neutral_id] class_preds = probs.argmax(-1) unique_mat_shape = (batch_counts[i], batch_counts[i]) unique_E = entail_probs.view(unique_mat_shape) unique_C = contra_probs.view(unique_mat_shape) + unique_N = neutral_probs.view(unique_mat_shape) unique_P = class_preds.view(unique_mat_shape) inv = batch_invs[i] @@ -111,15 +115,141 @@ def __call__( # using inverse index E.append(unique_E.cpu().numpy()[inv, :][:, inv]) C.append(unique_C.cpu().numpy()[inv, :][:, inv]) + N.append(unique_N.cpu().numpy()[inv, :][:, inv]) P.append(unique_P.cpu().numpy()[inv, :][:, inv]) E = np.stack(E) C = np.stack(C) + N = np.stack(N) P = np.stack(P) return { "semantic_matrix_entail": E, "semantic_matrix_contra": C, + "semantic_matrix_neutral": N, "semantic_matrix_classes": P, "entailment_id": deberta.deberta.config.label2id["ENTAILMENT"], } + + +class ConcatSemanticMatrixCalculator(StatCalculator): + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. 
+ """ + + def __init__(self, nli_model): + super().__init__( + [ + "concat_semantic_matrix_entail", + "concat_semantic_matrix_contra", + "concat_semantic_matrix_classes", + "entailment_id", + ], + ["no_fewshot_input_texts", "sample_texts"], + ) + self.is_deberta_setup = False + self.nli_model = nli_model + + def __call__( + self, + dependencies: Dict[str, np.array], + texts: List[str], + model: WhiteboxModel, + max_new_tokens: int = 100, + ) -> Dict[str, np.ndarray]: + """ + Calculates the NLI semantic matrix for generation samples using DeBERTa model. + + Parameters: + dependencies (Dict[str, np.ndarray]): input statistics, containing: + - 'sample_texts' (List[List[str]]): several sampling generations + for each input text in the batch. + texts (List[str]): Input texts batch used for model generation. + model (Model): Model used for generation. + max_new_tokens (int): Maximum number of new tokens at model generation. Default: 100. + Returns: + Dict[str, np.ndarray]: dictionary with the following items: + - 'semantic_matrix_entail' (List[np.array]): for each input text: quadratic matrix of size + n_samples x n_samples, with probabilities of 'ENTAILMENT' output of DeBERTa. + - 'semantic_matrix_contra' (List[np.array]): for each input text: quadratic matrix of size + n_samples x n_samples, with probabilities of 'CONTRADICTION' output of DeBERTa. + - 'semantic_matrix_classes' (List[np.array]): for each input text: quadratic matrix of size + n_samples x n_samples, with the NLI label id corresponding to the DeBERTa prediction. + """ + + deberta = self.nli_model + deberta_batch_size = deberta.batch_size + batch_texts = dependencies["sample_texts"] + input_texts = dependencies["no_fewshot_input_texts"] + + batch_pairs = [] + batch_invs = [] + batch_counts = [] + for input_text, texts in zip(input_texts, batch_texts): + texts = [input_text + text for text in texts] + breakpoint() + # Sampling from LLM often produces significant number of identical + # outputs. 
We only need to score pairs of unqiue outputs + unique_texts, inv = np.unique(texts, return_inverse=True) + batch_pairs.append(list(itertools.product(unique_texts, unique_texts))) + batch_invs.append(inv) + batch_counts.append(len(unique_texts)) + + device = deberta.device + ent_id = deberta.deberta.config.label2id["ENTAILMENT"] + contra_id = deberta.deberta.config.label2id["CONTRADICTION"] + neutral_id = deberta.deberta.config.label2id["NEUTRAL"] + + softmax = nn.Softmax(dim=1) + tokenizer = deberta.deberta_tokenizer + + E = [] + C = [] + N = [] + P = [] + + for i, pairs in enumerate(tqdm(batch_pairs)): + dl = torch.utils.data.DataLoader(pairs, batch_size=deberta_batch_size) + probs = [] + for first_texts, second_texts in dl: + batch = list(zip(first_texts, second_texts)) + encoded = tokenizer.batch_encode_plus( + batch, padding=True, return_tensors="pt" + ).to(device) + logits = deberta.deberta(**encoded).logits.detach().to(device) + probs.append(softmax(logits).detach()) + probs = torch.cat(probs, dim=0) + + entail_probs = probs[:, ent_id] + contra_probs = probs[:, contra_id] + neutral_probs = probs[:, neutral_id] + class_preds = probs.argmax(-1) + + unique_mat_shape = (batch_counts[i], batch_counts[i]) + + unique_E = entail_probs.view(unique_mat_shape) + unique_C = contra_probs.view(unique_mat_shape) + unique_N = neutral_probs.view(unique_mat_shape) + unique_P = class_preds.view(unique_mat_shape) + + inv = batch_invs[i] + + # Recover full matrices from unques by gathering along both axes + # using inverse index + E.append(unique_E.cpu().numpy()[inv, :][:, inv]) + C.append(unique_C.cpu().numpy()[inv, :][:, inv]) + N.append(unique_N.cpu().numpy()[inv, :][:, inv]) + P.append(unique_P.cpu().numpy()[inv, :][:, inv]) + + E = np.stack(E) + C = np.stack(C) + N = np.stack(N) + P = np.stack(P) + + return { + "concat_semantic_matrix_entail": E, + "concat_semantic_matrix_contra": C, + "concat_semantic_matrix_neutral": N, + "concat_semantic_matrix_classes": P, + 
"entailment_id": deberta.deberta.config.label2id["ENTAILMENT"], + } From 3b1bdcb6305df8ca9f98fdcea07d217e0c6965e8 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 27 Mar 2025 17:51:21 +0400 Subject: [PATCH 87/97] Remove breakpoint --- src/lm_polygraph/stat_calculators/semantic_matrix.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lm_polygraph/stat_calculators/semantic_matrix.py b/src/lm_polygraph/stat_calculators/semantic_matrix.py index cb9dd4f9d..036046ce3 100644 --- a/src/lm_polygraph/stat_calculators/semantic_matrix.py +++ b/src/lm_polygraph/stat_calculators/semantic_matrix.py @@ -187,7 +187,6 @@ def __call__( batch_counts = [] for input_text, texts in zip(input_texts, batch_texts): texts = [input_text + text for text in texts] - breakpoint() # Sampling from LLM often produces significant number of identical # outputs. We only need to score pairs of unqiue outputs unique_texts, inv = np.unique(texts, return_inverse=True) From 25103254a6a20dd0e921f5fbc15c689a4d374f78 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Thu, 27 Mar 2025 17:56:06 +0400 Subject: [PATCH 88/97] Fix some typos --- src/lm_polygraph/generation_metrics/x_metric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py index bc7c7e483..a5bd84c92 100644 --- a/src/lm_polygraph/generation_metrics/x_metric.py +++ b/src/lm_polygraph/generation_metrics/x_metric.py @@ -11,7 +11,7 @@ class XMetric(GenerationMetric): """ - Calculates X-MERTIC (https://aclanthology.org/2023.wmt-1.63/) + Calculates X-METRIC (https://aclanthology.org/2023.wmt-1.63/) between model-generated texts and ground truth texts. 
""" @@ -73,7 +73,7 @@ def _filter_translation(self, text: str, ignore_regex: re.Pattern) -> str: def _prepare_inputs(self, translations: List[str], references: List[str]): - """Prepares the input data for X-MERTIC scoring.""" + """Prepares the input data for X-METRIC scoring.""" inputs = [ f"candidate: {hyp} reference: {ref}" for hyp, ref in zip(translations, references) @@ -106,7 +106,7 @@ def __call__( target_texts: List[str], ) -> np.ndarray: """ - Calculates X-MERTIC between stats['greedy_texts'] and target_texts. + Calculates X-METRIC between stats['greedy_texts'] and target_texts. Parameters: stats (Dict[str, np.ndarray]): input statistics, including: @@ -142,4 +142,4 @@ def __call__( scores, _, _ = self.trainer.predict(test_dataset=inputs) for i, score in enumerate(scores): scores[i] = (25 - score) / 25 - return scores \ No newline at end of file + return scores From a7bc19c50a6368a3de921e60f7a1f5af8cba4868 Mon Sep 17 00:00:00 2001 From: silvimica Date: Sat, 29 Mar 2025 08:34:35 +0400 Subject: [PATCH 89/97] Gpt as a judge + Fixes to X Metric 24 --- .../generation_metrics/__init__.py | 1 + .../generation_metrics/gpt_judge_accuracy.py | 90 +++++++++++++++++++ .../generation_metrics/x_metric.py | 31 +++++-- 3 files changed, 116 insertions(+), 6 deletions(-) create mode 100644 src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py diff --git a/src/lm_polygraph/generation_metrics/__init__.py b/src/lm_polygraph/generation_metrics/__init__.py index f5e702401..83f7b58f6 100644 --- a/src/lm_polygraph/generation_metrics/__init__.py +++ b/src/lm_polygraph/generation_metrics/__init__.py @@ -11,3 +11,4 @@ from .aggregated_metric import AggregatedMetric from .preprocess_output_target import PreprocessOutputTarget from .x_metric import XMetric +from .gpt_judge_accuracy import GptAccuracyMetric \ No newline at end of file diff --git a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py new file mode 100644 
index 000000000..d0cb36ed8 --- /dev/null +++ b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py @@ -0,0 +1,90 @@ +import openai +from .generation_metric import GenerationMetric +import numpy as np +import logging +from typing import Dict, List +import re +log = logging.getLogger("lm_polygraph") +import os + +class GptAccuracyMetric(GenerationMetric): + """ + Uses GPT to compare generated text with target and return 1 if semantically equivalent, else 0. + """ + + def __init__(self, model="gpt-4o-mini", sample=False, sample_strategy="First", api_key=None): + if sample: + super().__init__([ + "first_sample_texts", + "best_sample_texts", + "best_normalized_sample_texts", + "input_texts"], + "sequence") + else: + super().__init__(["greedy_texts", "input_texts"], "sequence") + self.sample = sample + self.sample_strategy = sample_strategy + self.model = model + self.api_key = api_key or os.getenv("OPENAI_API_KEY") + openai.api_key = self.api_key + + def __str__(self): + if self.sample == True: + return f"GptAccuracy_{self.model}_{self.sample_strategy}" + return f"GptAccuracy_{self.model}" + + def _filter_input(self, input): + matches = re.findall(r"Question:\s*(.*?)\nAnswer:", input, re.DOTALL) + if matches: + return matches[-1].strip() + return input + def _gpt_compare(self, output: str, target: str, question: str) -> int: + prompt = ( + f"You are a text evaluator. The model was asked the following question: {question.strip()}.\n" + "The 'Generated' text is a model's response. 
The 'Target' is the correct answer.\n" + "If the generated answer correctly answers the question based on the target, return 1.\n" + "If it is wrong, return 0.\n" + "Respond ONLY with a single digit: 1 or 0.\n\n" + f"Generated: {output.strip()}\n" + f"Target: {target.strip()}" + ) + + try: + response = openai.ChatCompletion.create( + model=self.model, + messages=[ + {"role": "system", "content": "You are a strict evaluator of text similarity."}, + {"role": "user", "content": prompt} + ], + temperature=0, + max_tokens=1, + n=1 + ) + + raw_reply = response['choices'][0]['message']['content'].strip() + return int(raw_reply) if raw_reply in ['0', '1'] else 0 + + except Exception as e: + log.error(f"GPT comparison failed: {e}") + return 0 # Safe default + + def __call__(self, stats: Dict[str, np.ndarray], target_texts: List[str]) -> np.ndarray: + if self.sample: + if self.sample_strategy == "First": + gen_texts = stats["first_sample_texts"] + elif self.sample_strategy == "Best": + gen_texts = stats["best_sample_texts"] + elif self.sample_strategy == "BestNormalized": + gen_texts = stats["best_normalized_sample_texts"] + else: + raise ValueError(f"Invalid sample strategy: {self.sample_strategy}") + else: + gen_texts = stats["greedy_texts"] + + results = [] + input_texts = [self._filter_input(text) for text in stats["input_texts"]] + for output, target, input in zip(gen_texts, target_texts, input_texts): + score = self._gpt_compare(output, target,input) + results.append(score) + + return np.array(results) diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py index bc7c7e483..4298fd126 100644 --- a/src/lm_polygraph/generation_metrics/x_metric.py +++ b/src/lm_polygraph/generation_metrics/x_metric.py @@ -7,7 +7,7 @@ from .x_metric_utils import MT5ForRegression import torch import datasets -from transformers import TrainingArguments, Trainer +from transformers import TrainingArguments, DataCollatorWithPadding, Trainer 
class XMetric(GenerationMetric): """ @@ -36,15 +36,19 @@ def __init__(self, model ,tokenizer, self.translation_ignore_regex = ( re.compile(translation_ignore_regex) if translation_ignore_regex else None ) + self.training_args = TrainingArguments( output_dir=".", per_device_eval_batch_size=1, dataloader_pin_memory=False, ) + data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) + self.trainer = Trainer( model=self.model, args=self.training_args, + data_collator=data_collator ) self.sample = sample self.sample_strategy=sample_strategy @@ -71,12 +75,22 @@ def _filter_source(self, text: str, ignore_regex: re.Pattern) -> str: def _filter_translation(self, text: str, ignore_regex: re.Pattern) -> str: return ignore_regex.sub("", text).strip() if ignore_regex else text.strip() - - def _prepare_inputs(self, translations: List[str], references: List[str]): + def _filter_text(self, text: str, ignore_regex: re.Pattern) -> str: + if ignore_regex is not None: + processed_text = ignore_regex.search(text) + if processed_text: + return processed_text.group(1) + else: + raise ValueError( + f"Source text {text} does not match the ignore regex {ignore_regex}" + ) + return text + + def _prepare_inputs(self, translations: List[str], references: List[str], sources: List[str],): """Prepares the input data for X-MERTIC scoring.""" inputs = [ - f"candidate: {hyp} reference: {ref}" - for hyp, ref in zip(translations, references) + f"source: {source} candidate: {hyp} reference: {ref}" + for hyp, ref, source in zip(translations, references, sources) ] tokenized = self.tokenizer( inputs, @@ -138,7 +152,12 @@ def __call__( for tr in gen_texts ] - inputs = self._prepare_inputs(translations, references) + sources = [ + self._filter_text(src, self.source_ignore_regex) + for src in stats["input_texts"] + ] + + inputs = self._prepare_inputs(translations, references, sources) scores, _, _ = self.trainer.predict(test_dataset=inputs) for i, score in enumerate(scores): scores[i] = (25 - 
score) / 25 From 75ad725e7ca2d607c43a2d125a7ee88066aca21a Mon Sep 17 00:00:00 2001 From: silvimica Date: Sat, 29 Mar 2025 08:36:58 +0400 Subject: [PATCH 90/97] Polygraph eval code + remove redundant funcion form x metric --- scripts/polygraph_eval | 5 ++++- src/lm_polygraph/generation_metrics/x_metric.py | 10 ---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 8435cb2cd..bcd4e976a 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -453,7 +453,7 @@ def get_generation_metrics(args): ckpt_path=ckpt_path, evaluation_mode="nli_sp", ) - + api_key =getattr(args, "openai_api_key", '') result = [ RougeMetric("rougeL"), BLEUMetric(), @@ -503,6 +503,9 @@ def get_generation_metrics(args): AlignScore(align_scorer, sample=True, sample_strategy="BestNormalized"), AlignScore(align_scorer, target_is_claims=False, sample=True, sample_strategy="BestNormalized"), AlignScore(align_scorer, ignore_target=True, sample=True, sample_strategy="BestNormalized"), + GptAccuracyMetric( api_key=api_key), + GptAccuracyMetric( api_key=api_key,sample=True, sample_strategy="Best"), + GptAccuracyMetric( api_key=api_key, sample=True, sample_strategy="First"), ] if getattr(args.model, "type", "Whitebox") != "Blackbox": diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py index 4298fd126..4835e2428 100644 --- a/src/lm_polygraph/generation_metrics/x_metric.py +++ b/src/lm_polygraph/generation_metrics/x_metric.py @@ -62,16 +62,6 @@ def __str__(self): return f"{self.sample_strategy}Samplexmetric" return "xmetric" - def _filter_source(self, text: str, ignore_regex: re.Pattern) -> str: - if ignore_regex is not None: - try: - return ignore_regex.findall(text)[-1] - except IndexError: - raise ValueError( - f"Source text '{text}' does not match the ignore regex '{ignore_regex}'" - ) - return text - def _filter_translation(self, text: str, ignore_regex: re.Pattern) 
-> str: return ignore_regex.sub("", text).strip() if ignore_regex else text.strip() From aa577fcd9e6c11b787248efd87fb8a1f2cf24579 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sat, 29 Mar 2025 19:46:41 +0400 Subject: [PATCH 91/97] Add multiref support without aggregation, some other tweaks --- scripts/polygraph_eval | 2 +- .../generation_metrics/gpt_judge_accuracy.py | 46 +++++++++++++------ 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index bcd4e976a..7b6bde8f4 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -572,7 +572,7 @@ def get_generation_metrics(args): if getattr(args, "multiref", False): # Wrap each metric in AggregatedMetric - result = [AggregatedMetric(base_metric=metric) for metric in result] + result = [AggregatedMetric(base_metric=metric) if type(metric) != GptAccuracyMetric else metric for metric in result] log.info("Done with initializing generation metrics.") diff --git a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py index d0cb36ed8..7e3a170df 100644 --- a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py +++ b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py @@ -6,6 +6,7 @@ import re log = logging.getLogger("lm_polygraph") import os +from tqdm import tqdm class GptAccuracyMetric(GenerationMetric): """ @@ -15,13 +16,15 @@ class GptAccuracyMetric(GenerationMetric): def __init__(self, model="gpt-4o-mini", sample=False, sample_strategy="First", api_key=None): if sample: super().__init__([ + "no_fewshot_input_texts", "first_sample_texts", "best_sample_texts", "best_normalized_sample_texts", "input_texts"], "sequence") else: - super().__init__(["greedy_texts", "input_texts"], "sequence") + super().__init__(["no_fewshot_input_texts", "greedy_texts", "input_texts"], "sequence") + self.sample = sample self.sample_strategy = sample_strategy self.model = model @@ -32,28 +35,41 @@ 
def __str__(self): if self.sample == True: return f"GptAccuracy_{self.model}_{self.sample_strategy}" return f"GptAccuracy_{self.model}" - + def _filter_input(self, input): matches = re.findall(r"Question:\s*(.*?)\nAnswer:", input, re.DOTALL) if matches: return matches[-1].strip() return input + def _gpt_compare(self, output: str, target: str, question: str) -> int: - prompt = ( - f"You are a text evaluator. The model was asked the following question: {question.strip()}.\n" - "The 'Generated' text is a model's response. The 'Target' is the correct answer.\n" - "If the generated answer correctly answers the question based on the target, return 1.\n" - "If it is wrong, return 0.\n" - "Respond ONLY with a single digit: 1 or 0.\n\n" - f"Generated: {output.strip()}\n" - f"Target: {target.strip()}" - ) + if type(target) == list: + str_target = ", ".join(target) + prompt = ( + f"You are a text evaluator. The model was asked the following question:\n{question}\n" + "The 'Generated' text is a model's response. The 'Target' is the list of possible correct answers.\n" + "If the generated answer correctly answers the question (matches one of the target responses), return 1.\n" + "If it is wrong, return 0.\n" + "Respond ONLY with a single digit: 1 or 0.\n\n" + f"Generated: {output.strip()}\n" + f"Target list: {str_target.strip()}" + ) + else: + prompt = ( + f"You are a text evaluator. The model was asked the following question:\n{question}\n" + "The 'Generated' text is a model's response. 
The 'Target' is the correct answer.\n" + "If the generated answer correctly answers the question based on the target, return 1.\n" + "If it is wrong, return 0.\n" + "Respond ONLY with a single digit: 1 or 0.\n\n" + f"Generated: {output.strip()}\n" + f"Target: {target.strip()}" + ) try: response = openai.ChatCompletion.create( model=self.model, messages=[ - {"role": "system", "content": "You are a strict evaluator of text similarity."}, + {"role": "system", "content": "You are a strict evaluator of correctness of the model's response."}, {"role": "user", "content": prompt} ], temperature=0, @@ -62,6 +78,7 @@ def _gpt_compare(self, output: str, target: str, question: str) -> int: ) raw_reply = response['choices'][0]['message']['content'].strip() + return int(raw_reply) if raw_reply in ['0', '1'] else 0 except Exception as e: @@ -82,8 +99,9 @@ def __call__(self, stats: Dict[str, np.ndarray], target_texts: List[str]) -> np. gen_texts = stats["greedy_texts"] results = [] - input_texts = [self._filter_input(text) for text in stats["input_texts"]] - for output, target, input in zip(gen_texts, target_texts, input_texts): + input_texts = stats["no_fewshot_input_texts"] + + for output, target, input in tqdm(zip(gen_texts, target_texts, input_texts)): score = self._gpt_compare(output, target,input) results.append(score) From 66fc7371b065d8b22f169ff312bb7c0908d88084 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sat, 29 Mar 2025 21:03:32 +0400 Subject: [PATCH 92/97] Show metricx progress --- src/lm_polygraph/generation_metrics/x_metric.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py index 37b69de35..3e1444354 100644 --- a/src/lm_polygraph/generation_metrics/x_metric.py +++ b/src/lm_polygraph/generation_metrics/x_metric.py @@ -48,7 +48,8 @@ def __init__(self, model ,tokenizer, self.trainer = Trainer( model=self.model, args=self.training_args, - 
data_collator=data_collator + data_collator=data_collator, + disable_tqdm=False ) self.sample = sample self.sample_strategy=sample_strategy From dae4f26e8b90f30ddb107d018d55895f6f7daa01 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sat, 29 Mar 2025 21:05:04 +0400 Subject: [PATCH 93/97] Fix tqdm --- src/lm_polygraph/generation_metrics/x_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_polygraph/generation_metrics/x_metric.py b/src/lm_polygraph/generation_metrics/x_metric.py index 3e1444354..e8f7fe6af 100644 --- a/src/lm_polygraph/generation_metrics/x_metric.py +++ b/src/lm_polygraph/generation_metrics/x_metric.py @@ -40,6 +40,7 @@ def __init__(self, model ,tokenizer, self.training_args = TrainingArguments( output_dir=".", per_device_eval_batch_size=1, + disable_tqdm=False, dataloader_pin_memory=False, ) @@ -49,7 +50,6 @@ def __init__(self, model ,tokenizer, model=self.model, args=self.training_args, data_collator=data_collator, - disable_tqdm=False ) self.sample = sample self.sample_strategy=sample_strategy From dd6bac43d24075aebab440303b9edfadc63a0dc3 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sun, 30 Mar 2025 12:58:13 +0400 Subject: [PATCH 94/97] Fix loading manager with torch 2.6+ --- src/lm_polygraph/utils/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 987177e34..99449b4bb 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -721,7 +721,7 @@ def load(load_path: str, **kwargs) -> "UEManager": Parameters: load_path (str): Path to file with saved benchmark results to load. 
""" - res_dict = torch.load(load_path) + res_dict = torch.load(load_path, weights_only=False) default_kwargs = { "data": None, "model": None, From af8ce4c805c18a348833db6e2c52361301877f14 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sun, 30 Mar 2025 14:58:34 +0400 Subject: [PATCH 95/97] Fix greedy semantic dens --- src/lm_polygraph/estimators/semantic_density.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_polygraph/estimators/semantic_density.py b/src/lm_polygraph/estimators/semantic_density.py index 1c09250b2..8e5c76427 100644 --- a/src/lm_polygraph/estimators/semantic_density.py +++ b/src/lm_polygraph/estimators/semantic_density.py @@ -94,7 +94,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: batch_sample_texts = stats["sample_texts"] batch_semantic_matrix_contra = stats["concat_greedy_semantic_matrix_contra_forward"] batch_semantic_matrix_neutral = stats["concat_greedy_semantic_matrix_neutral_forward"] - batch_greedy_log_likelihoods = stats["concat_greedy_log_likelihoods"] + batch_greedy_log_likelihoods = stats["greedy_log_likelihoods"] semantic_density = [] for batch_data in zip( From ff96ac9da2d1092b76bc4e61572df8e9846db3c7 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sun, 30 Mar 2025 21:18:18 +0400 Subject: [PATCH 96/97] Turn semantic density around --- src/lm_polygraph/estimators/semantic_density.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lm_polygraph/estimators/semantic_density.py b/src/lm_polygraph/estimators/semantic_density.py index 8e5c76427..693215a81 100644 --- a/src/lm_polygraph/estimators/semantic_density.py +++ b/src/lm_polygraph/estimators/semantic_density.py @@ -67,7 +67,7 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray: semantic_density.append(np.sum(numerator) / np.sum(denominator)) - return np.array(semantic_density) + return -np.array(semantic_density) class GreedySemanticDensity(Estimator): @@ -134,4 +134,4 @@ def __call__(self, 
stats: Dict[str, np.ndarray]) -> np.ndarray: semantic_density.append(np.sum(numerator) / np.sum(denominator)) - return np.array(semantic_density) + return -np.array(semantic_density) From c652a519bf43a305490478bf81ee5a82f8a78970 Mon Sep 17 00:00:00 2001 From: Roman Vashurin Date: Sun, 30 Mar 2025 21:23:37 +0400 Subject: [PATCH 97/97] Fix gpt naming --- src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py index 7e3a170df..ac95bfb58 100644 --- a/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py +++ b/src/lm_polygraph/generation_metrics/gpt_judge_accuracy.py @@ -33,7 +33,10 @@ def __init__(self, model="gpt-4o-mini", sample=False, sample_strategy="First", a def __str__(self): if self.sample == True: - return f"GptAccuracy_{self.model}_{self.sample_strategy}" + if self.sample_strategy == "First": + return f"SampleGptAccuracy_{self.model}" + else: + return f"{self.sample_strategy}GptAccuracy_{self.model}" return f"GptAccuracy_{self.model}" def _filter_input(self, input):