diff --git a/lm-polygraph/src/lm_polygraph/stat_calculators/extract_claims.py b/lm-polygraph/src/lm_polygraph/stat_calculators/extract_claims.py index 5b57a0b..47c862b 100644 --- a/lm-polygraph/src/lm_polygraph/stat_calculators/extract_claims.py +++ b/lm-polygraph/src/lm_polygraph/stat_calculators/extract_claims.py @@ -242,11 +242,16 @@ def claims_from_text(self, text: str, tokens: List[int], tokenizer) -> List[Clai # Iteratively decode tokenized text until decoded sequence length is # greater or equal to the starting position of current sentence. # Find sentence location in tokens: tokens[sent_start_token_idx:sent_end_token_idx] - while len(tokenizer.decode(tokens[:sent_start_token_idx])) < sent_start_idx: - # print("while 3") + + # while len(tokenizer.decode(tokens[:sent_start_token_idx])) < sent_start_idx: + # # print("while 3") + # sent_start_token_idx += 1 + # while len(tokenizer.decode(tokens[:sent_end_token_idx])) < sent_end_idx: + # # print("while 4") + # sent_end_token_idx += 1 + while s.replace(' ', '').lower() in tokenizer.decode(tokens[sent_start_token_idx + 1:]).replace(' ', '').lower() and sent_start_token_idx < len(tokens): sent_start_token_idx += 1 - while len(tokenizer.decode(tokens[:sent_end_token_idx])) < sent_end_idx: - # print("while 4") + while s.replace(' ', '').lower() not in tokenizer.decode(tokens[:sent_end_token_idx]).replace(' ', '').lower() and sent_end_token_idx < len(tokens): sent_end_token_idx += 1 # print("(sent_start_idx, sent_end_idx, sent_start_token_idx, sent_end_token_idx):", (sent_start_idx, sent_end_idx, sent_start_token_idx, sent_end_token_idx))