# utils.py
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from torchtext.data.metrics import bleu_score


# --- Data Handling ---
class TranslationDataset(Dataset):
    # MODIFIED: Added max_len to the constructor
    def __init__(self, src_file, tgt_file, src_tokenizer, tgt_tokenizer,
                 src_sos, src_eos, tgt_sos, tgt_eos, max_len=500):
        self.src_tokenizer, self.tgt_tokenizer = src_tokenizer, tgt_tokenizer
        self.src_sos, self.src_eos = src_sos, src_eos
        self.tgt_sos, self.tgt_eos = tgt_sos, tgt_eos
        self.max_len = max_len
        with open(src_file, 'r', encoding='utf-8') as f:
            self.src_lines = [l.strip() for l in f if l.strip()]
        with open(tgt_file, 'r', encoding='utf-8') as f:
            self.tgt_lines = [l.strip() for l in f if l.strip()]

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        # Tokenize the raw sentences first
        src_ids = self.src_tokenizer.encode(self.src_lines[idx]).ids
        tgt_ids = self.tgt_tokenizer.encode(self.tgt_lines[idx]).ids
        # MODIFIED: Truncate the token lists if they are too long.
        # We subtract 2 to leave space for the [SOS] and [EOS] tokens.
        if len(src_ids) > self.max_len - 2:
            src_ids = src_ids[:self.max_len - 2]
        if len(tgt_ids) > self.max_len - 2:
            tgt_ids = tgt_ids[:self.max_len - 2]
        # Add special tokens after truncation
        src_tokens = [self.src_sos] + src_ids + [self.src_eos]
        tgt_tokens = [self.tgt_sos] + tgt_ids + [self.tgt_eos]
        return torch.tensor(src_tokens), torch.tensor(tgt_tokens)

def collate_fn(batch, pad_idx):
    src_batch, tgt_batch = [item[0] for item in batch], [item[1] for item in batch]
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=pad_idx)
    tgt_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=pad_idx)
    return src_padded, tgt_padded
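
# Usage sketch: how the pieces above fit together. The tokenizer and file
# names are illustrative, not part of this module; `pad_idx` is assumed to be
# the [PAD] token id shared by both vocabularies.
#
#     from functools import partial
#     from torch.utils.data import DataLoader
#
#     dataset = TranslationDataset('train.src', 'train.tgt', src_tok, tgt_tok,
#                                  src_sos, src_eos, tgt_sos, tgt_eos, max_len=500)
#     loader = DataLoader(dataset, batch_size=32, shuffle=True,
#                         collate_fn=partial(collate_fn, pad_idx=pad_idx))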

# --- Decoding Strategies ---
def translate_greedy(model, src_tokenizer, tgt_tokenizer, src_sentence, device, special_tokens, max_len=50):
    model.eval()
    src_tokens = [special_tokens['src_sos']] + src_tokenizer.encode(src_sentence).ids + [special_tokens['src_eos']]
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(0).to(device)
    src_mask = (src_tensor == special_tokens['pad'])
    with torch.no_grad():
        memory = model.encode(src_tensor, src_mask)
        ys = torch.ones(1, 1).fill_(special_tokens['tgt_sos']).type(torch.long).to(device)
        for _ in range(max_len - 1):
            # MODIFIED: Manually create the look-ahead mask
            sz = ys.size(1)
            tgt_mask = torch.tril(torch.ones(sz, sz, device=device)).bool()
            out = model.decode(ys, memory, tgt_mask, memory_key_padding_mask=src_mask)
            prob = out[:, -1]  # The from-scratch model's decode output is the logits
            _, next_word = torch.max(prob, dim=1)
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src_tensor.data).fill_(next_word.item())], dim=1)
            if next_word.item() == special_tokens['tgt_eos']:
                break
    tgt_tokens = [tgt_tokenizer.id_to_token(tok) for tok in ys.squeeze().tolist()]
    # Match the bare token names: the first token has no leading space, so
    # replacing " [SOS]" would leave the [SOS] marker at the start of the output.
    return " ".join(tgt_tokens).replace("[SOS]", "").replace("[EOS]", "").strip()

def translate_beam_search(model, src_tokenizer, tgt_tokenizer, src_sentence, device, special_tokens, beam_width=5, max_len=50):
    """
    Performs a batched beam search: all live beams are decoded in a single
    forward pass per step instead of one pass per beam.
    """
    model.eval()
    # --- 1. Initial Setup ---
    src_tokens = [special_tokens['src_sos']] + src_tokenizer.encode(src_sentence).ids + [special_tokens['src_eos']]
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(0).to(device)
    src_mask = (src_tensor == special_tokens['pad'])
    with torch.no_grad():
        memory = model.encode(src_tensor, src_mask)
    # --- 2. Initialize Beams ---
    sequences = torch.ones(1, 1).fill_(special_tokens['tgt_sos']).type(torch.long).to(device)
    scores = torch.zeros(1).to(device)
    finished_beams = []
    for step in range(max_len):
        if sequences.shape[0] == 0:
            break
        # --- 3. Batched Decode Step ---
        with torch.no_grad():
            current_beam_count = sequences.shape[0]
            # MODIFIED: Use .repeat() instead of .expand() for memory contiguity.
            # .expand() creates a view, .repeat() creates a new tensor with copied data.
            # The .view() operation inside the attention layer requires a contiguous tensor.
            memory_repeated = memory.repeat(current_beam_count, 1, 1)
            src_mask_repeated = src_mask.repeat(current_beam_count, 1)
            sz = sequences.size(1)
            tgt_mask = torch.tril(torch.ones(sz, sz, device=device)).bool()
            out = model.decode(sequences, memory_repeated, tgt_mask, memory_key_padding_mask=src_mask_repeated)
            log_probs = F.log_softmax(out[:, -1], dim=-1)
        # --- 4. Find Top-K Candidates Globally ---
        new_scores = scores.unsqueeze(1) + log_probs
        flattened_scores = new_scores.view(-1)
        top_scores, top_indices = torch.topk(flattened_scores, beam_width)
        # --- 5. Reconstruct Beams ---
        # Each flattened index encodes a (beam, token) pair: e.g. with 2 beams
        # and a vocab of 5, index 7 -> beam 7 // 5 = 1, token 7 % 5 = 2.
        vocab_size = log_probs.shape[1]
        beam_indices = top_indices // vocab_size
        token_indices = top_indices % vocab_size
        # --- 6. Update Beams and Handle Finished Sequences ---
        next_sequences = []
        next_scores = []
        for i in range(beam_width):
            beam_idx = beam_indices[i]
            token_idx = token_indices[i]
            score = top_scores[i]
            new_seq = torch.cat([sequences[beam_idx], token_idx.view(1)])
            if token_idx.item() == special_tokens['tgt_eos']:
                # Length-normalize the score of a finished beam (simplified
                # length penalty, alpha = 0.7) so longer hypotheses are not
                # unfairly penalized.
                finished_beams.append({'seq': new_seq, 'score': score / new_seq.size(0)**0.7})
            else:
                next_sequences.append(new_seq.unsqueeze(0))
                next_scores.append(score)
        if not next_sequences:
            break
        sequences = torch.cat(next_sequences, dim=0)
        scores = torch.stack(next_scores).to(device)  # stack, not torch.tensor(), since the entries are 0-dim tensors
    # --- 7. Select Best Beam ---
    if not finished_beams:
        if len(sequences) > 0:
            best_seq = sequences[0]  # topk returns sorted scores, so beam 0 is the best unfinished beam
        else:  # Handle case where all beams are finished on the first step
            return ""
    else:
        best_beam = sorted(finished_beams, key=lambda x: x['score'], reverse=True)[0]
        best_seq = best_beam['seq']
    best_seq_list = best_seq.squeeze().tolist()
    tgt_tokens = [tgt_tokenizer.id_to_token(tok) for tok in best_seq_list]
    return " ".join(tgt_tokens).replace("[SOS]", "").replace("[EOS]", "").strip()

def translate_topk(model, src_tokenizer, tgt_tokenizer, src_sentence, device, special_tokens, k=10, max_len=50):
    model.eval()
    src_tokens = [special_tokens['src_sos']] + src_tokenizer.encode(src_sentence).ids + [special_tokens['src_eos']]
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(0).to(device)
    src_mask = (src_tensor == special_tokens['pad'])
    with torch.no_grad():
        memory = model.encode(src_tensor, src_mask)
    ys = torch.ones(1, 1).fill_(special_tokens['tgt_sos']).type(torch.long).to(device)
    for _ in range(max_len - 1):
        with torch.no_grad():
            # MODIFIED: Manually create the look-ahead mask
            sz = ys.size(1)
            tgt_mask = torch.tril(torch.ones(sz, sz, device=device)).bool()
            logits = model.decode(ys, memory, tgt_mask, memory_key_padding_mask=src_mask)[:, -1]
        # Renormalize over the k most likely tokens, sample a position, then
        # map that position back to a vocabulary id with gather.
        top_k_logits, top_k_indices = torch.topk(logits, k, dim=-1)
        probabilities = F.softmax(top_k_logits, dim=-1)
        next_word = top_k_indices.gather(-1, torch.multinomial(probabilities, 1))
        ys = torch.cat([ys, next_word], dim=1)
        if next_word.item() == special_tokens['tgt_eos']:
            break
    tgt_tokens = [tgt_tokenizer.id_to_token(tok) for tok in ys.squeeze().tolist()]
    return " ".join(tgt_tokens).replace("[SOS]", "").replace("[EOS]", "").strip()

# --- BLEU Score Calculation ---
def greedy_decode_batch(model, src, src_mask, max_len, device, special_tokens):
    batch_size = src.shape[0]
    with torch.no_grad():
        memory = model.encode(src, src_mask)
    ys = torch.ones(batch_size, 1).fill_(special_tokens['tgt_sos']).type(torch.long).to(device)
    finished = torch.zeros(batch_size, dtype=torch.bool).to(device)
    for _ in range(max_len - 1):
        with torch.no_grad():
            # MODIFIED: Manually create the look-ahead mask
            sz = ys.size(1)
            tgt_mask = torch.tril(torch.ones(sz, sz, device=device)).bool()
            # The src_mask for the decoder's cross-attention is the memory_key_padding_mask
            out = model.decode(ys, memory, tgt_mask, memory_key_padding_mask=src_mask)
        prob = out[:, -1]  # The output is already logits
        _, next_word = torch.max(prob, dim=1)
        ys = torch.cat([ys, next_word.unsqueeze(1)], dim=1)
        finished |= (next_word == special_tokens['tgt_eos'])
        if finished.all():
            break
    return ys

def calculate_bleu(model, dataset, dataloader, src_tokenizer, tgt_tokenizer, device, args, special_tokens):
    model.eval()
    all_candidates, all_references = [], []
    tokens_to_remove = {'[SOS]', '[EOS]', '[PAD]', '[UNK]'}
    if args.decoding_strategy == 'greedy':
        progress_bar = tqdm(dataloader, desc='Calculating BLEU (Greedy Batch)', leave=False)
        for src, tgt in progress_bar:
            src = src.to(device)
            src_mask = (src == special_tokens['pad'])
            gen_tokens = greedy_decode_batch(model, src, src_mask, tgt.shape[1] + 5, device, special_tokens)
            for tokens in gen_tokens.cpu().tolist():
                # Truncate at the first [EOS]: sequences that finish early keep
                # receiving argmax tokens until the whole batch is done, and
                # those trailing tokens should not count against BLEU.
                if special_tokens['tgt_eos'] in tokens:
                    tokens = tokens[:tokens.index(special_tokens['tgt_eos'])]
                all_candidates.append([tok for tok in tgt_tokenizer.decode(tokens).split() if tok not in tokens_to_remove])
            for tokens in tgt.cpu().tolist():
                all_references.append([[tok for tok in tgt_tokenizer.decode(tokens).split() if tok not in tokens_to_remove]])
    else:
        progress_bar = tqdm(range(len(dataset)), desc=f'Calculating BLEU ({args.decoding_strategy})', leave=False)
        for i in progress_bar:
            if args.decoding_strategy == 'beam':
                candidate_sent = translate_beam_search(model, src_tokenizer, tgt_tokenizer, dataset.src_lines[i], device, special_tokens, args.beam_width)
            elif args.decoding_strategy == 'topk':
                candidate_sent = translate_topk(model, src_tokenizer, tgt_tokenizer, dataset.src_lines[i], device, special_tokens, args.top_k)
            all_candidates.append(candidate_sent.split())
            all_references.append([dataset.tgt_lines[i].split()])
    return bleu_score(all_candidates, all_references) * 100
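
# Usage sketch: `args` only needs `.decoding_strategy`, `.beam_width`, and
# `.top_k` attributes, so any namespace works in place of an argparse result
# (names and values below are illustrative):
#
#     from types import SimpleNamespace
#     args = SimpleNamespace(decoding_strategy='greedy', beam_width=5, top_k=10)
#     bleu = calculate_bleu(model, val_dataset, val_loader, src_tok, tgt_tok,
#                           device, args, special_tokens)
#     print(f'BLEU: {bleu:.2f}')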