-
Notifications
You must be signed in to change notification settings - Fork 57
Expand file tree
/
Copy patheval_tooluse.py
More file actions
183 lines (148 loc) · 5.88 KB
/
eval_tooluse.py
File metadata and controls
183 lines (148 loc) · 5.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import argparse
import os
import json
import torch
import numpy as np
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import re
from collections import Counter
def parse_args():
    """Define the command-line options of the evaluation script and parse them.

    Options:
        --model_path      (str, required) path to the trained model.
        --max_new_tokens  (int, default 1024) generation length limit.
        --output_dir      (str, default None) where results are written.
        --temperature     (float, default 0.0) sampling temperature.

    Returns:
        The ``argparse.Namespace`` produced by parsing the process arguments.
    """
    # One (flag, add_argument keyword options) entry per supported option.
    option_table = (
        ("--model_path", {
            "type": str,
            "required": True,
            "help": "Path to the trained model",
        }),
        ("--max_new_tokens", {
            "type": int,
            "default": 1024,
            "help": "Maximum number of tokens to generate",
        }),
        ("--output_dir", {
            "type": str,
            "default": None,
            "help": "Directory to save evaluation results (defaults to model_path)",
        }),
        ("--temperature", {
            "type": float,
            "default": 0.0,
            "help": "Sampling temperature (0 for greedy)",
        }),
    )

    cli = argparse.ArgumentParser(description="Evaluate a model on tooluse test set")
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def load_model_and_tokenizer(model_path, gpu_memory_utilization=0.8):
    """Create the tokenizer and the vLLM engine for the checkpoint at ``model_path``.

    Args:
        model_path: Location handed to both ``AutoTokenizer.from_pretrained``
            and the ``LLM`` constructor.
        gpu_memory_utilization: Forwarded unchanged to the ``LLM`` constructor.

    Returns:
        A ``(llm, tokenizer)`` tuple.
    """
    print(f"Loading model from {model_path}")

    # The tokenizer is requested with left-side padding.
    tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left')

    # Engine options are gathered first, then passed on in a single call.
    engine_options = {
        "model": model_path,
        "gpu_memory_utilization": gpu_memory_utilization,
        "dtype": torch.bfloat16,
        "trust_remote_code": True,
    }
    return LLM(**engine_options), tokenizer
def load_test_data(tokenizer, data_dir='data/tooluse_data/eval_data'):
    """Load the tooluse evaluation set and render every prompt as a chat turn.

    Args:
        tokenizer: Object exposing ``apply_chat_template``; used to wrap each
            raw prompt as a single user message followed by the generation
            prompt.
        data_dir: Directory read with ``load_from_disk``. Defaults to the
            repository's evaluation split, so callers that pass only the
            tokenizer keep their previous behavior.

    Returns:
        A list of example dicts. Each example's ``'prompt'`` entry is
        overwritten with the chat-formatted string (not tokenized); all other
        keys are returned as loaded.
    """
    data = load_from_disk(data_dir).to_list()

    # Replace each raw prompt, in place, with its chat-template rendering.
    for example in data:
        example['prompt'] = tokenizer.apply_chat_template(
            [{'role': 'user', 'content': example['prompt']}],
            tokenize=False,
            add_generation_prompt=True,
        )
    return data
def generate_responses(llm, tokenizer, prompts, max_new_tokens=1024, temperature=0.0):
    """Generate one completion per prompt with vLLM.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params)``.
        tokenizer: Tokenizer whose ``eos_token_id`` (if set) is used as a
            stop token id.
        prompts: Prompt strings to complete.
        max_new_tokens: Passed to ``SamplingParams`` as ``max_tokens``.
        temperature: Sampling temperature passed to ``SamplingParams``.

    Returns:
        A list with the text of the first candidate of every generation
        output, in the order returned by ``llm.generate``.
    """
    eos_id = tokenizer.eos_token_id
    # Compare with None explicitly: 0 is a valid token id but is falsy, so a
    # plain truthiness test would silently drop an EOS id of 0.
    stop_token_ids = [eos_id] if eos_id is not None else None

    sampling_params = SamplingParams(
        temperature=temperature,
        max_tokens=max_new_tokens,
        stop_token_ids=stop_token_ids,
    )
    print(f"Generating responses for {len(prompts)} prompts...")
    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]
def extract_actions(text):
    """Collect, in order of appearance, each word that follows an ``Action:`` label.

    Only the leading run of word characters after the label (and any
    whitespace) is captured. Returns an empty list when no label is present.
    """
    return [hit.group(1) for hit in re.finditer(r'Action:\s*(\w+)', text)]
def extract_action_inputs(text):
    """Extract and merge all ``Action Input:`` JSON objects from a model response.

    Every ``Action Input:`` label that is followed (after optional whitespace)
    by ``{`` is decoded as a JSON object starting at that brace. Decoding with
    a real JSON parser, rather than cutting at the first ``}``, means nested
    objects and braces inside string values are handled correctly.

    Args:
        text: Raw model response.

    Returns:
        A single dict containing the keys of all successfully decoded
        objects, merged in order of appearance (later keys overwrite earlier
        ones). Blocks that are not valid JSON are skipped. Returns ``{}``
        when nothing could be decoded.
    """
    marker = re.compile(r'Action Input:\s*{')
    decoder = json.JSONDecoder()
    combined_dict = {}

    pos = 0
    while True:
        hit = marker.search(text, pos)
        if hit is None:
            break
        brace_at = hit.end() - 1  # index of the opening '{'
        try:
            # raw_decode parses exactly one JSON value and reports where it
            # ended, so any trailing text after the object is ignored.
            parsed, pos = decoder.raw_decode(text, brace_at)
        except json.JSONDecodeError:
            # Malformed block: skip it and keep scanning after the brace.
            pos = hit.end()
            continue
        combined_dict.update(parsed)
    return combined_dict
def evaluate_correctness(responses, golden_answers):
    """Score each response against its golden answer.

    A response is correct when both conditions hold:
      * the multiset of predicted action names equals the multiset of golden
        ``'Action'`` values (order is ignored, duplicates count), and
      * the merged predicted action inputs equal the merged golden
        ``'Action_Input'`` objects.

    Args:
        responses: Model response strings.
        golden_answers: One iterable per response, whose items are mappings
            with an ``'Action'`` entry and a JSON-encoded ``'Action_Input'``
            entry.

    Returns:
        A list of ints (1 for correct, 0 for incorrect), one per
        (response, golden answer) pair.
    """
    results = []
    for response, golden_answer in zip(responses, golden_answers):
        # Predicted actions and inputs parsed from the response text.
        pred_actions = extract_actions(response)
        pred_inputs = extract_action_inputs(response)

        # Ground-truth actions and merged inputs.
        gt_actions = [item['Action'] for item in golden_answer]
        gt_inputs = {}
        for item in golden_answer:
            try:
                gt_inputs.update(json.loads(item['Action_Input']))
            except (KeyError, TypeError, ValueError):
                # Best-effort merge: a golden item with a missing, non-string
                # or non-JSON-object input is skipped. Only these expected
                # failures are caught, so KeyboardInterrupt / SystemExit are
                # no longer swallowed as they were by a bare ``except``.
                continue

        actions_match = Counter(pred_actions) == Counter(gt_actions)
        inputs_match = pred_inputs == gt_inputs
        results.append(1 if (actions_match and inputs_match) else 0)
    return results
def main():
    """Run the end-to-end evaluation.

    Steps: parse arguments, load the model and test data, generate a response
    per prompt, score the responses, print a summary, then write
    ``eval_results.json`` and ``eval_responses.json`` into the output
    directory (``--output_dir`` if given, otherwise the model path).
    """
    args = parse_args()

    # Model, tokenizer and dataset.
    llm, tokenizer = load_model_and_tokenizer(args.model_path)
    test_data = load_test_data(tokenizer)
    prompts = []
    golden_answers = []
    for example in test_data:
        prompts.append(example['prompt'])
        golden_answers.append(example['golden_answer'])

    # Generation.
    responses = generate_responses(
        llm,
        tokenizer,
        prompts,
        args.max_new_tokens,
        args.temperature,
    )

    # Scoring.
    print("\nEvaluating responses...")
    scores = evaluate_correctness(responses, golden_answers)
    accuracy = np.mean(scores)
    num_correct = sum(scores)
    num_total = len(scores)

    # Console summary.
    rule = "=" * 60
    print("\n" + rule)
    print("Evaluation Results:")
    print(f" Total samples: {num_total}")
    print(f" Correct: {num_correct}")
    print(f" Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(rule)

    # Aggregate metrics file.
    output_dir = args.output_dir or args.model_path
    os.makedirs(output_dir, exist_ok=True)

    run_config = {
        "model_path": args.model_path,
        "max_new_tokens": args.max_new_tokens,
        "temperature": args.temperature,
    }
    results_to_save = {
        "accuracy": float(accuracy),
        "num_correct": int(num_correct),
        "num_total": num_total,
        "per_sample_scores": scores,
        "config": run_config,
    }
    output_path = os.path.join(output_dir, "eval_results.json")
    with open(output_path, "w") as f:
        json.dump(results_to_save, f, indent=2)
    print(f"\nSaved results to {output_path}")

    # Per-sample dump for manual inspection.
    responses_path = os.path.join(output_dir, "eval_responses.json")
    with open(responses_path, "w") as f:
        per_sample = []
        for idx, response in enumerate(responses):
            per_sample.append({
                "prompt": test_data[idx]['prompt'],
                "response": response,
                "golden_answer": golden_answers[idx],
                "correct": bool(scores[idx]),
            })
        json.dump(per_sample, f, indent=2)
    print(f"Saved responses to {responses_path}")
# Run the evaluation only when this file is executed as a script, so that
# importing the module does not start it.
if __name__ == "__main__":
    main()