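"""Evaluate a model's saved outputs on the Armenian-language benchmarks.

Loads the configured datasets, scores the JSON outputs found under
model_outputs/<model_name>/, and writes aggregate results to
model_outputs/<model_name>/results.json.
"""
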
import argparse
import json

import yaml

from score_calculation import calculate_scores_for_mmlu, calculate_scores_for_exam_tests
from load_configuration import load_config, validate_model_config, validate_dataset_config
from eval_datasets.mmlu_pro.load_mmlu_pro import load_mmlu_pro_hy
from eval_datasets.armenian_history.load_armenian_history_test import load_armenian_history_test
from eval_datasets.armenian_language_and_literature.load_armenian_language_test import load_armenian_language_test
from eval_datasets.mathematics.load_matemathics_test import load_mathematics_test


def evaluation(model_config_path=None, dataset_config_path=None):
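    """Score the model's saved outputs on each selected dataset and write results.json.

    Assumes generation has already produced
    model_outputs/<model_name>/<dataset>_output.json for every dataset being scored.
    """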
    results_json = {"mmlu_results": [], "unified_exam_results": []}

    if not model_config_path:
        raise ValueError("'model_config_path' is required.")

    model_config = load_config(model_config_path)
    model_config = validate_model_config(model_config)
    model_name = model_config['model_name'].replace('/', '_')
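
    # Map each dataset name to its loader; the keys also serve as output-file prefixes.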
    dataset_loaders = {
        'Armenian language and literature': load_armenian_language_test,
        'Armenian history': load_armenian_history_test,
        'Mathematics': load_mathematics_test,
        'mmlu_pro': load_mmlu_pro_hy,
    }
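
    # An explicit dataset config can narrow the run; otherwise all datasets are evaluated.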
    if dataset_config_path:
        dataset_config = load_config(dataset_config_path)
        dataset_list = validate_dataset_config(dataset_config)
        if not dataset_list:
            dataset_list = list(dataset_loaders.keys())
    else:
        dataset_list = list(dataset_loaders.keys())

    datasets = {name: dataset_loaders[name]() for name in dataset_list}
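
    # Score each dataset from the model's previously generated output file.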
    unified_exam_scores = []
    for df_name, df in datasets.items():
        outputs_file = f"model_outputs/{model_name}/{df_name}_output.json"
        if df_name == 'mmlu_pro':
            results = calculate_scores_for_mmlu(outputs_file, df, model_name)
            for category, score in results.items():
                if category != "Model":
                    results_json["mmlu_results"].append({"category": category, "score": round(score, 4)})
        else:
            results = calculate_scores_for_exam_tests(outputs_file, df, model_name)
            unified_exam_scores.append(results["score"])
            results_json["unified_exam_results"].append({
                "category": df_name,
                "score": round(results["score"], 4),
            })
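
    # Report an average only when all three unified exam subjects were scored.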
    if len(unified_exam_scores) == 3:
        average_score = sum(unified_exam_scores) / len(unified_exam_scores)
        results_json["unified_exam_results"].append({
            "category": 'Average',
            "score": round(average_score, 4),
        })
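
    # Persist the aggregate scores alongside the model's raw outputs.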
    json_output_path = f"model_outputs/{model_name}/results.json"
    with open(json_output_path, "w", encoding="utf-8") as f:
        json.dump(results_json, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate a model on multiple datasets.")
    parser.add_argument("--config", help="Path to the YAML config file.", default="configs/config.yaml")
    parser.add_argument("--model_config", help="Path to the model configuration file.", default="configs/model_config.yaml")
    parser.add_argument("--dataset_config", help="Path to the dataset configuration file.", default=None)
    args = parser.parse_args()
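
    # Paths given in the top-level YAML config take precedence over the CLI values.
    # A minimal configs/config.yaml might look like (dataset path illustrative):
    #   model_config: configs/model_config.yaml
    #   dataset_config: configs/dataset_config.yaml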
    if args.config:
        with open(args.config, "r") as file:
            # safe_load returns None for an empty file; fall back to an empty dict.
            yaml_config = yaml.safe_load(file) or {}
        args.model_config = yaml_config.get("model_config", args.model_config)
        args.dataset_config = yaml_config.get("dataset_config", args.dataset_config)

    evaluation(
        model_config_path=args.model_config,
        dataset_config_path=args.dataset_config,
    )