aggregate_repo_eval.py
#!/usr/bin/env python3
"""Aggregate repo-specific eval run files into a Markdown or JSON summary."""
from __future__ import annotations

import argparse
import json
from pathlib import Path


def load_json(path: Path) -> dict:
    return json.loads(path.read_text(encoding="utf-8"))


def clamp_score(value) -> float:
    # Map a missing score to -1.0 so the caller can flag the task as incomplete.
    if value is None:
        return -1.0
    return float(value)


def summarize_run(manifest: dict, run: dict) -> dict:
    weights = manifest.get("scoring", {})
    weight_keys = list(weights.keys())
    manifest_task_ids = {task["id"] for task in manifest.get("tasks", [])}
    run_tasks = {task["id"]: task for task in run.get("tasks", [])}
    missing = sorted(manifest_task_ids - set(run_tasks.keys()))

    completed_scores = []
    dimension_totals = {key: [] for key in weight_keys}
    incomplete = []
    for task_id in sorted(manifest_task_ids):
        task = run_tasks.get(task_id)
        if not task:
            incomplete.append(task_id)
            continue
        scores = task.get("scores", {})
        if any(clamp_score(scores.get(key)) < 0 for key in weight_keys):
            incomplete.append(task_id)
            continue
        # Scores are on a 0-5 scale; normalize to 0-1 and apply the manifest weight.
        normalized = sum((float(scores[key]) / 5.0) * float(weights[key]) for key in weight_keys)
        completed_scores.append(normalized)
        for key in weight_keys:
            dimension_totals[key].append(float(scores[key]))

    coverage = 0.0
    if manifest_task_ids:
        coverage = len(completed_scores) / len(manifest_task_ids)
    average = sum(completed_scores) / len(completed_scores) if completed_scores else 0.0
    dimensions = {
        key: (sum(values) / len(values) if values else 0.0)
        for key, values in dimension_totals.items()
    }
    return {
        "run_id": run.get("run_id", "unknown"),
        "variant": run.get("variant", "unknown"),
        "model": run.get("model", "unknown"),
        "notes": run.get("notes", ""),
        "average_score": average,
        "average_score_pct": average * 100.0,
        "coverage": coverage,
        "coverage_pct": coverage * 100.0,
        "completed_tasks": len(completed_scores),
        "total_tasks": len(manifest_task_ids),
        "dimension_averages": dimensions,
        "missing_tasks": missing,
        "incomplete_tasks": incomplete,
    }


def print_markdown(summary: list[dict], weight_keys: list[str]) -> None:
    print('# Repo Eval Summary')
    print()
    if not summary:
        print('- No run files found')
        return
    print('| Variant | Model | Avg Score | Coverage | Completed |')
    print('| --- | --- | ---: | ---: | ---: |')
    for item in summary:
        print(
            f"| `{item['variant']}` | `{item['model']}` | {item['average_score_pct']:.1f} | {item['coverage_pct']:.1f}% | {item['completed_tasks']}/{item['total_tasks']} |"
        )
    print()
    for item in summary:
        print(f"## {item['variant']} ({item['run_id']})")
        print(f"- Model: `{item['model']}`")
        print(f"- Average score: `{item['average_score_pct']:.1f}`")
        print(f"- Coverage: `{item['coverage_pct']:.1f}%`")
        for key in weight_keys:
            print(f"- {key}: `{item['dimension_averages'][key]:.2f}` / 5")
        if item['notes']:
            print(f"- Notes: {item['notes']}")
        if item['missing_tasks']:
            print(f"- Missing tasks: `{', '.join(item['missing_tasks'])}`")
        if item['incomplete_tasks']:
            print(f"- Incomplete tasks: `{', '.join(item['incomplete_tasks'])}`")
        print()


def main() -> int:
    parser = argparse.ArgumentParser(description="Aggregate repo-specific eval run files.")
    parser.add_argument("eval_dir", help="Directory containing manifest.json and runs/")
    parser.add_argument("--json", action="store_true", help="Emit JSON instead of Markdown")
    parser.add_argument("--fail-under", type=float, default=None, help="Fail if the top average score is below this percentage")
    args = parser.parse_args()

    eval_dir = Path(args.eval_dir).resolve()
    manifest = load_json(eval_dir / 'manifest.json')
    weight_keys = list(manifest.get('scoring', {}).keys())
    runs_dir = eval_dir / 'runs'
    run_files = sorted(runs_dir.glob('*.json')) if runs_dir.exists() else []
    summaries = [summarize_run(manifest, load_json(path)) for path in run_files]
    # Best run first: highest average score, then highest coverage.
    summaries.sort(key=lambda item: (item['average_score'], item['coverage']), reverse=True)

    if args.json:
        print(json.dumps(summaries, ensure_ascii=False, indent=2))
    else:
        print_markdown(summaries, weight_keys)

    if args.fail_under is not None:
        top = summaries[0]['average_score_pct'] if summaries else 0.0
        if top < args.fail_under:
            raise SystemExit(1)
    return 0


if __name__ == '__main__':
    raise SystemExit(main())
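
# A minimal usage sketch. The directory name, scoring keys, task ids, and field values
# below are illustrative assumptions, not files shipped with this repo; the expected
# layout is inferred from load_json() and summarize_run() above.
#
#   python aggregate_repo_eval.py evals/example --fail-under 70
#
#   evals/example/manifest.json:
#     {"scoring": {"correctness": 0.6, "style": 0.4},
#      "tasks": [{"id": "task-001"}, {"id": "task-002"}]}
#
#   evals/example/runs/baseline.json:
#     {"run_id": "run-1", "variant": "baseline", "model": "example-model", "notes": "",
#      "tasks": [{"id": "task-001", "scores": {"correctness": 4, "style": 5}}]}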