app = typer.Typer(no_args_is_help=True, help="Prepare, check, and reassemble leaderboard submissions.")


@app.command()
def package(
    run_dir: Path = typer.Argument(
        ..., help="A `programbench eval` run directory (<run_dir>/<instance_id>/submission.tar.gz)."
    ),
    upload_to: str = typer.Option(
        "",
        "--upload-to",
        metavar="ORG[/DATASET]",
        help="Upload submission.tar.gz and the heavy eval.log.json to a HuggingFace dataset, "
        "replacing each with a .url + .sha256. A bare org (e.g. 'programbench') creates a "
        "per-submission dataset org/<run-name>; pass 'org/name' to use an exact dataset.",
    ),
    overwrite: bool = typer.Option(
        False, "--overwrite", help="With --upload-to, re-upload files already present on HF (default: skip them)."
    ),
) -> None:
    """Turn an evaluated run directory into a leaderboard submission, in place.

    Writes a submission.yaml manifest and _stats/score.json, and splits each large
    <iid>.eval.json into a light eval.json (kept) + a heavy <iid>.eval.log.json (raw log +
    failure text) so the repo stays git-pushable. With --upload-to, the heavy files and
    the submission.tar.gz artifacts are uploaded to HuggingFace. System metadata and
    trajectories are left as TODO.

    \b
    Examples:
      programbench submit package output/my-run
      programbench submit package output/my-run --upload-to programbench
    """
    # Heavy imports are deferred so `--help` stays fast.
    from rich.console import Console

    from programbench.package import package_run

    # The CLI uses "" for "not given"; the library expects None.
    result = package_run(run_dir, upload_to=upload_to or None, overwrite=overwrite)
    console = Console()
    console.print(
        f"Packaged [bold]{len(result.packaged)}[/bold] instance(s) in [bold]{result.run_dir}[/bold] "
        f"(skipped {len(result.skipped)} unknown). "
        f"mean_score={result.headline.mean_score * 100:.1f} resolved={result.headline.resolved_pct:.1f}%"
    )
    console.print(
        "[dim]Each eval.json was split into eval.json + .eval.log.json (recombine with "
        "`programbench submit recombine`). Next: fill in submission.yaml + add traj.json files.[/dim]"
    )


@app.command()
def verify(
    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
    tier1: bool = typer.Option(
        False, "--tier1", help="Also re-run `programbench eval` and check artifacts reproduce the results (Docker)."
    ),
    workers: int = typer.Option(1, "-w", "--workers", help="Instance workers for the Tier-1 re-eval."),
    filter_spec: str = typer.Option(
        "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex."
    ),
) -> None:
    """Verify a submission against its own claimed results.

    Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json
    files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves
    each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the
    reported scores.

    Exits with status 1 when any check fails.

    \b
    Examples:
      programbench submit verify ./their-submission
      programbench submit verify ./their-submission --tier1 -w 4
    """
    from rich.console import Console
    from rich.table import Table

    from programbench.verify import verify_tier0, verify_tier1

    result = (
        verify_tier1(submission_dir, workers=workers, filter_spec=filter_spec)
        if tier1
        else verify_tier0(submission_dir)
    )

    # One row per check: what the manifest claims vs. what was recomputed.
    table = Table(title=f"Tier-{result.tier} verification", box=None)
    table.add_column("Check", style="bold")
    table.add_column("Claimed", justify="right")
    table.add_column("Computed", justify="right")
    table.add_column("", justify="center")
    for c in result.checks:
        table.add_row(c.name, str(c.claimed), str(c.computed), "✅" if c.ok else "❌")
    console = Console()
    console.print(table)
    if result.ok:
        console.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.")
    else:
        console.print("[bold red]FAIL[/bold red] — discrepancies found above.")
        raise typer.Exit(1)


@app.command()
def register(
    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
    registry: str = typer.Option(
        "", "--registry", help="Registry repo to PR against (default: ProgramBench/submissions)."
    ),
    source: str = typer.Option(
        "", "--source", help="Public URL of this submission's repo (default: autodetected from its git remote)."
    ),
    commit: str = typer.Option(
        "", "--commit", help="Commit SHA that was scored (default: autodetected from its git HEAD)."
    ),
    dry_run: bool = typer.Option(
        False, "--dry-run", help="Build the registry entry locally and print the plan; touch no network."
    ),
    verify: bool = typer.Option(
        True, "--verify/--no-verify", help="Run a Tier-0 verify gate before registering (default: on)."
    ),
) -> None:
    """Register a packaged submission on the leaderboard by opening a PR to the registry.

    The PR adds a small submissions/<submission_id>/ entry: a pointer.yaml (the submission
    repo URL + the exact commit scored) plus the submission.yaml and _stats/ copied from this
    run. The source URL and commit are read from the run directory's own git remote/HEAD.
    With `gh` installed the registry is forked and the PR opened for you; otherwise the entry
    is left committed on a branch and the steps to push + open the PR are printed.

    --source/--commit currently only affect the --dry-run preview: a real registration
    always records the autodetected remote and HEAD, so passing them without --dry-run is
    rejected instead of silently registering different values.

    \b
    Examples:
      programbench submit register ./my-run --dry-run
      programbench submit register ./my-run
    """
    # NOTE: the `verify` parameter shadows the `verify` command above inside this function only.
    import tempfile

    from rich.console import Console

    from programbench.register import REGISTRY_DEFAULT, build_plan, register_submission, write_entry

    console = Console()
    registry = registry or REGISTRY_DEFAULT
    overridden = bool(source or commit)

    # register_submission() is called with (submission_dir, registry) only and derives
    # source/commit from git itself, so an override cannot reach the real registration.
    # Fail loudly rather than open a PR that points somewhere the user did not ask for.
    if overridden and not dry_run:
        console.print(
            "[bold red]FAIL[/bold red] — --source/--commit are only applied to the --dry-run preview; "
            "a real registration always records the run directory's own git remote and HEAD. "
            "Check out and push the commit you want scored, then re-run without these options."
        )
        raise typer.Exit(1)

    if verify:
        from programbench.verify import verify_tier0

        if not verify_tier0(submission_dir).ok:
            console.print(
                "[bold red]FAIL[/bold red] — Tier-0 verification failed; fix the submission (or pass "
                "--no-verify) before registering. Run `programbench submit verify .` to see the mismatch."
            )
            raise typer.Exit(1)

    plan = build_plan(submission_dir, registry)
    if source:
        plan.source = source
    if commit:
        plan.commit = commit

    if dry_run:
        # Materialize the entry in a throwaway directory just to list the files it would contain.
        with tempfile.TemporaryDirectory() as tmp:
            entry = write_entry(plan, submission_dir, Path(tmp))
            files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file())
        console.print(f"[bold]Would register[/bold] [cyan]{plan.submission_id}[/cyan] to {plan.registry}")
        console.print(f"  branch: {plan.branch}")
        console.print(f"  source: {plan.source}\n  commit: {plan.commit}")
        console.print("  files:  " + ", ".join(f"submissions/{plan.submission_id}/{f}" for f in files))
        if overridden:
            # plan.pointer / plan.body were rendered by build_plan() before the overrides were applied.
            console.print(
                "[yellow]Note[/yellow] — pointer.yaml and the PR body below still show the autodetected "
                "source/commit; --source/--commit are not applied to a real registration."
            )
        console.print(f"\n[dim]pointer.yaml:[/dim]\n{plan.pointer.rstrip()}")
        console.print(f"\n[dim]PR title:[/dim] {plan.title}\n[dim]PR body:[/dim]\n{plan.body}")
        console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. Drop --dry-run to register.[/dim]")
        return

    result = register_submission(submission_dir, registry)
    if result.pr_url:
        console.print(f"[bold green]Opened PR[/bold green] for {plan.submission_id}: {result.pr_url}")
    else:
        console.print(f"[bold]Prepared[/bold] registry entry for {plan.submission_id}.\n{result.next_steps}")


@app.command()
def recombine(
    run_dir: Path = typer.Argument(..., help="A packaged run/submission directory."),
) -> None:
    """Reverse `package`'s eval split: fold each <iid>.eval.log.json back into its
    <iid>.eval.json, restoring the original full eval output.

    The heavy file is read locally, or downloaded from its .url if it was uploaded to HF.

    \b
    Examples:
      programbench submit recombine ./their-submission
    """
    from rich.console import Console

    from programbench.submission import recombine_eval_json

    # Each instance directory is named after its instance id; the helper returns True per recombine.
    instance_dirs = sorted(p for p in run_dir.iterdir() if p.is_dir())
    n = sum(recombine_eval_json(d, d.name) for d in instance_dirs)
    Console().print(f"Recombined [bold]{n}[/bold] eval.json file(s) in {run_dir}")

+ ProgramBench +

+ +> A submission to the **[ProgramBench](https://programbench.com)** leaderboard — *can language models rebuild programs from scratch?* · [Leaderboard](https://programbench.com) · [How to submit](https://programbench.com/blog/submission-guide) + +# [Submission Name Here] + + + +## System overview + + + +## Reproducing this run + + + +```bash +# 1. install the agent / dependencies +# 2. run inference per task (no internet, per the eval protocol) +# 3. programbench eval <run-dir> +# 4. programbench submit package <run-dir> --upload-to <org> +``` + +## Extra stats (optional) + +The leaderboard can show stats beyond `score` — e.g. cost or model calls. These are +**optional**, and each must be **computed by a script that reads your trajectories**, not +entered by hand: the number has to be recoverable from the run. `programbench` ships no +calculators (it makes no assumptions about your scaffold) — write your own that reads each +`traj.json` and emits a flat `{instance_id: value}` map to `_stats/<stat>.json`, and ship +the script here (e.g. under `_scripts/`) so the numbers are reproducible. 
+ +## Links + + + +## Submission checklist + +- [ ] Ran `programbench eval` → `programbench submit package` to produce this submission +- [ ] Filled in every `submission.yaml` field (no `TODO` left), including `is_os_model` / `is_os_scaffold` +- [ ] Trajectories (`traj.json`) included for every task (agent submissions) +- [ ] Solutions present — inline `submission.tar.gz`, or a hosted `submission.tar.gz.url` + `.sha256` +- [ ] Any extra stats (cost/calls) were produced by a trajectory-reading script shipped here, not hand-written +- [ ] Filled in the System overview and Reproducing sections above +- [ ] `programbench submit verify .` passes +- [ ] Made this fork public +- [ ] Opened a registration PR to the submissions repo + +## Integrity attestations + +- [ ] Solutions were produced **only** from behavioral observation of the binary and its + bundled docs — no source code, repositories, mirrors, or package registries were consulted +- [ ] The model was not given internet access during evaluation +- [ ] The model did not have access to any unit tests during evaluation +- [ ] I consent to re-evaluation, and to flagging or removal if it contradicts the reported results + +## Auditing + +Anyone can independently check this submission with the following instructions: + +```bash +git clone <this-repo-url> +cd {{ submission_id }} +uvx programbench submit verify . # Tier-0: recompute the score from this repo's eval.json and check it matches submission.yaml (instant, offline) +uvx programbench submit verify . --tier1 # Tier-1: download each submission.tar.gz from HuggingFace, re-run evaluation, and confirm it reproduces the score (Docker) +``` + +* Tier-0 is self-contained. It reads the per-instance `eval.json` here plus the bundled test +metadata. +* Tier-1 additionally fetches the hosted solutions and the hidden tests and re-runs +them, so the reported `score` is reproduced from scratch. (Cost/calls are self-reported from +the trajectories; only `score` is independently re-verifiable.) 
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2 new file mode 100644 index 0000000..061353d --- /dev/null +++ b/src/programbench/data/templates/submission.yaml.j2 @@ -0,0 +1,28 @@ +# Generated by `programbench submit package` from: {{ run_dir }} +# [auto] fields are recomputed on every `package`; all other fields are preserved. +schema_version: 1 + +submission_id: {{ submission_id | tojson }} +submitter: + name: {{ submitter_name | tojson }} + contact: {{ submitter_contact | tojson }} # email or @github + affiliation: {{ affiliation | tojson }} + +system: + agent: {{ agent | tojson }} # scaffold/harness; "none" for a pure human submission + description_url: {{ description_url | tojson }} + is_os_model: {{ is_os_model | tojson }} # true if the model's weights are openly available + is_os_scaffold: {{ is_os_scaffold | tojson }} # true if the agent/scaffold is open source + model: {{ model | tojson }} # display name used on the leaderboard + provider: {{ provider | tojson }} + type: {{ system_type | tojson }} # single-agent | multi-agent | other + +eval: + programbench_version: {{ programbench_version | tojson }} # [auto] + +headline: # [auto] score summary from evaluation; other stats live in _stats/ + mean_score: {{ mean_score }} + resolved_pct: {{ resolved_pct }} + near_resolved_pct: {{ near_resolved_pct }} + n_instances_attempted: {{ n_attempted }} + n_instances_total: {{ n_total }} diff --git a/src/programbench/package.py b/src/programbench/package.py new file mode 100644 index 0000000..a86bcea --- /dev/null +++ b/src/programbench/package.py @@ -0,0 +1,216 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Turn a ``programbench eval`` run directory into a leaderboard submission, in place. + +Packaging is purely eval-derived. 
log = logging.getLogger(__name__)

# Placeholder written for manifest fields the author still has to fill in.
TODO = "TODO"

# Author-entered manifest fields preserved across re-packaging: template var -> (path, default).
_CARRIED = {
    "affiliation": ("submitter.affiliation", ""),
    "agent": ("system.agent", TODO),
    "description_url": ("system.description_url", "README.md"),
    "is_os_model": ("system.is_os_model", False),
    "is_os_scaffold": ("system.is_os_scaffold", False),
    "model": ("system.model", TODO),
    "provider": ("system.provider", TODO),
    "submitter_contact": ("submitter.contact", TODO),
    "submitter_name": ("submitter.name", TODO),
    "system_type": ("system.type", "single-agent"),
}


@dataclass
class PackageResult:
    """What :func:`package_run` did."""

    run_dir: Path  # the run directory, exactly as passed in
    packaged: list[str]  # instance ids that were scored and packaged
    skipped: list[str]  # directories with results that are not known benchmark instances
    headline: Headline  # aggregate score summary written to submission.yaml


def _dig(d: dict, dotted: str):
    """Look up a dotted path (``"a.b.c"``) in nested dicts.

    Returns None when a key is missing or an intermediate value is not a dict
    (including when ``d`` itself is not a dict, e.g. an empty YAML file).
    """
    for key in dotted.split("."):
        if not isinstance(d, dict):
            return None
        d = d.get(key)
    return d


def _carried_values(run_dir: Path) -> dict:
    """Return the author-entered manifest values to carry over, keyed by template variable.

    Values come from an existing ``run_dir/submission.yaml`` when present; anything
    missing falls back to the default declared in ``_CARRIED``.
    """
    manifest_path = run_dir / "submission.yaml"
    existing = {}
    if manifest_path.exists():
        # An empty file parses to None; _dig() treats any non-dict as "nothing set".
        existing = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
    # Use "is None" (not "or") so a real False/empty value is preserved, not clobbered.
    return {
        var: (default if (val := _dig(existing, path)) is None else val) for var, (path, default) in _CARRIED.items()
    }


def _upload_artifacts(
    api, dataset: str, pending: list[tuple[Path, str, str]], existing: set[str], overwrite: bool
) -> None:
    """Upload all pending files to HF, then replace each with a .url + .sha256 and delete it.

    ``pending`` is (instance_dir, instance_id, filename) — submission.tar.gz and the heavy
    .eval.log.json. Files already on HF are skipped unless ``overwrite``. Uses
    ``upload_large_folder`` (resumable, multi-commit, retrying) since logs can be hundreds
    of MB and a single big commit is fragile; files are hard-linked into a staging tree so
    nothing is copied.

    Every pending file ends up replaced locally by its sidecars, including files that were
    skipped because they were already present on HF.
    """
    # Hash first, while the local files still exist.
    for instance_dir, iid, fname in pending:
        digest = sha256_file(instance_dir / fname)
        (instance_dir / f"{fname}.sha256").write_text(digest + "\n", encoding="utf-8")

    to_upload = [(d, iid, f) for d, iid, f in pending if overwrite or f"{iid}/{f}" not in existing]
    if to_upload:
        # Stage inside the run dir so the hard links stay on the same filesystem.
        run_dir = pending[0][0].parent
        with tempfile.TemporaryDirectory(dir=run_dir) as tmp:
            staging = Path(tmp)
            for instance_dir, iid, fname in to_upload:
                dst = staging / iid / fname
                dst.parent.mkdir(parents=True, exist_ok=True)
                try:
                    os.link(instance_dir / fname, dst)  # same-fs hardlink: no copy
                except OSError:
                    shutil.copy2(instance_dir / fname, dst)
            log.info("Uploading %d file(s) to %s (resumable)", len(to_upload), dataset)
            api.upload_large_folder(repo_id=dataset, folder_path=str(staging), repo_type="dataset")

    # Only reached when the upload did not raise: swap each local file for its pointer.
    for instance_dir, iid, fname in pending:
        (instance_dir / f"{fname}.url").write_text(
            f"https://huggingface.co/datasets/{dataset}/resolve/main/{iid}/{fname}\n", encoding="utf-8"
        )
        (instance_dir / fname).unlink()


def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = False) -> PackageResult:
    """Package an evaluated run directory as a leaderboard submission, in place.

    Args:
        run_dir: A ``programbench eval`` run directory with one sub-directory per instance.
        upload_to: HuggingFace target, either a bare org (dataset becomes ``org/<run name>``)
            or an explicit ``org/name``. None keeps everything local.
        overwrite: With ``upload_to``, re-upload files already present in the dataset.

    Returns:
        A :class:`PackageResult` listing packaged and skipped instances and the headline.

    Raises:
        ValueError: If no instance directory has both an eval.json and a solution.
    """
    instances = benchmark_instances()
    run_name = run_dir.resolve().name

    # Pass 1 (read-only): decide what is packageable before touching HuggingFace, so a
    # mistyped or empty run directory fails without creating a public dataset.
    skipped: list[str] = []
    candidates: list[tuple[Path, str, Path]] = []
    for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
        iid = instance_dir.name
        eval_json = instance_dir / f"{iid}.eval.json"
        has_solution = (instance_dir / "submission.tar.gz").exists() or (
            instance_dir / "submission.tar.gz.url"
        ).exists()
        if not (eval_json.exists() and has_solution):
            continue
        if iid not in instances:
            log.warning("Skipping %s (not a known ProgramBench instance)", iid)
            skipped.append(iid)
            continue
        candidates.append((instance_dir, iid, eval_json))

    if not candidates:
        raise ValueError(f"No packageable instances found under {run_dir}")

    api = dataset = None
    existing: set[str] = set()
    if upload_to:
        # Each submission gets its own dataset: bare "org" -> "org/<run name>";
        # an explicit "org/name" is used as-is.
        dataset = upload_to if "/" in upload_to else f"{upload_to}/{run_name}"
        from huggingface_hub import HfApi

        api = HfApi()
        api.create_repo(dataset, repo_type="dataset", exist_ok=True)
        # Force public so `verify`/`recombine` can fetch the artifacts anonymously
        # (orgs may default new datasets to private).
        api.update_repo_settings(dataset, repo_type="dataset", private=False)
        existing = set(api.list_repo_files(dataset, repo_type="dataset"))

    # Pass 2: score, then split, each packageable instance.
    test_maps: dict[str, dict[str, bool]] = {}
    packaged: list[str] = []
    pending: list[tuple[Path, str, str]] = []
    for instance_dir, iid, eval_json in candidates:
        test_maps[iid] = test_results_map(eval_json, instances[iid])
        # Split the (potentially huge) eval.json into a light eval.json + a heavy
        # .eval.log.json (log + failure text); they recombine to the original.
        split_eval_json(instance_dir, iid)
        if api:
            for fname in (f"{iid}.eval.log.json", "submission.tar.gz"):
                if (instance_dir / fname).exists():
                    pending.append((instance_dir, iid, fname))
        packaged.append(iid)

    # Write the scoring-derived artifacts first; they don't depend on the upload, so a
    # failed/throttled upload leaves them correct and the run simply resumable.
    # score.json is per-test ({iid: {test: passed}}) so scores can be recomputed later
    # while striking out specific tests; the manifest headline is the score with no
    # tests struck.
    write_stat(run_dir, "score", test_maps)
    scores = {iid: score_from_tests(m) for iid, m in test_maps.items()}
    headline = aggregate(scores, len(instances))

    carried = _carried_values(run_dir)
    env = Environment(loader=PackageLoader("programbench", "data/templates"), autoescape=False)
    manifest = env.get_template("submission.yaml.j2").render(
        run_dir=run_dir,
        submission_id=run_name,
        programbench_version=version("programbench"),
        mean_score=headline.mean_score,
        resolved_pct=headline.resolved_pct,
        near_resolved_pct=headline.near_resolved_pct,
        n_attempted=headline.n_instances_attempted,
        n_total=headline.n_instances_total,
        **carried,
    )
    (run_dir / "submission.yaml").write_text(manifest + "\n", encoding="utf-8")

    # README is created once (a starting point for the author); never overwritten.
    readme = run_dir / "README.md"
    if not readme.exists():
        # Explicit UTF-8: the template contains non-ASCII characters that a legacy
        # locale encoding may be unable to encode.
        readme.write_text(
            env.get_template("README.md.j2").render(
                submission_id=run_name,
                mean_pct=round(headline.mean_score * 100, 1),
                resolved_pct=headline.resolved_pct,
                n_attempted=headline.n_instances_attempted,
                n_total=headline.n_instances_total,
                **carried,
            ),
            encoding="utf-8",
        )

    if api and pending:
        _upload_artifacts(api, dataset, pending, existing, overwrite)

    return PackageResult(run_dir, packaged, skipped, headline)
+ + submissions// + pointer.yaml # source repo URL + the exact commit that was scored + submission.yaml # copied from the submission + _stats/*.json # copied from the submission + +This builds that entry against a clone of the registry (default +github.com/ProgramBench/submissions) and opens the PR. With ``gh`` it forks the registry +and opens the PR for you; without it, it leaves the commit on a branch in a clone and +prints the compare URL so you can open the PR by hand. +""" + +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import yaml + +REGISTRY_DEFAULT = "https://github.com/ProgramBench/submissions" + + +def _git(cwd: Path, *args: str) -> str: + return subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True).stdout.strip() + + +def _to_https(url: str) -> str: + """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL.""" + url = url.removesuffix(".git") + if url.startswith("git@"): + host, path = url[4:].split(":", 1) + return f"https://{host}/{path}" + return url + + +def _slug(registry: str) -> str: + """``https://github.com/Owner/Repo`` -> ``Owner/Repo`` (what ``gh`` expects).""" + return _to_https(registry).removeprefix("https://github.com/") + + +@dataclass +class RegisterPlan: + submission_id: str + source: str + commit: str + registry: str + branch: str + pointer: str # rendered pointer.yaml + files: list[str] # entry-relative paths that will be added + title: str + body: str + + +@dataclass +class RegisterResult: + plan: RegisterPlan + pr_url: str | None # set when a PR was opened (gh path) + next_steps: str | None # set when manual steps remain (no-gh path) + + +def build_plan(submission_dir: Path, registry: str) -> RegisterPlan: + sub_id = submission_dir.resolve().name + manifest = yaml.safe_load((submission_dir / "submission.yaml").read_text()) + source = _to_https(_git(submission_dir, "remote", "get-url", "origin")) + commit 
= _git(submission_dir, "rev-parse", "HEAD") + pointer = yaml.safe_dump({"submission_id": sub_id, "source": source, "commit": commit}, sort_keys=False) + files = ["pointer.yaml", "submission.yaml"] + [ + f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json")) + ] + system, head = manifest["system"], manifest["headline"] + body = ( + f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n" + f"- mean score: {head['mean_score'] * 100:.1f}\n" + f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n" + f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n" + f"Source: {source}\nCommit: `{commit}`\n\n" + "Tier-0 verified (`programbench submit verify .`)." + ) + return RegisterPlan( + sub_id, source, commit, registry, f"add-{sub_id}", pointer, files, f"Add submission: {sub_id}", body + ) + + +def write_entry(plan: RegisterPlan, submission_dir: Path, registry_root: Path) -> Path: + """Materialize ``submissions//`` under ``registry_root`` (overwriting any existing entry).""" + entry = registry_root / "submissions" / plan.submission_id + if entry.exists(): + shutil.rmtree(entry) + (entry / "_stats").mkdir(parents=True) + (entry / "pointer.yaml").write_text(plan.pointer) + shutil.copyfile(submission_dir / "submission.yaml", entry / "submission.yaml") + for p in sorted((submission_dir / "_stats").glob("*.json")): + shutil.copyfile(p, entry / "_stats" / p.name) + return entry + + +def register_submission(submission_dir: Path, registry: str) -> RegisterResult: + """Clone the registry, commit the entry on a branch, and open the PR. + + Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward. + Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual + push + compare-URL steps in ``next_steps`` (so the clone must outlive this call). 
+ """ + plan = build_plan(submission_dir, registry) + slug = _slug(registry) + clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions" + + if shutil.which("gh"): + # Fork the registry under the authed user (no-op if it exists) and clone the fork; + # origin -> fork, upstream -> registry. + subprocess.run( + ["gh", "repo", "fork", slug, "--clone", "--default-branch-only", str(clone)], + check=True, + capture_output=True, + text=True, + ) + _git(clone, "checkout", "-b", plan.branch) + write_entry(plan, submission_dir, clone) + _git(clone, "add", f"submissions/{plan.submission_id}") + _git(clone, "commit", "-m", plan.title) + _git(clone, "push", "-u", "origin", plan.branch) + pr_url = subprocess.run( + ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body], + cwd=clone, + check=True, + capture_output=True, + text=True, + ).stdout.strip() + shutil.rmtree(clone.parent) + return RegisterResult(plan, pr_url, None) + + # No gh: clone the registry directly, commit the branch, and hand back the steps. + _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone)) + _git(clone, "checkout", "-b", plan.branch) + write_entry(plan, submission_dir, clone) + _git(clone, "add", f"submissions/{plan.submission_id}") + _git(clone, "commit", "-m", plan.title) + steps = ( + "`gh` not found, so the PR was not opened. The entry is committed on branch " + f"`{plan.branch}` in:\n {clone}\n\n" + "To finish, from that clone push the branch to your fork of the registry and open a PR:\n" + " git remote add fork https://github.com//submissions\n" + f" git push -u fork {plan.branch}\n" + f" {_to_https(registry)}/compare/main...:{plan.branch}?expand=1" + ) + return RegisterResult(plan, None, steps) diff --git a/src/programbench/submission.py b/src/programbench/submission.py new file mode 100644 index 0000000..71b8b34 --- /dev/null +++ b/src/programbench/submission.py @@ -0,0 +1,227 @@ +# Copyright (c) Meta Platforms, Inc. 
# and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Shared helpers for building (`package`) and checking (`verify`) submissions.

Both commands must score a run directory the same way, so the scoring and headline
aggregation live here and are imported by each command.
"""

import hashlib
import json
import logging
import shutil
import subprocess
import tarfile
import tempfile
import urllib.request
from collections.abc import Set as AbstractSet
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Optional

import yaml

from programbench.eval.eval import EvaluationResult
from programbench.utils.load_data import get_active_branches, get_ignored_tests, load_all_instances

log = logging.getLogger(__name__)

# Score at or above which an instance counts as resolved / near-resolved in the headline.
RESOLVED_THRESHOLD = 1.0
NEAR_RESOLVED_THRESHOLD = 0.95
# Instance ids starting with this prefix are excluded by `benchmark_instances`.
FIXTURE_PREFIX = "testorg__"


def benchmark_instances() -> dict[str, dict]:
    """Real benchmark instances, keyed by id (excludes the bundled test fixture).

    Returns:
        ``{instance_id: instance}`` for every instance from ``load_all_instances()``
        whose id does not start with ``FIXTURE_PREFIX``.
    """
    return {i["instance_id"]: i for i in load_all_instances() if not i["instance_id"].startswith(FIXTURE_PREFIX)}


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of ``path``, read in 1 MiB chunks."""
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def test_results_map(eval_json: Path, instance: dict) -> dict[str, bool]:
    """Per-test pass/fail for one instance, after the same active-branch / ignored-test
    filtering as ``info``. Keyed by each test's ``full_name``, value ``True`` iff passed.

    This is the raw material a score is computed from, so the leaderboard can later
    recompute scores while striking out specific tests (see the registry's ignore map).

    Args:
        eval_json: Path to the instance's eval.json (parsed as an ``EvaluationResult``).
        instance: Benchmark instance record; supplies the active branches and ignored tests.
    """
    # NOTE(review): full_name is presumably "{branch}/{name}" like _full_name() below —
    # confirm against EvaluationResult.
    result = EvaluationResult.model_validate_json(eval_json.read_text())
    result = result.for_branches(get_active_branches(instance)).without_ignored(get_ignored_tests(instance))
    return {t.full_name: t.is_resolved for t in result.test_results}


def score_from_tests(tests: dict[str, bool], ignore: AbstractSet[str] = frozenset()) -> float:
    """Fraction passed over the non-ignored tests (0.0 if none remain).

    Args:
        tests: Test name -> passed.
        ignore: Test names to leave out of both numerator and denominator.
    """
    kept = [passed for name, passed in tests.items() if name not in ignore]
    return sum(kept) / len(kept) if kept else 0.0


def score_instance(eval_json: Path, instance: dict) -> float:
    """Per-instance score with ignored-branch/test filtering (same logic as `info`)."""
    return score_from_tests(test_results_map(eval_json, instance))


def score_run(run_dir: Path, instances: dict[str, dict]) -> dict[str, float]:
    """Map instance_id -> score for every ``{iid}/{iid}.eval.json`` present and known.

    Sub-directories of ``run_dir`` whose name is not a key of ``instances``, or that
    have no eval.json, are skipped.
    """
    scores: dict[str, float] = {}
    for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
        iid = instance_dir.name
        eval_json = instance_dir / f"{iid}.eval.json"
        if eval_json.exists() and iid in instances:
            scores[iid] = score_instance(eval_json, instances[iid])
    return scores


def write_stat(run_dir: Path, stat: str, by_instance: dict[str, float]) -> None:
    """Write a per-instance stat file: ``{run_dir}/_stats/{stat}.json`` = ``{iid: value}``."""
    stats_dir = run_dir / "_stats"
    stats_dir.mkdir(exist_ok=True)
    (stats_dir / f"{stat}.json").write_text(json.dumps(by_instance, indent=2, sort_keys=True))


# Keys of a test result's ``extra`` dict that are moved to the heavy file.
_HEAVY_EXTRA_KEYS = ("message", "text")


def _full_name(t: dict) -> str:
    """Key for a raw test-result dict: ``branch/name``, or just ``name`` without a branch."""
    return f"{t['branch']}/{t['name']}" if t.get("branch") else t["name"]


def _read_sha256_sidecar(sidecar: Path) -> Optional[str]:
    """Return the lowercase digest stored in a ``.sha256`` sidecar, or None.

    The digest is the first whitespace-separated token, so both a bare digest and
    ``sha256sum``-style ``digest  filename`` content are accepted. A missing or empty
    sidecar yields None (no verification) instead of an IndexError.
    """
    if not sidecar.exists():
        return None
    fields = sidecar.read_text().split()
    return fields[0].lower() if fields else None


def split_eval_json(instance_dir: Path, iid: str) -> None:
    """Split ``{iid}.eval.json`` into a light eval.json + a heavy ``{iid}.eval.log.json``.

    The heavy file holds the only bulky parts — the top-level ``log`` and each failing
    test's ``message``/``text`` — keyed so the two recombine into the original content.
    Nothing is dropped; the union of the two files is the original eval.json.

    The heavy file is written before the light file is rewritten, so an interruption
    between the two writes leaves the full original eval.json on disk.
    """
    p = instance_dir / f"{iid}.eval.json"
    data = json.loads(p.read_text())
    tests = data.get("test_results") or []
    # Idempotent: if there's nothing heavy left (already split, or genuinely light), do
    # nothing — never clobber an existing eval.log.json.
    has_heavy = bool(data.get("log")) or any(k in (t.get("extra") or {}) for t in tests for k in _HEAVY_EXTRA_KEYS)
    if not has_heavy:
        return
    heavy: dict = {"log": data.get("log") or [], "failures": {}}
    for t in tests:
        extra = t.get("extra") or {}
        # pop() mutates the test's own ``extra`` dict, which is what strips the light file.
        moved = {k: extra.pop(k) for k in _HEAVY_EXTRA_KEYS if k in extra}
        if moved:
            # NOTE: keyed by full name, so this assumes names are unique within one eval.json.
            heavy["failures"][_full_name(t)] = moved
    data["log"] = []
    # Heavy first: once the light file is rewritten the moved data exists nowhere else.
    (instance_dir / f"{iid}.eval.log.json").write_text(json.dumps(heavy))
    p.write_text(json.dumps(data, indent=2))


def recombine_eval_json(instance_dir: Path, iid: str) -> bool:
    """Inverse of :func:`split_eval_json`: fold the heavy file back into ``{iid}.eval.json``,
    then remove the heavy file and its ``.url``/``.sha256``.

    The heavy file is read locally, or downloaded from ``{iid}.eval.log.json.url`` if hosted.
    A downloaded file is checked against the ``.sha256`` sidecar when one is present.

    Returns:
        True if a recombine happened; False if the light file or the heavy source is missing.

    Raises:
        ValueError: the downloaded heavy file does not match its sha256 sidecar.
    """
    light = instance_dir / f"{iid}.eval.json"
    log_file = instance_dir / f"{iid}.eval.log.json"
    url_file = instance_dir / f"{iid}.eval.log.json.url"
    sha_file = instance_dir / f"{iid}.eval.log.json.sha256"
    if not light.exists():
        return False
    if log_file.exists():
        heavy = json.loads(log_file.read_text())
    elif url_file.exists():
        with urllib.request.urlopen(url_file.read_text().strip()) as r:  # noqa: S310
            raw = r.read()
        # Verify before merging: the sidecar is deleted below, so this is the last chance.
        expected = _read_sha256_sidecar(sha_file)
        if expected and (got := hashlib.sha256(raw).hexdigest()) != expected:
            raise ValueError(f"{iid}: eval.log.json sha256 mismatch (expected {expected[:12]}…, got {got[:12]}…)")
        heavy = json.loads(raw)
    else:
        return False
    data = json.loads(light.read_text())
    data["log"] = heavy.get("log", [])
    failures = heavy.get("failures", {})
    for t in data.get("test_results") or []:
        if (name := _full_name(t)) in failures:
            extra = t.get("extra")
            if extra is None:  # key missing or explicitly null
                extra = t["extra"] = {}
            extra.update(failures[name])
    light.write_text(json.dumps(data, indent=2))
    for f in (log_file, url_file, sha_file):
        f.unlink(missing_ok=True)
    return True


@dataclass
class Headline:
    """Headline numbers for a run, as produced by :func:`aggregate`."""

    mean_score: float  # mean over attempted instances, rounded to 4 places
    resolved_pct: float  # % of n_instances_total with score >= RESOLVED_THRESHOLD
    near_resolved_pct: float  # % of n_instances_total with score >= NEAR_RESOLVED_THRESHOLD
    n_instances_attempted: int
    n_instances_total: int

    def as_dict(self) -> dict:
        """Return the fields as a plain dict."""
        return asdict(self)


def aggregate(scores: dict[str, float], n_total: int) -> Headline:
    """Aggregate per-instance scores into a :class:`Headline`.

    Args:
        scores: instance_id -> score for the attempted instances.
        n_total: Size of the full benchmark (denominator of the resolved percentages).

    Raises:
        ValueError: ``scores`` is empty or ``n_total`` is not positive.
    """
    values = list(scores.values())
    if not values:
        raise ValueError("No scored instances found")
    if n_total <= 0:
        raise ValueError(f"n_total must be positive, got {n_total}")
    n = len(values)
    # mean is over attempted instances; resolved/near are over the full benchmark
    # (an unattempted task counts as unresolved).
    return Headline(
        mean_score=round(sum(values) / n, 4),
        resolved_pct=round(100 * sum(s >= RESOLVED_THRESHOLD for s in values) / n_total, 1),
        near_resolved_pct=round(100 * sum(s >= NEAR_RESOLVED_THRESHOLD for s in values) / n_total, 1),
        n_instances_attempted=n,
        n_instances_total=n_total,
    )


def load_manifest(submission_dir: Path) -> dict:
    """Load ``submission.yaml`` from ``submission_dir``.

    Returns:
        The parsed mapping; ``{}`` for an empty file (``yaml.safe_load`` returns None there).

    Raises:
        ValueError: the file parses to something other than a mapping.
    """
    manifest_path = submission_dir / "submission.yaml"
    manifest = yaml.safe_load(manifest_path.read_text())
    if manifest is None:
        return {}
    if not isinstance(manifest, dict):
        raise ValueError(f"{manifest_path}: expected a YAML mapping, got {type(manifest).__name__}")
    return manifest


def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None:
    """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256.

    Supports the artifact forms in SPEC.md: inline file, ``.url`` (downloaded), or
    ``submission.ref.yaml`` (git checkout packed). The sha256 sidecar, when present, is
    enforced for inline/url; for git it is advisory (packing is not byte-reproducible).

    Raises:
        ValueError: no artifact form is present, or the sha256 does not match.
    """
    expected = _read_sha256_sidecar(instance_dir / "submission.tar.gz.sha256")

    inline = instance_dir / "submission.tar.gz"
    url_file = instance_dir / "submission.tar.gz.url"
    ref_file = instance_dir / "submission.ref.yaml"
    if inline.exists():
        shutil.copy2(inline, dest_tar)
    elif url_file.exists():
        urllib.request.urlretrieve(url_file.read_text().strip(), dest_tar)  # noqa: S310
    elif ref_file.exists():
        _pack_git_ref(yaml.safe_load(ref_file.read_text()), dest_tar)
        expected = None  # git packing is not byte-reproducible; rely on re-eval instead
    else:
        raise ValueError(f"{instance_dir.name}: no submission.tar.gz, .url, or .ref.yaml found")

    if expected and (got := sha256_file(dest_tar)) != expected:
        raise ValueError(f"{instance_dir.name}: sha256 mismatch (expected {expected[:12]}…, got {got[:12]}…)")


def _pack_git_ref(ref: dict, dest_tar: Path) -> None:
    """Shallow-clone ``ref["repo"]`` at ``ref["ref"]`` and pack it into ``dest_tar``.

    Only ``ref["subpath"]`` is packed when given; anything under a top-level ``.git`` is
    skipped. ``ref`` comes from a submission's ``submission.ref.yaml`` and is treated as
    untrusted.

    Raises:
        subprocess.CalledProcessError: ``git clone`` failed.
        ValueError: ``subpath`` points outside the checkout or is not a directory.
    """
    with tempfile.TemporaryDirectory() as tmp:
        src = Path(tmp) / "src"
        subprocess.run(
            # "--" ends option parsing so a repo value starting with "-" cannot be
            # interpreted by git as an option.
            ["git", "clone", "--depth", "1", "--branch", ref["ref"], "--", ref["repo"], str(src)],
            check=True,
            capture_output=True,
        )
        # Resolve both sides so symlinks (including a symlinked temp dir) cannot defeat
        # the containment check below.
        checkout = src.resolve()
        root = (checkout / ref["subpath"]).resolve() if ref.get("subpath") else checkout
        if not root.is_relative_to(checkout):
            raise ValueError(f"subpath {ref['subpath']!r} escapes the repository checkout")
        if not root.is_dir():
            # rglob() on a missing path yields nothing, which would pack an empty tarball.
            raise ValueError(f"subpath {ref.get('subpath')!r} is not a directory in the repository")
        with tarfile.open(dest_tar, "w:gz") as tar:
            for p in sorted(root.rglob("*")):
                rel = p.relative_to(root).as_posix()
                if rel.split("/", 1)[0] == ".git":
                    continue
                # Entries are added one by one (sorted), so never recurse here.
                tar.add(p, arcname=rel, recursive=False)


# ---------------------------------------------------------------------------
# diff --git a/src/programbench/verify.py b/src/programbench/verify.py
# new file mode 100644
# index 0000000..3fdfd28
# --- /dev/null
# +++ b/src/programbench/verify.py
# @@ -0,0 +1,100 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Verify a packaged submission against its own claimed results.

Tier 0 (default, no Docker): recompute the headline from the submission's own eval.json
files (with ignored-test filtering) and check it matches submission.yaml. This is the
free consistency check a third party or CI can run with only ``programbench`` installed.

Tier 1 (--tier1, Docker): resolve each submission.tar.gz, re-run ``programbench eval``,
and confirm the freshly produced scores match the submitted eval.json. This is what
proves the artifacts actually yield the reported results.
"""

import logging
import tempfile
from dataclasses import dataclass
from pathlib import Path

from programbench.submission import (
    Headline,
    aggregate,
    benchmark_instances,
    load_manifest,
    resolve_submission_tar,
    score_run,
)

log = logging.getLogger(__name__)

TOLERANCE = 0.011  # headline floats are rounded; allow a hair more than the last digit


@dataclass
class Check:
    """One claimed-vs-computed comparison."""

    name: str  # headline field name (tier 0) or instance id (tier 1)
    claimed: object  # value taken from the submission
    computed: object  # value recomputed by the verifier
    ok: bool  # True when the two agree within TOLERANCE


@dataclass
class VerifyResult:
    """Outcome of one verification tier."""

    tier: int
    checks: list[Check]

    @property
    def ok(self) -> bool:
        """True when at least one check ran and every check passed.

        An empty check list (e.g. a filter that matched nothing) means nothing was
        verified, so it must not be reported as a successful verification.
        """
        return bool(self.checks) and all(c.ok for c in self.checks)


def _close(a: object, b: object) -> bool:
    """Return True when ``a`` and ``b`` are numbers within ``TOLERANCE`` of each other.

    A missing value (None) on either side, a non-numeric value, or NaN compares as
    not close rather than raising.
    """
    if a is None or b is None:
        return False
    try:
        # NaN propagates through the subtraction and fails the <= comparison.
        return abs(float(a) - float(b)) <= TOLERANCE
    except (TypeError, ValueError):
        return False


def _headline_checks(claimed: dict, computed: Headline) -> list[Check]:
    """Build one :class:`Check` per field of ``computed``, compared with ``claimed``."""
    return [
        Check(name, claimed.get(name), value, _close(claimed.get(name), value))
        for name, value in computed.as_dict().items()
    ]


def verify_tier0(submission_dir: Path) -> VerifyResult:
    """Tier 0: recompute the headline from the submission's eval.json files and compare
    it with the ``headline`` section of submission.yaml.

    A missing, empty, or null ``headline`` yields failing checks instead of a crash.
    """
    manifest = load_manifest(submission_dir) or {}
    claimed = manifest.get("headline") or {}
    instances = benchmark_instances()
    computed = aggregate(score_run(submission_dir, instances), len(instances))
    return VerifyResult(0, _headline_checks(claimed, computed))


def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "") -> VerifyResult:
    """Tier 1: re-run the eval on each submitted artifact and compare per-instance scores.

    Args:
        submission_dir: Packaged submission directory.
        workers: Passed through to ``run_eval_batch``.
        filter_spec: Passed through to ``run_eval_batch``; when set, only instances
            that produced a fresh score are checked.

    Returns:
        A tier-1 :class:`VerifyResult` with one check per compared instance. Without a
        filter, an instance that produced no fresh score is reported as a failed check.
    """
    # Imported lazily, as in the original, so tier 0 never needs the eval machinery.
    from programbench.eval.eval_batch import run_eval_batch

    instances = benchmark_instances()
    submitted = score_run(submission_dir, instances)

    with tempfile.TemporaryDirectory() as tmp:
        run = Path(tmp)
        for iid in submitted:
            (run / iid).mkdir(parents=True)
            resolve_submission_tar(submission_dir / iid, run / iid / "submission.tar.gz")
        run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True)
        fresh = score_run(run, instances)

    checks = []
    for iid, claimed_score in submitted.items():
        fresh_score = fresh.get(iid)
        if filter_spec and fresh_score is None:
            # With a filter, a missing fresh score is taken to mean "filtered out".
            continue
        computed = round(fresh_score, 4) if fresh_score is not None else float("nan")
        checks.append(Check(iid, round(claimed_score, 4), computed, _close(claimed_score, fresh_score)))
    return VerifyResult(1, checks)