Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/programbench/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import typer

from programbench.cli.blob import app as blob_app
from programbench.cli.submit import app as submit_app
from programbench.constants import DOCKER_CPUS

app = typer.Typer(
Expand All @@ -18,6 +19,7 @@
context_settings={"help_option_names": ["-h", "--help"]},
)
app.add_typer(blob_app, name="blob")
app.add_typer(submit_app, name="submit")


@app.callback()
Expand Down
208 changes: 208 additions & 0 deletions src/programbench/cli/submit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json."""

from pathlib import Path

import typer

app = typer.Typer(no_args_is_help=True, help="Prepare, check, and reassemble leaderboard submissions.")
Comment thread
klieret marked this conversation as resolved.


@app.command()
def package(
    run_dir: Path = typer.Argument(
        ..., help="A `programbench eval` run directory (<run_dir>/<iid>/submission.tar.gz)."
    ),
    upload_to: str = typer.Option(
        "",
        "--upload-to",
        metavar="ORG[/DATASET]",
        help="Upload submission.tar.gz and the heavy eval.log.json to a HuggingFace dataset, "
        "replacing each with a .url + .sha256. A bare org (e.g. 'programbench') creates a "
        "per-submission dataset org/<run-dir-name>; pass 'org/name' to use an exact dataset.",
    ),
    overwrite: bool = typer.Option(
        False, "--overwrite", help="With --upload-to, re-upload files already present on HF (default: skip them)."
    ),
) -> None:
    """Turn an evaluated run directory into a leaderboard submission, in place.

    Writes a submission.yaml manifest and _stats/score.json, and splits each large
    eval.json into a light eval.json (kept) + a heavy <iid>.eval.log.json (raw log +
    failure text) so the repo stays git-pushable. With --upload-to, the heavy files and
    the submission.tar.gz artifacts are uploaded to HuggingFace. System metadata and
    trajectories are left as TODO.

    \b
    Examples:
      programbench submit package output/my-run
      programbench submit package output/my-run --upload-to programbench
    """
    # Heavy imports are deferred so that `--help` and other subcommands stay fast.
    from rich.console import Console

    from programbench.package import package_run

    # The CLI uses "" for "option not given"; the packaging call is handed None instead.
    upload_target = upload_to if upload_to else None
    outcome = package_run(run_dir, upload_to=upload_target, overwrite=overwrite)

    # One-line summary: instance counts plus the headline numbers from the packaging result.
    headline = outcome.headline
    summary = (
        f"Packaged [bold]{len(outcome.packaged)}[/bold] instance(s) in [bold]{outcome.run_dir}[/bold] "
        f"(skipped {len(outcome.skipped)} unknown). "
        f"mean_score={headline.mean_score * 100:.1f} resolved={headline.resolved_pct:.1f}%"
    )
    # Dimmed reminder of what changed on disk and what the submitter still has to do by hand.
    follow_up = (
        "[dim]Each eval.json was split into eval.json + <iid>.eval.log.json (recombine with "
        "`programbench submit recombine`). Next: fill in submission.yaml + add traj.json files.[/dim]"
    )

    out = Console()
    out.print(summary)
    out.print(follow_up)


@app.command()
def verify(
    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
    tier1: bool = typer.Option(
        False, "--tier1", help="Also re-run `programbench eval` and check artifacts reproduce the results (Docker)."
    ),
    workers: int = typer.Option(1, "-w", "--workers", help="Instance workers for the Tier-1 re-eval."),
    filter_spec: str = typer.Option(
        "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex."
    ),
) -> None:
    """Verify a submission against its own claimed results.

    Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json
    files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves
    each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the
    reported scores.

    \b
    Examples:
      programbench submit verify ./their-submission
      programbench submit verify ./their-submission --tier1 -w 4
    """
    # Heavy imports are deferred so that `--help` and other subcommands stay fast.
    from rich.console import Console
    from rich.table import Table

    from programbench.verify import verify_tier0, verify_tier1

    # --workers and --filter are only forwarded to the Tier-1 path; Tier 0 takes the directory alone.
    if tier1:
        report = verify_tier1(submission_dir, workers=workers, filter_spec=filter_spec)
    else:
        report = verify_tier0(submission_dir)

    # One row per check: name, claimed value, recomputed value, pass/fail mark.
    summary = Table(title=f"Tier-{report.tier} verification", box=None)
    column_specs = (
        ("Check", {"style": "bold"}),
        ("Claimed", {"justify": "right"}),
        ("Computed", {"justify": "right"}),
        ("", {"justify": "center"}),
    )
    for header, options in column_specs:
        summary.add_column(header, **options)
    for check in report.checks:
        summary.add_row(check.name, str(check.claimed), str(check.computed), "✅" if check.ok else "❌")

    out = Console()
    out.print(summary)

    # A failed verification exits with status 1 so scripts and CI can gate on it.
    if not report.ok:
        out.print("[bold red]FAIL[/bold red] — discrepancies found above.")
        raise typer.Exit(1)
    out.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.")


@app.command()
def register(
    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
    registry: str = typer.Option(
        "", "--registry", help="Registry repo to PR against (default: ProgramBench/submissions)."
    ),
    source: str = typer.Option(
        "", "--source", help="Public URL of this submission's repo (default: autodetected from its git remote)."
    ),
    commit: str = typer.Option(
        "", "--commit", help="Commit SHA that was scored (default: autodetected from its git HEAD)."
    ),
    dry_run: bool = typer.Option(
        False, "--dry-run", help="Build the registry entry locally and print the plan; touch no network."
    ),
    verify: bool = typer.Option(
        True, "--verify/--no-verify", help="Run a Tier-0 verify gate before registering (default: on)."
    ),
) -> None:
    """Register a packaged submission on the leaderboard by opening a PR to the registry.

    The PR adds a small submissions/<id>/ entry: a pointer.yaml (the submission repo URL +
    the exact commit scored) plus the submission.yaml and _stats/ copied from this run. The
    source URL and commit are read from the run directory's own git remote/HEAD. With `gh`
    installed the registry is forked and the PR opened for you; otherwise the entry is left
    committed on a branch and the steps to push + open the PR are printed.

    \b
    Examples:
      programbench submit register ./my-run --dry-run
      programbench submit register ./my-run
    """
    # Heavy imports are deferred so that `--help` and other subcommands stay fast.
    import tempfile

    from rich.console import Console

    from programbench.register import REGISTRY_DEFAULT, build_plan, register_submission, write_entry

    console = Console()
    # "" is the CLI's "not given" value; fall back to the library's default registry.
    registry = registry or REGISTRY_DEFAULT

    # Inside this function `verify` is the boolean flag, not the sibling `verify` command.
    if verify:
        from programbench.verify import verify_tier0

        if not verify_tier0(submission_dir).ok:
            # Name the directory that was actually checked: the user may not be running
            # from inside the submission, in which case `verify .` would look at the wrong place.
            console.print(
                "[bold red]FAIL[/bold red] — Tier-0 verification failed; fix the submission (or pass "
                f"--no-verify) before registering. Run `programbench submit verify {submission_dir}` "
                "to see the mismatch."
            )
            raise typer.Exit(1)

    # Explicit --source / --commit values replace whatever build_plan put on the plan.
    plan = build_plan(submission_dir, registry)
    if source:
        plan.source = source
    if commit:
        plan.commit = commit

    if dry_run:
        # Materialise the entry in a throwaway directory only to list the files it would contain.
        with tempfile.TemporaryDirectory() as tmp:
            entry = write_entry(plan, submission_dir, Path(tmp))
            files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file())
        console.print(f"[bold]Would register[/bold] [cyan]{plan.submission_id}[/cyan] to {plan.registry}")
        console.print(f"  branch: {plan.branch}")
        console.print(f"  source: {plan.source}\n  commit: {plan.commit}")
        console.print("  files:  " + ", ".join(f"submissions/{plan.submission_id}/{f}" for f in files))
        console.print(f"\n[dim]pointer.yaml:[/dim]\n{plan.pointer.rstrip()}")
        console.print(f"\n[dim]PR title:[/dim] {plan.title}\n[dim]PR body:[/dim]\n{plan.body}")
        console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. Drop --dry-run to register.[/dim]")
        return

    # NOTE(review): `plan` (carrying any --source/--commit override) is not passed to
    # register_submission, which only receives the directory and registry. It looks like the
    # overrides therefore affect the dry-run output only — confirm against register_submission's
    # signature and forward the plan (or the overrides) if so.
    result = register_submission(submission_dir, registry)
    if result.pr_url:
        console.print(f"[bold green]Opened PR[/bold green] for {plan.submission_id}: {result.pr_url}")
    else:
        console.print(f"[bold]Prepared[/bold] registry entry for {plan.submission_id}.\n{result.next_steps}")


@app.command()
def recombine(
    run_dir: Path = typer.Argument(..., help="A packaged run/submission directory."),
) -> None:
    """Reverse `package`'s eval split: fold each <iid>.eval.log.json back into its
    eval.json, restoring the original full eval output.

    The heavy file is read locally, or downloaded from its .url if it was uploaded to HF.

    \b
    Examples:
      programbench submit recombine ./their-submission
    """
    # Heavy imports are deferred so that `--help` and other subcommands stay fast.
    from rich.console import Console

    from programbench.submission import recombine_eval_json

    # Fail with a regular CLI usage error instead of a raw FileNotFoundError /
    # NotADirectoryError traceback from iterdir() when the path is wrong.
    if not run_dir.is_dir():
        raise typer.BadParameter(f"{run_dir} is not a directory", param_hint="RUN_DIR")

    # Every immediate subdirectory is treated as one instance, with the directory name as its
    # instance id; sorting keeps the processing order deterministic.
    instance_dirs = sorted(p for p in run_dir.iterdir() if p.is_dir())
    # Each call's return value is summed to produce the count reported below.
    n = sum(recombine_eval_json(d, d.name) for d in instance_dirs)
    Console().print(f"Recombined [bold]{n}[/bold] eval.json file(s) in {run_dir}")
78 changes: 78 additions & 0 deletions src/programbench/data/templates/README.md.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
<p align="center">
<a href="https://programbench.com"><img src="https://programbench.com/static/images/fox_hero_200.png" width="110" alt="ProgramBench"></a>
</p>

> A submission to the **[ProgramBench](https://programbench.com)** leaderboard — *can language models rebuild programs from scratch?* · [Leaderboard](https://programbench.com) · [How to submit](https://programbench.com/blog/submission-guide)

# [Submission Name Here]

<!-- Manifest, scores, and per-test results live in `submission.yaml` and `_stats/`. This file
is for the things the manifest can't capture — please fill in the sections below. -->

## System overview

<!-- One or two paragraphs: what is your system and how does it work end to end? Cover
the model (exact id/version and key settings like temperature / reasoning effort),
the agent/scaffold (framework + version, prompting, tools, step limits), and your
test-time strategy (single attempt, best-of-N, iterative test/fix, ...). -->

## Reproducing this run

<!-- The exact commands to reproduce this submission, ideally runnable as-is. -->

```bash
# 1. install the agent / dependencies
# 2. run inference per task (no internet, per the eval protocol)
# 3. programbench eval <run-dir>
# 4. programbench submit package <run-dir> --upload-to <org>
```

## Extra stats (optional)

The leaderboard can show stats beyond `score` — e.g. cost or model calls. These are
**optional**, and each must be **computed by a script that reads your trajectories**, not
entered by hand: the number has to be recoverable from the run. `programbench` ships no
calculators (it makes no assumptions about your scaffold) — write your own that reads each
`traj.json` and emits a flat `{instance_id: value}` map to `_stats/<name>.json`, and ship
the script here (e.g. under `_scripts/`) so the numbers are reproducible.

## Links

<!-- Optional: agent/scaffold code, model card, paper, blog post. -->

## Submission checklist

- [ ] Ran `programbench eval` → `programbench submit package` to produce this submission
- [ ] Filled in every `submission.yaml` field (no `TODO` left), including `is_os_model` / `is_os_scaffold`
- [ ] Trajectories (`traj.json`) included for every task (agent submissions)
- [ ] Solutions present — inline `submission.tar.gz`, or a hosted `submission.tar.gz.url` + `.sha256`
- [ ] Any extra stats (cost/calls) were produced by a trajectory-reading script shipped here, not hand-written
- [ ] Filled in the System overview and Reproducing sections above
- [ ] `programbench submit verify .` passes
- [ ] Made this fork public
- [ ] Opened a registration PR to the submissions repo

## Integrity attestations

- [ ] Solutions were produced **only** from behavioral observation of the binary and its
bundled docs — no source code, repositories, mirrors, or package registries were consulted
- [ ] The model was not given internet access during evaluation
- [ ] The model did not have access to any unit tests during evaluation
- [ ] I consent to re-evaluation, and to flagging or removal if it contradicts the reported results

## Auditing

Anyone can independently check this submission with the following instructions:

```bash
git clone <your-submission-repo>
cd {{ submission_id }}
uvx programbench submit verify . # Tier-0: recompute the score from this repo's eval.json and check it matches submission.yaml (instant, offline)
uvx programbench submit verify . --tier1 # Tier-1: download each submission.tar.gz from HuggingFace, re-run evaluation, and confirm it reproduces the score (Docker)
```

* Tier-0 is self-contained. It reads the per-instance `eval.json` here plus the bundled test
metadata.
* Tier-1 additionally fetches the hosted solutions and the hidden tests and re-runs
them, so the reported `score` is reproduced from scratch. (Cost/calls are self-reported from
the trajectories; only `score` is independently re-verifiable.)
28 changes: 28 additions & 0 deletions src/programbench/data/templates/submission.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by `programbench submit package` from: {{ run_dir }}
# [auto] fields are recomputed on every `package`; all other fields are preserved.
Comment on lines +1 to +2
schema_version: 1

submission_id: {{ submission_id | tojson }}
submitter:
name: {{ submitter_name | tojson }}
contact: {{ submitter_contact | tojson }} # email or @github
affiliation: {{ affiliation | tojson }}

system:
agent: {{ agent | tojson }} # scaffold/harness; "none" for a pure human submission
description_url: {{ description_url | tojson }}
is_os_model: {{ is_os_model | tojson }} # true if the model's weights are openly available
is_os_scaffold: {{ is_os_scaffold | tojson }} # true if the agent/scaffold is open source
model: {{ model | tojson }} # display name used on the leaderboard
provider: {{ provider | tojson }}
type: {{ system_type | tojson }} # single-agent | multi-agent | other

eval:
programbench_version: {{ programbench_version | tojson }} # [auto]

headline: # [auto] score summary from evaluation; other stats live in _stats/
mean_score: {{ mean_score }}
resolved_pct: {{ resolved_pct }}
near_resolved_pct: {{ near_resolved_pct }}
n_instances_attempted: {{ n_attempted }}
n_instances_total: {{ n_total }}
Loading
Loading