facebookresearch · john-b-yang · Jun 17, 2026
diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py
@@ -9,6 +9,7 @@
 import typer
 
 from programbench.cli.blob import app as blob_app
+from programbench.cli.submit import app as submit_app
 from programbench.constants import DOCKER_CPUS
 
 app = typer.Typer(
@@ -18,6 +19,7 @@
     context_settings={"help_option_names": ["-h", "--help"]},
 )
 app.add_typer(blob_app, name="blob")
+app.add_typer(submit_app, name="submit")
 
 
 @app.callback()

diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py
@@ -0,0 +1,208 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Submission lifecycle commands: package an eval run, verify a submission, register it, recombine eval.json."""
+
+from pathlib import Path
+
+import typer
+
+# Sub-app mounted as `programbench submit`; the help line lists all four commands defined below.
+app = typer.Typer(no_args_is_help=True, help="Prepare, check, register, and reassemble leaderboard submissions.")
+
+
+@app.command()
+def package(
+    run_dir: Path = typer.Argument(
+        ..., help="A `programbench eval` run directory (<run_dir>/<iid>/submission.tar.gz)."
+    ),
+    upload_to: str = typer.Option(
+        "",
+        "--upload-to",
+        metavar="ORG[/DATASET]",
+        help="Upload submission.tar.gz and the heavy eval.log.json to a HuggingFace dataset, "
+        "replacing each with a .url + .sha256. A bare org (e.g. 'programbench') creates a "
+        "per-submission dataset org/<run-dir-name>; pass 'org/name' to use an exact dataset.",
+    ),
+    overwrite: bool = typer.Option(
+        False, "--overwrite", help="With --upload-to, re-upload files already present on HF (default: skip them)."
+    ),
+) -> None:
+    """Turn an evaluated run directory into a leaderboard submission, in place.
+
+    Writes a submission.yaml manifest and _stats/score.json, and splits each large
+    eval.json into a light eval.json (kept) + a heavy <iid>.eval.log.json (raw log +
+    failure text) so the repo stays git-pushable. With --upload-to, the heavy files and
+    the submission.tar.gz artifacts are uploaded to HuggingFace. System metadata and
+    trajectories are left as TODO.
+
+    \b
+    Examples:
+        programbench submit package output/my-run
+        programbench submit package output/my-run --upload-to programbench
+    """
+    # Imported lazily, inside the command, rather than at module import time.
+    from rich.console import Console
+
+    from programbench.package import package_run
+
+    # The option defaults to ""; package_run is handed None when no upload target was given.
+    outcome = package_run(run_dir, upload_to=upload_to or None, overwrite=overwrite)
+    headline = outcome.headline
+
+    # One-line summary: instance counts, then mean_score (scaled by 100 for display) and resolved %.
+    summary = (
+        f"Packaged [bold]{len(outcome.packaged)}[/bold] instance(s) in [bold]{outcome.run_dir}[/bold] "
+        f"(skipped {len(outcome.skipped)} unknown). "
+        f"mean_score={headline.mean_score * 100:.1f} resolved={headline.resolved_pct:.1f}%"
+    )
+    next_steps = (
+        "[dim]Each eval.json was split into eval.json + <iid>.eval.log.json (recombine with "
+        "`programbench submit recombine`). Next: fill in submission.yaml + add traj.json files.[/dim]"
+    )
+
+    out = Console()
+    out.print(summary)
+    out.print(next_steps)
+
+
+@app.command()
+def verify(
+    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
+    tier1: bool = typer.Option(
+        False, "--tier1", help="Also re-run `programbench eval` and check artifacts reproduce the results (Docker)."
+    ),
+    workers: int = typer.Option(1, "-w", "--workers", help="Instance workers for the Tier-1 re-eval."),
+    filter_spec: str = typer.Option(
+        "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex."
+    ),
+) -> None:
+    """Verify a submission against its own claimed results.
+
+    Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json
+    files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves
+    each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the
+    reported scores.
+
+    \b
+    Examples:
+        programbench submit verify ./their-submission
+        programbench submit verify ./their-submission --tier1 -w 4
+    """
+    from rich.console import Console
+    from rich.table import Table
+
+    from programbench.verify import verify_tier0, verify_tier1
+
+    # --workers and --filter are only forwarded on the Tier-1 path; Tier 0 takes the directory alone.
+    if tier1:
+        report = verify_tier1(submission_dir, workers=workers, filter_spec=filter_spec)
+    else:
+        report = verify_tier0(submission_dir)
+
+    # Four columns: check name, claimed value, recomputed value, pass/fail mark.
+    table = Table(title=f"Tier-{report.tier} verification", box=None)
+    for header, column_opts in (
+        ("Check", {"style": "bold"}),
+        ("Claimed", {"justify": "right"}),
+        ("Computed", {"justify": "right"}),
+        ("", {"justify": "center"}),
+    ):
+        table.add_column(header, **column_opts)
+    for check in report.checks:
+        mark = "✅" if check.ok else "❌"
+        table.add_row(check.name, str(check.claimed), str(check.computed), mark)
+
+    out = Console()
+    out.print(table)
+    if not report.ok:
+        out.print("[bold red]FAIL[/bold red] — discrepancies found above.")
+        # Non-zero exit so scripts and CI can gate on the verification result.
+        raise typer.Exit(1)
+    out.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.")
+
+
+@app.command()
+def register(
+    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
+    registry: str = typer.Option(
+        "", "--registry", help="Registry repo to PR against (default: ProgramBench/submissions)."
+    ),
+    source: str = typer.Option(
+        "", "--source", help="Public URL of this submission's repo (default: autodetected from its git remote)."
+    ),
+    commit: str = typer.Option(
+        "", "--commit", help="Commit SHA that was scored (default: autodetected from its git HEAD)."
+    ),
+    dry_run: bool = typer.Option(
+        False, "--dry-run", help="Build the registry entry locally and print the plan; touch no network."
+    ),
+    verify: bool = typer.Option(
+        True, "--verify/--no-verify", help="Run a Tier-0 verify gate before registering (default: on)."
+    ),
+) -> None:
+    """Register a packaged submission on the leaderboard by opening a PR to the registry.
+
+    The PR adds a small submissions/<id>/ entry: a pointer.yaml (the submission repo URL +
+    the exact commit scored) plus the submission.yaml and _stats/ copied from this run. The
+    source URL and commit are read from the run directory's own git remote/HEAD. With `gh`
+    installed the registry is forked and the PR opened for you; otherwise the entry is left
+    committed on a branch and the steps to push + open the PR are printed.
+
+    \b
+    Examples:
+        programbench submit register ./my-run --dry-run
+        programbench submit register ./my-run
+    """
+    import tempfile
+
+    from rich.console import Console
+    from rich.markup import escape
+
+    from programbench.register import REGISTRY_DEFAULT, build_plan, register_submission, write_entry
+
+    def _plain(value: object) -> str:
+        """Return *value* as literal text so Rich does not parse its brackets as markup tags."""
+        return escape(str(value))
+
+    console = Console()
+    registry = registry or REGISTRY_DEFAULT
+
+    # Inside this function `verify` is the --verify/--no-verify flag, not the `verify` command.
+    if verify:
+        from programbench.verify import verify_tier0
+
+        if not verify_tier0(submission_dir).ok:
+            # Point the hint at the directory that was actually checked; it need not be the cwd.
+            console.print(
+                "[bold red]FAIL[/bold red] — Tier-0 verification failed; fix the submission (or pass "
+                "--no-verify) before registering. "
+                f"Run `programbench submit verify {_plain(submission_dir)}` to see the mismatch."
+            )
+            raise typer.Exit(1)
+
+    # Explicit --source/--commit values replace whatever build_plan put on the plan.
+    plan = build_plan(submission_dir, registry)
+    if source:
+        plan.source = source
+    if commit:
+        plan.commit = commit
+
+    if dry_run:
+        # Write the entry into a throwaway directory just to list the files it would contain.
+        with tempfile.TemporaryDirectory() as tmp:
+            entry = write_entry(plan, submission_dir, Path(tmp))
+            files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file())
+        # Plan fields are free-form text (YAML, PR markdown, paths); escape them so bracketed
+        # content such as "[a, b]" or "[link](url)" is printed verbatim instead of being
+        # consumed as Rich style tags.
+        console.print(
+            f"[bold]Would register[/bold] [cyan]{_plain(plan.submission_id)}[/cyan] to {_plain(plan.registry)}"
+        )
+        console.print(f"  branch: {_plain(plan.branch)}")
+        console.print(f"  source: {_plain(plan.source)}\n  commit: {_plain(plan.commit)}")
+        console.print("  files:  " + _plain(", ".join(f"submissions/{plan.submission_id}/{f}" for f in files)))
+        console.print(f"\n[dim]pointer.yaml:[/dim]\n{_plain(plan.pointer.rstrip())}")
+        console.print(f"\n[dim]PR title:[/dim] {_plain(plan.title)}\n[dim]PR body:[/dim]\n{_plain(plan.body)}")
+        console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. Drop --dry-run to register.[/dim]")
+        return
+
+    # NOTE(review): the --source/--commit overrides were applied to `plan` above, but this call
+    # receives only the directory and the registry. Confirm register_submission does not need
+    # the overrides forwarded; otherwise they only affect the dry-run preview.
+    result = register_submission(submission_dir, registry)
+    if result.pr_url:
+        console.print(
+            f"[bold green]Opened PR[/bold green] for {_plain(plan.submission_id)}: {_plain(result.pr_url)}"
+        )
+    else:
+        # next_steps is printed as-is: it may carry its own Rich markup (TODO confirm).
+        console.print(
+            f"[bold]Prepared[/bold] registry entry for {_plain(plan.submission_id)}.\n{result.next_steps}"
+        )
+
+
+@app.command()
+def recombine(
+    run_dir: Path = typer.Argument(..., help="A packaged run/submission directory."),
+) -> None:
+    """Reverse `package`'s eval split: fold each <iid>.eval.log.json back into its
+    eval.json, restoring the original full eval output.
+
+    The heavy file is read locally, or downloaded from its .url if it was uploaded to HF.
+
+    \b
+    Examples:
+        programbench submit recombine ./their-submission
+    """
+    from rich.console import Console
+
+    from programbench.submission import recombine_eval_json
+
+    # iterdir() below would raise a raw traceback on a missing path or a plain file;
+    # report it as a usage error instead.
+    if not run_dir.is_dir():
+        raise typer.BadParameter(f"not a directory: {run_dir}", param_hint="RUN_DIR")
+
+    # Every immediate subdirectory is offered to recombine_eval_json, in sorted order, with
+    # the directory name passed as its second argument; the return values are summed.
+    instance_dirs = sorted(p for p in run_dir.iterdir() if p.is_dir())
+    n = sum(recombine_eval_json(d, d.name) for d in instance_dirs)
+    Console().print(f"Recombined [bold]{n}[/bold] eval.json file(s) in {run_dir}")
diff --git a/src/programbench/data/templates/README.md.j2 b/src/programbench/data/templates/README.md.j2
@@ -0,0 +1,78 @@
+<p align="center">
+  <a href="https://programbench.com"><img src="https://programbench.com/static/images/fox_hero_200.png" width="110" alt="ProgramBench"></a>
+</p>
+
+> A submission to the **[ProgramBench](https://programbench.com)** leaderboard — *can language models rebuild programs from scratch?*  ·  [Leaderboard](https://programbench.com)  ·  [How to submit](https://programbench.com/blog/submission-guide)
+
+# [Submission Name Here]
+
+<!-- Manifest, scores, and per-test results live in `submission.yaml` and `_stats/`. This file
+is for the things the manifest can't capture — please fill in the sections below. -->
+
+## System overview
+
+<!-- One or two paragraphs: what is your system and how does it work end to end? Cover
+     the model (exact id/version and key settings like temperature / reasoning effort),
+     the agent/scaffold (framework + version, prompting, tools, step limits), and your
+     test-time strategy (single attempt, best-of-N, iterative test/fix, ...). -->
+
+## Reproducing this run
+
+<!-- The exact commands to reproduce this submission, ideally runnable as-is. -->
+
+```bash
+# 1. install the agent / dependencies
+# 2. run inference per task (no internet, per the eval protocol)
+# 3. programbench eval <run-dir>
+# 4. programbench submit package <run-dir> --upload-to <org>
+```
+
+## Extra stats (optional)
+
+The leaderboard can show stats beyond `score` — e.g. cost or model calls. These are
+**optional**, and each must be **computed by a script that reads your trajectories**, not
+entered by hand: the number has to be recoverable from the run. `programbench` ships no
+calculators (it makes no assumptions about your scaffold) — write your own that reads each
+`traj.json` and emits a flat `{instance_id: value}` map to `_stats/<name>.json`, and ship
+the script here (e.g. under `_scripts/`) so the numbers are reproducible.
+
+## Links
+
+<!-- Optional: agent/scaffold code, model card, paper, blog post. -->
+
+## Submission checklist
+
+- [ ] Ran `programbench eval` → `programbench submit package` to produce this submission
+- [ ] Filled in every `submission.yaml` field (no `TODO` left), including `is_os_model` / `is_os_scaffold`
+- [ ] Trajectories (`traj.json`) included for every task (agent submissions)
+- [ ] Solutions present — inline `submission.tar.gz`, or a hosted `submission.tar.gz.url` + `.sha256`
+- [ ] Any extra stats (cost/calls) were produced by a trajectory-reading script shipped here, not hand-written
+- [ ] Filled in the System overview and Reproducing sections above
+- [ ] `programbench submit verify .` passes
+- [ ] Made this fork public
+- [ ] Opened a registration PR to the submissions repo
+
+## Integrity attestations
+
+- [ ] Solutions were produced **only** from behavioral observation of the binary and its
+      bundled docs — no source code, repositories, mirrors, or package registries were consulted
+- [ ] The model was not given internet access during evaluation
+- [ ] The model did not have access to any unit tests during evaluation
+- [ ] I consent to re-evaluation, and to flagging or removal if it contradicts the reported results
+
+## Auditing
+
+Anyone can independently check this submission with the following instructions:
+
+```bash
+git clone <your-submission-repo> {{ submission_id }}
+cd {{ submission_id }}
+uvx programbench submit verify .          # Tier-0: recompute the score from this repo's eval.json and check it matches submission.yaml (instant, offline)
+uvx programbench submit verify . --tier1  # Tier-1: download each submission.tar.gz from HuggingFace, re-run evaluation, and confirm it reproduces the score (Docker)
+```
+
+* Tier-0 is self-contained. It reads the per-instance `eval.json` here plus the bundled test
+metadata.
+* Tier-1 additionally fetches the hosted solutions and the hidden tests and re-runs
+them, so the reported `score` is reproduced from scratch. (Cost/calls are self-reported from
+the trajectories; only `score` is independently re-verifiable.)
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2
@@ -0,0 +1,28 @@
+# Generated by `programbench submit package` from: {{ run_dir }}
+# [auto] fields are recomputed on every `package`; all other fields are preserved.
+schema_version: 1
+
+submission_id: {{ submission_id | tojson }}
+submitter:
+  name: {{ submitter_name | tojson }}
+  contact: {{ submitter_contact | tojson }}    # email or @github
+  affiliation: {{ affiliation | tojson }}
+
+system:
+  agent: {{ agent | tojson }}                   # scaffold/harness; "none" for a pure human submission
+  description_url: {{ description_url | tojson }}
+  is_os_model: {{ is_os_model | tojson }}       # true if the model's weights are openly available
+  is_os_scaffold: {{ is_os_scaffold | tojson }} # true if the agent/scaffold is open source
+  model: {{ model | tojson }}                   # display name used on the leaderboard
+  provider: {{ provider | tojson }}
+  type: {{ system_type | tojson }}              # single-agent | multi-agent | other
+
+eval:
+  programbench_version: {{ programbench_version | tojson }}   # [auto]
+
+headline:                # [auto] score summary from evaluation; other stats live in _stats/
+  mean_score: {{ mean_score }}
+  resolved_pct: {{ resolved_pct }}
+  near_resolved_pct: {{ near_resolved_pct }}
+  n_instances_attempted: {{ n_attempted }}
+  n_instances_total: {{ n_total }}