diff --git a/src/programbench/cli/main.py b/src/programbench/cli/main.py
index 6a36792..85f20b9 100644
--- a/src/programbench/cli/main.py
+++ b/src/programbench/cli/main.py
@@ -9,6 +9,7 @@
 import typer
 
 from programbench.cli.blob import app as blob_app
+from programbench.cli.submit import app as submit_app
 from programbench.constants import DOCKER_CPUS
 
 app = typer.Typer(
@@ -18,6 +19,7 @@
     context_settings={"help_option_names": ["-h", "--help"]},
 )
 app.add_typer(blob_app, name="blob")
+app.add_typer(submit_app, name="submit")
 
 
 @app.callback()
diff --git a/src/programbench/cli/submit.py b/src/programbench/cli/submit.py
new file mode 100644
index 0000000..0980952
--- /dev/null
+++ b/src/programbench/cli/submit.py
@@ -0,0 +1,208 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Submission lifecycle commands: package an eval run, verify a submission, recombine eval.json."""
+
+from pathlib import Path
+
+import typer
+
+app = typer.Typer(no_args_is_help=True, help="Prepare, check, and reassemble leaderboard submissions.")
+
+
+@app.command()
+def package(
+    run_dir: Path = typer.Argument(
+        ..., help="A `programbench eval` run directory (<run_dir>/<iid>/submission.tar.gz)."
+    ),
+    upload_to: str = typer.Option(
+        "",
+        "--upload-to",
+        metavar="ORG[/DATASET]",
+        help="Upload submission.tar.gz and the heavy eval.log.json to a HuggingFace dataset, "
+        "replacing each with a .url + .sha256. A bare org (e.g. 'programbench') creates a "
+        "per-submission dataset org/<run-dir-name>; pass 'org/name' to use an exact dataset.",
+    ),
+    overwrite: bool = typer.Option(
+        False, "--overwrite", help="With --upload-to, re-upload files already present on HF (default: skip them)."
+    ),
+) -> None:
+    """Turn an evaluated run directory into a leaderboard submission, in place.
+
+    Writes a submission.yaml manifest and _stats/score.json, and splits each large
+    eval.json into a light eval.json (kept) + a heavy <iid>.eval.log.json (raw log +
+    failure text) so the repo stays git-pushable. With --upload-to, the heavy files and
+    the submission.tar.gz artifacts are uploaded to HuggingFace. System metadata and
+    trajectories are left as TODO.
+
+    \b
+    Examples:
+        programbench submit package output/my-run
+        programbench submit package output/my-run --upload-to programbench
+    """
+    from rich.console import Console
+
+    from programbench.package import package_run
+
+    outcome = package_run(run_dir, upload_to=upload_to or None, overwrite=overwrite)
+    out = Console()
+    # First message: what was packaged plus the headline; second: what the author does next.
+    for message in (
+        f"Packaged [bold]{len(outcome.packaged)}[/bold] instance(s) in [bold]{outcome.run_dir}[/bold] "
+        f"(skipped {len(outcome.skipped)} unknown). "
+        f"mean_score={outcome.headline.mean_score * 100:.1f} resolved={outcome.headline.resolved_pct:.1f}%",
+        "[dim]Each eval.json was split into eval.json + <iid>.eval.log.json (recombine with "
+        "`programbench submit recombine`). Next: fill in submission.yaml + add traj.json files.[/dim]",
+    ):
+        out.print(message)
+
+
+@app.command()
+def verify(
+    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
+    tier1: bool = typer.Option(
+        False, "--tier1", help="Also re-run `programbench eval` and check artifacts reproduce the results (Docker)."
+    ),
+    workers: int = typer.Option(1, "-w", "--workers", help="Instance workers for the Tier-1 re-eval."),
+    filter_spec: str = typer.Option(
+        "", "--filter", help="Restrict Tier-1 re-eval to instance IDs matching this regex."
+    ),
+) -> None:
+    """Verify a submission against its own claimed results.
+
+    Tier 0 (default, no Docker) recomputes the headline from the submission's eval.json
+    files and checks it matches submission.yaml. Tier 1 (--tier1) additionally resolves
+    each submission.tar.gz and re-runs evaluation to confirm the artifacts reproduce the
+    reported scores.
+
+    \b
+    Examples:
+        programbench submit verify ./their-submission
+        programbench submit verify ./their-submission --tier1 -w 4
+    """
+    from rich.console import Console
+    from rich.table import Table
+
+    from programbench.verify import verify_tier0, verify_tier1
+
+    if tier1:
+        report = verify_tier1(submission_dir, workers=workers, filter_spec=filter_spec)
+    else:
+        report = verify_tier0(submission_dir)
+
+    # One row per check: the claimed value, the recomputed value, and a pass/fail mark.
+    summary = Table(title=f"Tier-{report.tier} verification", box=None)
+    summary.add_column("Check", style="bold")
+    summary.add_column("Claimed", justify="right")
+    summary.add_column("Computed", justify="right")
+    summary.add_column("", justify="center")
+    for check in report.checks:
+        mark = "✅" if check.ok else "❌"
+        summary.add_row(check.name, str(check.claimed), str(check.computed), mark)
+    out = Console()
+    out.print(summary)
+    if not report.ok:
+        out.print("[bold red]FAIL[/bold red] — discrepancies found above.")
+        raise typer.Exit(1)
+    out.print("[bold green]PASS[/bold green] — submission is consistent with its reported results.")
+
+
+@app.command()
+def register(
+    submission_dir: Path = typer.Argument(..., help="A packaged submission directory (contains submission.yaml)."),
+    registry: str = typer.Option(
+        "", "--registry", help="Registry repo to PR against (default: ProgramBench/submissions)."
+    ),
+    source: str = typer.Option(
+        "", "--source", help="Public URL of this submission's repo (default: autodetected from its git remote)."
+    ),
+    commit: str = typer.Option(
+        "", "--commit", help="Commit SHA that was scored (default: autodetected from its git HEAD)."
+    ),
+    dry_run: bool = typer.Option(
+        False, "--dry-run", help="Build the registry entry locally and print the plan; touch no network."
+    ),
+    verify: bool = typer.Option(
+        True, "--verify/--no-verify", help="Run a Tier-0 verify gate before registering (default: on)."
+    ),
+) -> None:
+    """Register a packaged submission on the leaderboard by opening a PR to the registry.
+
+    The PR adds a small submissions/<id>/ entry: a pointer.yaml (the submission repo URL +
+    the exact commit scored) plus the submission.yaml and _stats/ copied from this run. The
+    source URL and commit are read from the run directory's own git remote/HEAD. With `gh`
+    installed the registry is forked and the PR opened for you; otherwise the entry is left
+    committed on a branch and the steps to push + open the PR are printed.
+
+    \b
+    Examples:
+        programbench submit register ./my-run --dry-run
+        programbench submit register ./my-run
+    """
+    import tempfile
+
+    from rich.console import Console
+
+    from programbench.register import REGISTRY_DEFAULT, build_plan, register_submission, write_entry
+
+    console = Console()
+    registry = registry or REGISTRY_DEFAULT
+
+    if verify:  # the boolean option; inside this function it shadows the module-level `verify` command
+        from programbench.verify import verify_tier0
+
+        if not verify_tier0(submission_dir).ok:
+            console.print(
+                "[bold red]FAIL[/bold red] — Tier-0 verification failed; fix the submission (or pass "
+                "--no-verify) before registering. Run `programbench submit verify .` to see the mismatch."
+            )
+            raise typer.Exit(1)
+
+    plan = build_plan(submission_dir, registry)  # autodetects source/commit from the directory's git remote + HEAD
+    if source:
+        plan.source = source  # NOTE(review): plan.pointer and plan.body were already rendered by build_plan with the autodetected value and are not re-rendered here
+    if commit:
+        plan.commit = commit  # NOTE(review): same as --source — only the attribute changes, not the rendered pointer.yaml / PR body
+
+    if dry_run:
+        with tempfile.TemporaryDirectory() as tmp:  # the entry is written to a throwaway dir only to list its files
+            entry = write_entry(plan, submission_dir, Path(tmp))
+            files = sorted(str(p.relative_to(entry)) for p in entry.rglob("*") if p.is_file())
+        console.print(f"[bold]Would register[/bold] [cyan]{plan.submission_id}[/cyan] to {plan.registry}")
+        console.print(f"  branch: {plan.branch}")
+        console.print(f"  source: {plan.source}\n  commit: {plan.commit}")
+        console.print("  files:  " + ", ".join(f"submissions/{plan.submission_id}/{f}" for f in files))
+        console.print(f"\n[dim]pointer.yaml:[/dim]\n{plan.pointer.rstrip()}")
+        console.print(f"\n[dim]PR title:[/dim] {plan.title}\n[dim]PR body:[/dim]\n{plan.body}")
+        console.print("\n[dim]Dry run — nothing cloned, pushed, or opened. Drop --dry-run to register.[/dim]")
+        return
+
+    result = register_submission(submission_dir, registry)  # NOTE(review): builds its own plan via build_plan, so the --source/--commit overrides applied to `plan` above do not reach the registered entry
+    if result.pr_url:
+        console.print(f"[bold green]Opened PR[/bold green] for {plan.submission_id}: {result.pr_url}")
+    else:
+        console.print(f"[bold]Prepared[/bold] registry entry for {plan.submission_id}.\n{result.next_steps}")
+
+
+@app.command()
+def recombine(
+    run_dir: Path = typer.Argument(..., help="A packaged run/submission directory."),
+) -> None:
+    """Reverse `package`'s eval split: fold each <iid>.eval.log.json back into its
+    eval.json, restoring the original full eval output.
+
+    The heavy file is read locally, or downloaded from its .url if it was uploaded to HF.
+
+    \b
+    Examples:
+        programbench submit recombine ./their-submission
+    """
+    from rich.console import Console
+
+    from programbench.submission import recombine_eval_json
+
+    recombined = sum(recombine_eval_json(d, d.name) for d in sorted(filter(Path.is_dir, run_dir.iterdir())))
+    Console().print(f"Recombined [bold]{recombined}[/bold] eval.json file(s) in {run_dir}")
diff --git a/src/programbench/data/templates/README.md.j2 b/src/programbench/data/templates/README.md.j2
new file mode 100644
index 0000000..9e6d1bb
--- /dev/null
+++ b/src/programbench/data/templates/README.md.j2
@@ -0,0 +1,78 @@
+<p align="center">
+  <a href="https://programbench.com"><img src="https://programbench.com/static/images/fox_hero_200.png" width="110" alt="ProgramBench"></a>
+</p>
+
+> A submission to the **[ProgramBench](https://programbench.com)** leaderboard — *can language models rebuild programs from scratch?*  ·  [Leaderboard](https://programbench.com)  ·  [How to submit](https://programbench.com/blog/submission-guide)
+
+# [Submission Name Here]
+
+<!-- Manifest, scores, and per-test results live in `submission.yaml` and `_stats/`. This file
+is for the things the manifest can't capture — please fill in the sections below. -->
+
+## System overview
+
+<!-- One or two paragraphs: what is your system and how does it work end to end? Cover
+     the model (exact id/version and key settings like temperature / reasoning effort),
+     the agent/scaffold (framework + version, prompting, tools, step limits), and your
+     test-time strategy (single attempt, best-of-N, iterative test/fix, ...). -->
+
+## Reproducing this run
+
+<!-- The exact commands to reproduce this submission, ideally runnable as-is. -->
+
+```bash
+# 1. install the agent / dependencies
+# 2. run inference per task (no internet, per the eval protocol)
+# 3. programbench eval <run-dir>
+# 4. programbench submit package <run-dir> --upload-to <org>
+```
+
+## Extra stats (optional)
+
+The leaderboard can show stats beyond `score` — e.g. cost or model calls. These are
+**optional**, and each must be **computed by a script that reads your trajectories**, not
+entered by hand: the number has to be recoverable from the run. `programbench` ships no
+calculators (it makes no assumptions about your scaffold) — write your own that reads each
+`traj.json` and emits a flat `{instance_id: value}` map to `_stats/<name>.json`, and ship
+the script here (e.g. under `_scripts/`) so the numbers are reproducible.
+
+## Links
+
+<!-- Optional: agent/scaffold code, model card, paper, blog post. -->
+
+## Submission checklist
+
+- [ ] Ran `programbench eval` → `programbench submit package` to produce this submission
+- [ ] Filled in every `submission.yaml` field (no `TODO` left), including `is_os_model` / `is_os_scaffold`
+- [ ] Trajectories (`traj.json`) included for every task (agent submissions)
+- [ ] Solutions present — inline `submission.tar.gz`, or a hosted `submission.tar.gz.url` + `.sha256`
+- [ ] Any extra stats (cost/calls) were produced by a trajectory-reading script shipped here, not hand-written
+- [ ] Filled in the System overview and Reproducing sections above
+- [ ] `programbench submit verify .` passes
+- [ ] Made this fork public
+- [ ] Opened a registration PR to the submissions repo
+
+## Integrity attestations
+
+- [ ] Solutions were produced **only** from behavioral observation of the binary and its
+      bundled docs — no source code, repositories, mirrors, or package registries were consulted
+- [ ] The model was not given internet access during evaluation
+- [ ] The model did not have access to any unit tests during evaluation
+- [ ] I consent to re-evaluation, and to flagging or removal if it contradicts the reported results
+
+## Auditing
+
+Anyone can independently check this submission with the following instructions:
+
+```bash
+git clone <your-submission-repo>
+cd {{ submission_id }}
+uvx programbench submit verify .          # Tier-0: recompute the score from this repo's eval.json and check it matches submission.yaml (instant, offline)
+uvx programbench submit verify . --tier1  # Tier-1: download each submission.tar.gz from HuggingFace, re-run evaluation, and confirm it reproduces the score (Docker)
+```
+
+* Tier-0 is self-contained. It reads the per-instance `eval.json` here plus the bundled test
+metadata.
+* Tier-1 additionally fetches the hosted solutions and the hidden tests and re-runs
+them, so the reported `score` is reproduced from scratch. (Cost/calls are self-reported from
+the trajectories; only `score` is independently re-verifiable.)
diff --git a/src/programbench/data/templates/submission.yaml.j2 b/src/programbench/data/templates/submission.yaml.j2
new file mode 100644
index 0000000..061353d
--- /dev/null
+++ b/src/programbench/data/templates/submission.yaml.j2
@@ -0,0 +1,28 @@
+# Generated by `programbench submit package` from: {{ run_dir }}
+# [auto] fields are recomputed on every `package`; all other fields are preserved.
+schema_version: 1
+
+submission_id: {{ submission_id | tojson }}
+submitter:
+  name: {{ submitter_name | tojson }}
+  contact: {{ submitter_contact | tojson }}    # email or @github
+  affiliation: {{ affiliation | tojson }}
+
+system:
+  agent: {{ agent | tojson }}                   # scaffold/harness; "none" for a pure human submission
+  description_url: {{ description_url | tojson }}
+  is_os_model: {{ is_os_model | tojson }}       # true if the model's weights are openly available
+  is_os_scaffold: {{ is_os_scaffold | tojson }} # true if the agent/scaffold is open source
+  model: {{ model | tojson }}                   # display name used on the leaderboard
+  provider: {{ provider | tojson }}
+  type: {{ system_type | tojson }}              # single-agent | multi-agent | other
+
+eval:
+  programbench_version: {{ programbench_version | tojson }}   # [auto]
+
+headline:                # [auto] score summary from evaluation; other stats live in _stats/
+  mean_score: {{ mean_score }}
+  resolved_pct: {{ resolved_pct }}
+  near_resolved_pct: {{ near_resolved_pct }}
+  n_instances_attempted: {{ n_attempted }}
+  n_instances_total: {{ n_total }}
diff --git a/src/programbench/package.py b/src/programbench/package.py
new file mode 100644
index 0000000..a86bcea
--- /dev/null
+++ b/src/programbench/package.py
@@ -0,0 +1,216 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Turn a ``programbench eval`` run directory into a leaderboard submission, in place.
+
+Packaging is purely eval-derived. It writes:
+
+- ``_stats/score.json`` — per-instance, per-test pass/fail (the one stat from evaluation),
+- ``submission.yaml`` — the manifest, with ``[auto]`` score fields recomputed and any
+  author-entered fields preserved across re-packaging,
+
+and splits each ``<iid>.eval.json`` into a light eval.json + a heavy ``<iid>.eval.log.json``
+(the raw log + failure text) so the run repo stays git-pushable; the two recombine to the
+original via ``programbench submit recombine``. With ``--upload-to`` the heavy files and the
+``submission.tar.gz`` artifacts go to a HuggingFace dataset (replaced by ``.url`` + ``.sha256``).
+
+Other stats (cost, calls, …) are optional and come from the agent trajectories via scripts
+the submitter writes — this command produces none of them, and makes no assumptions about
+the scaffold. The run directory stays a valid input to ``programbench eval``.
+"""
+
+import logging
+import os
+import shutil
+import tempfile
+from dataclasses import dataclass
+from importlib.metadata import version
+from pathlib import Path
+
+import yaml
+from jinja2 import Environment, PackageLoader
+
+from programbench.submission import (
+    Headline,
+    aggregate,
+    benchmark_instances,
+    score_from_tests,
+    sha256_file,
+    split_eval_json,
+    test_results_map,
+    write_stat,
+)
+
+log = logging.getLogger(__name__)
+
+TODO = "TODO"
+
+# Author-entered manifest fields preserved across re-packaging: template var -> (path, default).
+_CARRIED = {
+    "affiliation": ("submitter.affiliation", ""),
+    "agent": ("system.agent", TODO),
+    "description_url": ("system.description_url", "README.md"),
+    "is_os_model": ("system.is_os_model", False),
+    "is_os_scaffold": ("system.is_os_scaffold", False),
+    "model": ("system.model", TODO),
+    "provider": ("system.provider", TODO),
+    "submitter_contact": ("submitter.contact", TODO),
+    "submitter_name": ("submitter.name", TODO),
+    "system_type": ("system.type", "single-agent"),
+}
+
+
+@dataclass
+class PackageResult:
+    run_dir: Path  # the run directory that was packaged (as passed to package_run)
+    packaged: list[str]  # ids of the instances that were packaged
+    skipped: list[str]  # ids of directories skipped because they are not known benchmark instances
+    headline: Headline  # aggregate score summary, the same values rendered into submission.yaml
+
+
+def _dig(d: dict, dotted: str):
+    """Follow a dotted key path through nested dicts; ``None`` once a level is missing or is not a dict."""
+    node = d
+    for part in dotted.split("."):
+        node = node.get(part) if isinstance(node, dict) else None
+    return node
+
+
+def _carried_values(run_dir: Path) -> dict:
+    """Author-entered manifest values from an existing submission.yaml, each falling back to its default."""
+    manifest_path = run_dir / "submission.yaml"
+    previous = yaml.safe_load(manifest_path.read_text()) if manifest_path.exists() else {}
+    # Test against None rather than truthiness: a stored False / "" must survive re-packaging.
+    found = {var: _dig(previous, path) for var, (path, _) in _CARRIED.items()}
+    return {var: (default if found[var] is None else found[var]) for var, (_, default) in _CARRIED.items()}
+
+
+def _upload_artifacts(
+    api, dataset: str, pending: list[tuple[Path, str, str]], existing: set[str], overwrite: bool
+) -> None:
+    """Upload all pending files to HF, then replace each with a .url + .sha256 and delete it.
+
+    ``pending`` is (instance_dir, instance_id, filename) — submission.tar.gz and the heavy
+    <iid>.eval.log.json. Files already on HF are skipped unless ``overwrite``. Uses
+    ``upload_large_folder`` (resumable, multi-commit, retrying) since logs can be hundreds
+    of MB and a single big commit is fragile; files are hard-linked into a staging tree so
+    nothing is copied.
+    """
+    for instance_dir, iid, fname in pending:
+        (instance_dir / f"{fname}.sha256").write_text(sha256_file(instance_dir / fname) + "\n")  # written for every pending file, uploaded now or not
+    to_upload = [(d, iid, f) for d, iid, f in pending if overwrite or f"{iid}/{f}" not in existing]  # `existing` holds "<iid>/<filename>" repo paths
+    if to_upload:
+        run_dir = pending[0][0].parent  # instance dirs sit directly under the run dir
+        with tempfile.TemporaryDirectory(dir=run_dir) as tmp:  # presumably staged inside the run dir so the hard links below stay on one filesystem
+            staging = Path(tmp)
+            for instance_dir, iid, fname in to_upload:
+                dst = staging / iid / fname
+                dst.parent.mkdir(parents=True, exist_ok=True)
+                try:
+                    os.link(instance_dir / fname, dst)  # same-fs hardlink: no copy
+                except OSError:
+                    shutil.copy2(instance_dir / fname, dst)  # fall back to a real copy when linking fails
+            log.info("Uploading %d file(s) to %s (resumable)", len(to_upload), dataset)
+            api.upload_large_folder(repo_id=dataset, folder_path=str(staging), repo_type="dataset")
+    for instance_dir, iid, fname in pending:  # reached only if the upload above did not raise
+        (instance_dir / f"{fname}.url").write_text(
+            f"https://huggingface.co/datasets/{dataset}/resolve/main/{iid}/{fname}\n"
+        )
+        (instance_dir / fname).unlink()  # the local file is removed; its .url + .sha256 stand in for it
+
+
+def package_run(run_dir: Path, upload_to: str | None = None, overwrite: bool = False) -> PackageResult:
+    """Package ``run_dir`` in place (score.json, submission.yaml, README.md, eval split, optional HF upload); ``ValueError`` if nothing is packageable."""
+    instances = benchmark_instances()
+    run_name = run_dir.resolve().name
+
+    api = dataset = None
+    existing: set[str] = set()
+    if upload_to:
+        # Each submission gets its own dataset: bare "org" -> "org/<run-name>"; an explicit "org/name" is used as-is.
+        dataset = upload_to if "/" in upload_to else f"{upload_to}/{run_name}"
+        from huggingface_hub import HfApi
+
+        api = HfApi()
+        api.create_repo(dataset, repo_type="dataset", exist_ok=True)
+        # Force public so `verify`/`recombine` can fetch the artifacts anonymously
+        # (orgs may default new datasets to private).
+        api.update_repo_settings(dataset, repo_type="dataset", private=False)
+        existing = set(api.list_repo_files(dataset, repo_type="dataset"))
+
+    test_maps: dict[str, dict[str, bool]] = {}
+    packaged: list[str] = []
+    skipped: list[str] = []
+    pending: list[tuple[Path, str, str]] = []
+    for instance_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
+        iid = instance_dir.name
+        eval_json = instance_dir / f"{iid}.eval.json"
+        has_solution = (instance_dir / "submission.tar.gz").exists() or (
+            instance_dir / "submission.tar.gz.url"
+        ).exists()
+        if not (eval_json.exists() and has_solution):
+            continue
+        if iid not in instances:
+            log.warning("Skipping %s (not a known ProgramBench instance)", iid)
+            skipped.append(iid)
+            continue
+        test_maps[iid] = test_results_map(eval_json, instances[iid])
+        # Split the (potentially huge) eval.json into a light eval.json + a heavy
+        # <iid>.eval.log.json (log + failure text); they recombine to the original.
+        split_eval_json(instance_dir, iid)
+        if api:
+            for fname in (f"{iid}.eval.log.json", "submission.tar.gz"):
+                if (instance_dir / fname).exists():
+                    pending.append((instance_dir, iid, fname))
+        packaged.append(iid)
+
+    if not packaged:
+        raise ValueError(f"No packageable instances found under {run_dir}")
+
+    # Write the scoring-derived artifacts first; they don't depend on the upload, so a
+    # failed/throttled upload leaves them correct and the run simply resumable.
+    # score.json is per-test ({iid: {test: passed}}) so scores can be recomputed later
+    # while striking out specific tests; the manifest headline is the score with no
+    # tests struck.
+    write_stat(run_dir, "score", test_maps)
+    scores = {iid: score_from_tests(m) for iid, m in test_maps.items()}
+    headline = aggregate(scores, len(instances))
+
+    carried = _carried_values(run_dir)
+    env = Environment(loader=PackageLoader("programbench", "data/templates"), autoescape=False)
+    (run_dir / "submission.yaml").write_text(
+        env.get_template("submission.yaml.j2").render(
+            run_dir=run_dir,
+            submission_id=run_dir.resolve().name,
+            programbench_version=version("programbench"),
+            mean_score=headline.mean_score,
+            resolved_pct=headline.resolved_pct,
+            near_resolved_pct=headline.near_resolved_pct,
+            n_attempted=headline.n_instances_attempted,
+            n_total=headline.n_instances_total,
+            **carried,
+        )
+        + "\n"
+    )
+
+    # README is created once (a starting point for the author); never overwritten.
+    readme = run_dir / "README.md"
+    if not readme.exists():
+        readme.write_text(
+            env.get_template("README.md.j2").render(
+                submission_id=run_dir.resolve().name,
+                mean_pct=round(headline.mean_score * 100, 1),
+                resolved_pct=headline.resolved_pct,
+                n_attempted=headline.n_instances_attempted,
+                n_total=headline.n_instances_total,
+                **carried,
+            )
+        )
+
+    if api and pending:
+        _upload_artifacts(api, dataset, pending, existing, overwrite)
+
+    return PackageResult(run_dir, packaged, skipped, headline)
diff --git a/src/programbench/register.py b/src/programbench/register.py
new file mode 100644
index 0000000..b5a4cb2
--- /dev/null
+++ b/src/programbench/register.py
@@ -0,0 +1,157 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Register a packaged submission into the leaderboard registry by opening a PR.
+
+A registry entry is small and self-contained: a pointer to the submission's own public
+repo, plus the manifest and stat files copied out of it.
+
+    submissions/<id>/
+      pointer.yaml      # source repo URL + the exact commit that was scored
+      submission.yaml   # copied from the submission
+      _stats/*.json     # copied from the submission
+
+This builds that entry against a clone of the registry (default
+github.com/ProgramBench/submissions) and opens the PR. With ``gh`` it forks the registry
+and opens the PR for you; without it, it leaves the commit on a branch in a clone and
+prints the compare URL so you can open the PR by hand.
+"""
+
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+import yaml
+
+REGISTRY_DEFAULT = "https://github.com/ProgramBench/submissions"
+
+
+def _git(cwd: Path, *args: str) -> str:  # run `git <args>` in cwd; stripped stdout, CalledProcessError on a non-zero exit
+    return subprocess.check_output(["git", *args], cwd=cwd, stderr=subprocess.PIPE, text=True).strip()
+
+
+def _to_https(url: str) -> str:
+    """A git remote (``git@host:owner/repo.git`` or ``https://…``) as a browsable https URL, without a trailing ``/`` or ``.git``."""
+    url = url.strip().rstrip("/").removesuffix(".git")  # drop whitespace and trailing slashes first so ".git" really is the suffix
+    if url.startswith("git@") and ":" in url:  # scp-style remote; without a ":" there is no host/path to split
+        host, path = url[4:].split(":", 1)
+        return f"https://{host}/{path}"
+    return url
+
+
+def _slug(registry: str) -> str:
+    """``https://github.com/Owner/Repo`` -> ``Owner/Repo`` (what ``gh`` expects); a non-GitHub URL only gets the ``_to_https`` normalization."""
+    return _to_https(registry).removeprefix("https://github.com/")
+
+
+@dataclass
+class RegisterPlan:
+    submission_id: str  # name of the (resolved) submission directory
+    source: str  # https URL of the submission repo; build_plan derives it from the `origin` remote
+    commit: str  # commit SHA that was scored; build_plan reads it from HEAD
+    registry: str  # registry repo the entry is registered against
+    branch: str  # branch created in the registry clone: "add-<submission_id>"
+    pointer: str  # rendered pointer.yaml
+    files: list[str]  # entry-relative paths that will be added
+    title: str  # used as both the commit message and the PR title
+    body: str  # PR description (model/agent, headline numbers, source + commit)
+
+
+@dataclass
+class RegisterResult:
+    plan: RegisterPlan  # the plan register_submission built and acted on
+    pr_url: str | None  # set when a PR was opened (gh path)
+    next_steps: str | None  # set when manual steps remain (no-gh path)
+
+
+def build_plan(submission_dir: Path, registry: str) -> RegisterPlan:
+    """Gather the registry entry for ``submission_dir``: git source + commit, pointer.yaml, file list and PR text."""
+    name = submission_dir.resolve().name
+    manifest = yaml.safe_load((submission_dir / "submission.yaml").read_text())
+    origin = _to_https(_git(submission_dir, "remote", "get-url", "origin"))
+    head_sha = _git(submission_dir, "rev-parse", "HEAD")
+    pointer_yaml = yaml.safe_dump({"submission_id": name, "source": origin, "commit": head_sha}, sort_keys=False)
+    stat_files = [f"_stats/{p.name}" for p in sorted((submission_dir / "_stats").glob("*.json"))]
+    system, head = manifest["system"], manifest["headline"]
+    pr_body = (
+        f"Registers **{system['model']}** ({system['provider']}) + {system['agent']}.\n\n"
+        f"- mean score: {head['mean_score'] * 100:.1f}\n"
+        f"- resolved: {head['resolved_pct']:.1f}% / near-resolved: {head['near_resolved_pct']:.1f}%\n"
+        f"- instances: {head['n_instances_attempted']}/{head['n_instances_total']}\n\n"
+        f"Source: {origin}\nCommit: `{head_sha}`\n\n"
+        "Tier-0 verified (`programbench submit verify .`)."
+    )
+    return RegisterPlan(
+        submission_id=name, source=origin, commit=head_sha, registry=registry, branch=f"add-{name}",
+        pointer=pointer_yaml, files=["pointer.yaml", "submission.yaml", *stat_files], title=f"Add submission: {name}", body=pr_body,
+    )
+
+
+def write_entry(plan: RegisterPlan, submission_dir: Path, registry_root: Path) -> Path:
+    """Build a fresh ``submissions/<id>/`` under ``registry_root`` (an existing entry is removed first) and return its path."""
+    entry = registry_root.joinpath("submissions", plan.submission_id)
+    if entry.exists():
+        shutil.rmtree(entry)
+    entry.joinpath("_stats").mkdir(parents=True)
+    entry.joinpath("pointer.yaml").write_text(plan.pointer)
+    shutil.copyfile(submission_dir / "submission.yaml", entry / "submission.yaml")
+    for stat in sorted(submission_dir.joinpath("_stats").glob("*.json")):
+        shutil.copyfile(stat, entry / "_stats" / stat.name)
+    return entry
+
+
+def register_submission(submission_dir: Path, registry: str) -> RegisterResult:
+    """Clone the registry, commit the entry on a branch, and open the PR.
+
+    Uses ``gh`` (fork + PR) when available, cleaning up its throwaway clone afterward.
+    Without ``gh`` it leaves the commit on a branch in a kept clone and returns the manual
+    push + compare-URL steps in ``next_steps`` (so the clone must outlive this call).
+    In both modes the temporary clone is removed if any step fails, so a failed run
+    leaves no stray checkout behind.
+
+    Raises ``subprocess.CalledProcessError`` if a ``git`` or ``gh`` command exits non-zero
+    (and whatever ``build_plan`` raises for a missing manifest or git remote).
+    """
+    plan = build_plan(submission_dir, registry)
+    slug = _slug(registry)
+    clone = Path(tempfile.mkdtemp(prefix="programbench-register-")) / "submissions"
+    keep_clone = False  # only the successful no-gh path hands the clone over to the user
+    try:
+        if shutil.which("gh"):
+            # Fork the registry under the authed user (no-op if it exists) and clone the fork;
+            # origin -> fork, upstream -> registry.
+            fork_cmd = ["gh", "repo", "fork", slug, "--clone", "--default-branch-only", str(clone)]
+            subprocess.run(fork_cmd, check=True, capture_output=True, text=True)
+            _git(clone, "checkout", "-b", plan.branch)
+            write_entry(plan, submission_dir, clone)
+            _git(clone, "add", f"submissions/{plan.submission_id}")
+            _git(clone, "commit", "-m", plan.title)
+            _git(clone, "push", "-u", "origin", plan.branch)
+            pr_cmd = ["gh", "pr", "create", "--repo", slug, "--title", plan.title, "--body", plan.body]
+            pr_url = subprocess.run(pr_cmd, cwd=clone, check=True, capture_output=True, text=True).stdout.strip()
+            return RegisterResult(plan, pr_url, None)
+
+        # No gh: clone the registry directly, commit the branch, and hand back the steps.
+        _git(clone.parent, "clone", "--depth", "1", _to_https(registry), str(clone))
+        _git(clone, "checkout", "-b", plan.branch)
+        write_entry(plan, submission_dir, clone)
+        _git(clone, "add", f"submissions/{plan.submission_id}")
+        _git(clone, "commit", "-m", plan.title)
+        steps = (
+            "`gh` not found, so the PR was not opened. The entry is committed on branch "
+            f"`{plan.branch}` in:\n  {clone}\n\n"
+            "To finish, from that clone push the branch to your fork of the registry and open a PR:\n"
+            "  git remote add fork https://github.com/<you>/submissions\n"
+            f"  git push -u fork {plan.branch}\n"
+            f"  {_to_https(registry)}/compare/main...<you>:{plan.branch}?expand=1"
+        )
+        keep_clone = True
+        return RegisterResult(plan, None, steps)
+    finally:
+        if not keep_clone:
+            shutil.rmtree(clone.parent, ignore_errors=True)
diff --git a/src/programbench/submission.py b/src/programbench/submission.py
new file mode 100644
index 0000000..71b8b34
--- /dev/null
+++ b/src/programbench/submission.py
@@ -0,0 +1,227 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Shared helpers for building (`package`) and checking (`verify`) submissions.
+
+Both commands must score a run directory the same way, so the scoring and headline
+aggregation live here and are imported by each command.
+"""
+
+import hashlib
+import json
+import logging
+import shutil
+import subprocess
+import tarfile
+import tempfile
+import urllib.request
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+import yaml
+
+from programbench.eval.eval import EvaluationResult
+from programbench.utils.load_data import get_active_branches, get_ignored_tests, load_all_instances
+
+log = logging.getLogger(__name__)
+
+# Score thresholds applied by `aggregate`: an instance counts as resolved when its score
+# is >= RESOLVED_THRESHOLD, and as near-resolved when it is >= NEAR_RESOLVED_THRESHOLD.
+RESOLVED_THRESHOLD = 1.0
+NEAR_RESOLVED_THRESHOLD = 0.95
+# Instance ids starting with this prefix are treated as the bundled test fixture and are
+# left out of `benchmark_instances`.
+FIXTURE_PREFIX = "testorg__"
+
+
+def benchmark_instances() -> dict[str, dict]:
+    """Return every real benchmark instance, indexed by its ``instance_id``.
+
+    Instances whose id starts with ``FIXTURE_PREFIX`` (the bundled test fixture) are skipped.
+    """
+    by_id: dict[str, dict] = {}
+    for instance in load_all_instances():
+        iid = instance["instance_id"]
+        if iid.startswith(FIXTURE_PREFIX):
+            continue
+        by_id[iid] = instance
+    return by_id
+
+
+def sha256_file(path: Path) -> str:
+    """Return the hex-encoded SHA-256 digest of the file at ``path``.
+
+    The file is read in 1 MiB blocks so it is never held in memory all at once.
+    """
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        while block := stream.read(1 << 20):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def test_results_map(eval_json: Path, instance: dict) -> dict[str, bool]:
+    """Load one instance's eval.json and return its per-test outcomes.
+
+    The result is first narrowed to the instance's active branches and stripped of its
+    ignored tests (the same filtering ``info`` applies). Keys are each test's ``full_name``
+    (``"<branch>/<test_name>"``); a value is ``True`` iff that test passed.
+
+    Scores are derived from this mapping, so the leaderboard can later recompute them while
+    striking out specific tests (see the registry's ignore map).
+    """
+    parsed = EvaluationResult.model_validate_json(eval_json.read_text())
+    on_active_branches = parsed.for_branches(get_active_branches(instance))
+    filtered = on_active_branches.without_ignored(get_ignored_tests(instance))
+    outcomes: dict[str, bool] = {}
+    for test in filtered.test_results:
+        outcomes[test.full_name] = test.is_resolved
+    return outcomes
+
+
+def score_from_tests(tests: dict[str, bool], ignore: set[str] = frozenset()) -> float:
+    """Return the pass rate over the tests whose names are not in ``ignore``.
+
+    Yields ``0.0`` when no test is left once the ignore set has been applied.
+    """
+    n_kept = 0
+    n_passed = 0
+    for test_name, ok in tests.items():
+        if test_name in ignore:
+            continue
+        n_kept += 1
+        n_passed += ok
+    return n_passed / n_kept if n_kept else 0.0
+
+
+def score_instance(eval_json: Path, instance: dict) -> float:
+    """Score one instance from its eval.json, using the branch/ignored-test filtering of `info`."""
+    per_test = test_results_map(eval_json, instance)
+    return score_from_tests(per_test)
+
+
+def score_run(run_dir: Path, instances: dict[str, dict]) -> dict[str, float]:
+    """Score every known instance that has an ``<iid>/<iid>.eval.json`` under ``run_dir``.
+
+    Sub-directories are visited in sorted order; those not named after a key of ``instances``,
+    or lacking their eval.json, are skipped. Returns ``{instance_id: score}``.
+    """
+    subdirs = [entry for entry in run_dir.iterdir() if entry.is_dir()]
+    subdirs.sort()
+    by_instance: dict[str, float] = {}
+    for subdir in subdirs:
+        name = subdir.name
+        report = subdir / f"{name}.eval.json"
+        if not report.exists() or name not in instances:
+            continue
+        by_instance[name] = score_instance(report, instances[name])
+    return by_instance
+
+
+def write_stat(run_dir: Path, stat: str, by_instance: dict[str, float]) -> None:
+    """Dump one per-instance stat as JSON to ``<run_dir>/_stats/<stat>.json``.
+
+    The file holds ``{iid: value}`` with sorted keys; ``_stats`` is created if missing.
+    """
+    stats_dir = run_dir / "_stats"
+    stats_dir.mkdir(exist_ok=True)
+    payload = json.dumps(by_instance, indent=2, sort_keys=True)
+    (stats_dir / f"{stat}.json").write_text(payload)
+
+
+# Keys of a test result's ``extra`` dict that `split_eval_json` moves out of eval.json
+# into the heavy ``<iid>.eval.log.json``.
+_HEAVY_EXTRA_KEYS = ("message", "text")
+
+
+def _full_name(t: dict) -> str:
+    """Return ``"<branch>/<name>"`` for a raw test-result dict, or bare ``name`` when
+    ``branch`` is missing or empty."""
+    branch = t.get("branch")
+    if not branch:
+        return t["name"]
+    return f"{branch}/{t['name']}"
+
+
+def split_eval_json(instance_dir: Path, iid: str) -> None:
+    """Split ``<iid>.eval.json`` into a light eval.json + a heavy ``<iid>.eval.log.json``.
+
+    The heavy file receives the bulky parts: the top-level ``log`` and, per test result,
+    any ``message``/``text`` entries of its ``extra`` dict, keyed by the test's full name.
+    The light file keeps everything else, with ``log`` reset to ``[]``.
+
+    Idempotent: when eval.json has no non-empty ``log`` and no heavy ``extra`` keys (already
+    split, or genuinely light) nothing is written, so an existing eval.log.json is never
+    clobbered.
+
+    The heavy file is written before eval.json is trimmed. If the process dies between
+    the two writes, eval.json is still complete and a re-run redoes the split; trimming
+    first would leave the heavy data in neither file.
+    """
+    light_path = instance_dir / f"{iid}.eval.json"
+    data = json.loads(light_path.read_text())
+    test_results = data.get("test_results", [])
+    has_heavy = bool(data.get("log")) or any(
+        key in (t.get("extra") or {}) for t in test_results for key in _HEAVY_EXTRA_KEYS
+    )
+    if not has_heavy:
+        return
+    heavy: dict = {"log": data.get("log") or [], "failures": {}}
+    for t in test_results:
+        extra = t.get("extra") or {}
+        moved = {key: extra.pop(key) for key in _HEAVY_EXTRA_KEYS if key in extra}
+        if moved:
+            heavy["failures"][_full_name(t)] = moved
+    data["log"] = []
+    # Heavy first: the only copy of the bulky data must reach disk before it is dropped
+    # from eval.json.
+    (instance_dir / f"{iid}.eval.log.json").write_text(json.dumps(heavy))
+    light_path.write_text(json.dumps(data, indent=2))
+
+
+def recombine_eval_json(instance_dir: Path, iid: str) -> bool:
+    """Inverse of :func:`split_eval_json`: fold the heavy file back into ``<iid>.eval.json``,
+    then remove the heavy file and its ``.url``/``.sha256`` sidecars.
+
+    The heavy payload is read from ``<iid>.eval.log.json`` when that exists locally, else
+    downloaded from the URL stored in ``<iid>.eval.log.json.url``. A download is checked
+    against ``<iid>.eval.log.json.sha256`` when that sidecar is present, mirroring what
+    :func:`resolve_submission_tar` does for hosted tarballs.
+
+    Returns:
+        True if a recombine happened; False if eval.json is missing or there is neither a
+        local heavy file nor a ``.url``.
+
+    Raises:
+        ValueError: the downloaded payload does not match the sha256 sidecar. Nothing is
+            modified or deleted in that case.
+    """
+    light = instance_dir / f"{iid}.eval.json"
+    log_file = instance_dir / f"{iid}.eval.log.json"
+    url_file = instance_dir / f"{iid}.eval.log.json.url"
+    sha_file = instance_dir / f"{iid}.eval.log.json.sha256"
+    if not light.exists():
+        return False
+    if log_file.exists():
+        heavy = json.loads(log_file.read_text())
+    elif url_file.exists():
+        with urllib.request.urlopen(url_file.read_text().strip()) as r:  # noqa: S310
+            payload = r.read()
+        # NOTE(review): assumes the sidecar's first whitespace-separated token is the hex
+        # digest of the hosted bytes, as for submission.tar.gz.sha256 — confirm against
+        # whatever writes it.
+        digest_tokens = sha_file.read_text().split() if sha_file.exists() else []
+        if digest_tokens:
+            expected = digest_tokens[0].lower()
+            got = hashlib.sha256(payload).hexdigest()
+            if got != expected:
+                raise ValueError(
+                    f"{iid}: eval.log.json sha256 mismatch (expected {expected[:12]}…, got {got[:12]}…)"
+                )
+        heavy = json.loads(payload)
+    else:
+        return False
+    data = json.loads(light.read_text())
+    data["log"] = heavy.get("log", [])
+    failures = heavy.get("failures", {})
+    for t in data.get("test_results", []):
+        if (name := _full_name(t)) in failures:
+            t.setdefault("extra", {}).update(failures[name])
+    light.write_text(json.dumps(data, indent=2))
+    for f in (log_file, url_file, sha_file):
+        f.unlink(missing_ok=True)
+    return True
+
+
+@dataclass
+class Headline:
+    """Headline numbers for a run, as built by :func:`aggregate`."""
+
+    mean_score: float  # mean score over the attempted (scored) instances, rounded to 4 places
+    resolved_pct: float  # % of the full benchmark with score >= RESOLVED_THRESHOLD, 1 decimal
+    near_resolved_pct: float  # % of the full benchmark with score >= NEAR_RESOLVED_THRESHOLD, 1 decimal
+    n_instances_attempted: int  # number of scored instances
+    n_instances_total: int  # benchmark size used as the denominator of the percentages
+
+    def as_dict(self) -> dict:
+        """Return the fields as a plain dict, in declaration order."""
+        return asdict(self)
+
+
+def aggregate(scores: dict[str, float], n_total: int) -> Headline:
+    """Summarize per-instance scores into a :class:`Headline`.
+
+    ``mean_score`` averages over the attempted instances only (4 decimals). The resolved
+    and near-resolved percentages divide by ``n_total`` (1 decimal), so an instance that was
+    never attempted counts as unresolved.
+
+    Raises:
+        ValueError: ``scores`` is empty.
+    """
+    if not scores:
+        raise ValueError("No scored instances found")
+    attempted = len(scores)
+    mean = sum(scores.values()) / attempted
+
+    def pct_at_least(threshold: float) -> float:
+        # Share of the *full* benchmark (not just the attempted part) at or above threshold.
+        hits = sum(1 for score in scores.values() if score >= threshold)
+        return round(100 * hits / n_total, 1)
+
+    return Headline(
+        mean_score=round(mean, 4),
+        resolved_pct=pct_at_least(RESOLVED_THRESHOLD),
+        near_resolved_pct=pct_at_least(NEAR_RESOLVED_THRESHOLD),
+        n_instances_attempted=attempted,
+        n_instances_total=n_total,
+    )
+
+
+def load_manifest(submission_dir: Path) -> dict:
+    """Parse ``<submission_dir>/submission.yaml`` and return its top-level mapping.
+
+    ``yaml.safe_load`` yields ``None`` for an empty document; that is returned as ``{}``
+    so callers can use ``.get(...)`` on the result, as the ``dict`` annotation promises.
+
+    Raises:
+        ValueError: the document is non-empty but its top level is not a mapping.
+    """
+    manifest_path = submission_dir / "submission.yaml"
+    manifest = yaml.safe_load(manifest_path.read_text())
+    if manifest is None:
+        return {}
+    if not isinstance(manifest, dict):
+        raise ValueError(f"{manifest_path}: expected a YAML mapping, got {type(manifest).__name__}")
+    return manifest
+
+
+def resolve_submission_tar(instance_dir: Path, dest_tar: Path) -> None:
+    """Materialize an instance's submission.tar.gz into ``dest_tar``, verifying sha256.
+
+    Supports the artifact forms in SPEC.md, tried in this order: inline
+    ``submission.tar.gz`` (copied), ``submission.tar.gz.url`` (downloaded), or
+    ``submission.ref.yaml`` (git checkout packed).
+
+    When a ``submission.tar.gz.sha256`` sidecar is present, its first whitespace-separated
+    token is enforced (case-insensitively) against the inline/url artifact. For the git
+    form the sidecar is ignored, since packing is not byte-reproducible.
+
+    Raises:
+        ValueError: no artifact form is present, the sidecar is empty, or the digest of
+            ``dest_tar`` does not match the sidecar.
+    """
+    inline = instance_dir / "submission.tar.gz"
+    url_file = instance_dir / "submission.tar.gz.url"
+    ref_file = instance_dir / "submission.ref.yaml"
+    sha_file = instance_dir / "submission.tar.gz.sha256"
+
+    if inline.exists():
+        shutil.copy2(inline, dest_tar)
+    elif url_file.exists():
+        urllib.request.urlretrieve(url_file.read_text().strip(), dest_tar)  # noqa: S310
+    elif ref_file.exists():
+        _pack_git_ref(yaml.safe_load(ref_file.read_text()), dest_tar)
+        return  # git packing is not byte-reproducible; rely on re-eval instead
+    else:
+        raise ValueError(f"{instance_dir.name}: no submission.tar.gz, .url, or .ref.yaml found")
+
+    if not sha_file.exists():
+        return
+    tokens = sha_file.read_text().split()
+    if not tokens:
+        # An empty sidecar cannot vouch for anything; fail with a clear message.
+        raise ValueError(f"{instance_dir.name}: submission.tar.gz.sha256 is empty")
+    expected = tokens[0].lower()  # hexdigest() is lowercase; accept upper-case sidecars too
+    if (got := sha256_file(dest_tar)) != expected:
+        raise ValueError(f"{instance_dir.name}: sha256 mismatch (expected {expected[:12]}…, got {got[:12]}…)")
+
+
+def _pack_git_ref(ref: dict, dest_tar: Path) -> None:
+    """Shallow-clone ``ref["repo"]`` at ``ref["ref"]`` and pack the checkout into ``dest_tar``.
+
+    ``ref`` is the parsed ``submission.ref.yaml``: ``repo`` and ``ref`` are required, and an
+    optional ``subpath`` selects a sub-directory of the checkout as the archive root.
+    Entries are added in sorted order with paths relative to that root; anything under a
+    top-level ``.git`` is left out.
+
+    ``ref`` comes from a submission, so it is treated as untrusted: ``--`` stops git from
+    reading ``repo`` as an option, and ``subpath`` must stay inside the checkout.
+
+    Raises:
+        ValueError: ``subpath`` resolves to a location outside the cloned checkout.
+        subprocess.CalledProcessError: ``git clone`` failed.
+    """
+    with tempfile.TemporaryDirectory() as tmp:
+        src = Path(tmp) / "src"
+        subprocess.run(
+            ["git", "clone", "--depth", "1", "--branch", ref["ref"], "--", ref["repo"], str(src)],
+            check=True,
+            capture_output=True,
+        )
+        root = src / ref["subpath"] if ref.get("subpath") else src
+        # A subpath such as "../.." or "/etc" would otherwise pack files from the host.
+        if not root.resolve().is_relative_to(src.resolve()):
+            raise ValueError(f"subpath {ref['subpath']!r} escapes the cloned repository")
+        with tarfile.open(dest_tar, "w:gz") as tar:
+            for p in sorted(root.rglob("*")):
+                rel = p.relative_to(root).as_posix()
+                if rel.split("/", 1)[0] == ".git":
+                    continue
+                tar.add(p, arcname=rel, recursive=False)
diff --git a/src/programbench/verify.py b/src/programbench/verify.py
new file mode 100644
index 0000000..3fdfd28
--- /dev/null
+++ b/src/programbench/verify.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Verify a packaged submission against its own claimed results.
+
+Tier 0 (default, no Docker): recompute the headline from the submission's own eval.json
+files (with ignored-test filtering) and check it matches submission.yaml. This is the
+free consistency check a third party or CI can run with only ``programbench`` installed.
+
+Tier 1 (--tier1, Docker): resolve each submission.tar.gz, re-run ``programbench eval``,
+and confirm the freshly produced scores match the submitted eval.json. This is what
+proves the artifacts actually yield the reported results.
+"""
+
+import logging
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+from programbench.submission import (
+    Headline,
+    aggregate,
+    benchmark_instances,
+    load_manifest,
+    resolve_submission_tar,
+    score_run,
+)
+
+log = logging.getLogger(__name__)
+
+# Maximum absolute difference at which `_close` still treats two numbers as equal. The one
+# value is applied to every compared field, whatever its scale or rounding.
+# NOTE(review): 0.011 is loose for values on a 0-1 scale (mean_score, per-instance scores)
+# — confirm this slack is intended.
+TOLERANCE = 0.011  # headline floats are rounded; allow a hair more than the last digit
+
+
+@dataclass
+class Check:
+    """One claimed-vs-computed comparison reported by a verification run."""
+
+    name: str  # what was compared: a headline field name (tier 0) or an instance id (tier 1)
+    claimed: object  # value taken from the submission
+    computed: object  # value recomputed by the verifier
+    ok: bool  # whether the two values were judged to match
+
+
+@dataclass
+class VerifyResult:
+    """Outcome of one verification run: the tier that ran and its individual checks."""
+
+    tier: int
+    checks: list[Check]
+
+    @property
+    def ok(self) -> bool:
+        """True when no check failed (vacuously true for an empty list)."""
+        for check in self.checks:
+            if not check.ok:
+                return False
+        return True
+
+
+def _close(a: object, b: object) -> bool:
+    """Return True when ``a`` and ``b`` are numbers within ``TOLERANCE`` of each other.
+
+    A missing (``None``) or non-numeric value on either side is reported as a mismatch
+    instead of raising, so one bad or absent entry fails its check without aborting the
+    whole verification. NaN never compares as close.
+    """
+    if a is None or b is None:
+        return False
+    try:
+        delta = abs(float(a) - float(b))
+    except (TypeError, ValueError):
+        return False
+    return delta <= TOLERANCE
+
+
+def _headline_checks(claimed: dict, computed: Headline) -> list[Check]:
+    """Build one :class:`Check` per computed headline field against the claimed value.
+
+    A field absent from ``claimed`` is looked up as ``None``.
+    """
+    checks: list[Check] = []
+    for field_name, value in computed.as_dict().items():
+        stated = claimed.get(field_name)
+        checks.append(Check(field_name, stated, value, _close(stated, value)))
+    return checks
+
+
+def verify_tier0(submission_dir: Path) -> VerifyResult:
+    """Tier 0: recompute the headline from the submission's eval.json files and compare it
+    field by field with the ``headline`` section of submission.yaml.
+
+    A manifest that is empty or not a mapping, or whose ``headline`` is missing, null or
+    not a mapping, yields failing checks (every claimed value is ``None``) rather than an
+    ``AttributeError``.
+    """
+    manifest = load_manifest(submission_dir)
+    instances = benchmark_instances()
+    computed = aggregate(score_run(submission_dir, instances), len(instances))
+    # `.get("headline", {})` would still hand back None for an explicit `headline: null`.
+    headline = manifest.get("headline") if isinstance(manifest, dict) else None
+    claimed = headline if isinstance(headline, dict) else {}
+    return VerifyResult(0, _headline_checks(claimed, computed))
+
+
+def verify_tier1(submission_dir: Path, *, workers: int = 1, filter_spec: str = "") -> VerifyResult:
+    """Tier 1: re-evaluate the submitted artifacts and compare fresh scores with submitted ones.
+
+    Every instance scored in ``submission_dir`` has its submission.tar.gz resolved into a
+    temporary run directory, which is then evaluated with ``run_eval_batch`` and re-scored.
+    One :class:`Check` is produced per submitted instance; when ``filter_spec`` is set,
+    instances without a fresh score are left out instead.
+
+    An instance that was expected to be re-evaluated but has no fresh score is reported as
+    a failed check with a NaN computed value.
+
+    Args:
+        submission_dir: Packaged submission directory (one sub-directory per instance).
+        workers: Forwarded to ``run_eval_batch``.
+        filter_spec: Forwarded to ``run_eval_batch``.
+    """
+    from programbench.eval.eval_batch import run_eval_batch
+
+    instances = benchmark_instances()
+    submitted = score_run(submission_dir, instances)
+
+    with tempfile.TemporaryDirectory() as tmp:
+        run = Path(tmp)
+        for iid in submitted:
+            (run / iid).mkdir(parents=True)
+            resolve_submission_tar(submission_dir / iid, run / iid / "submission.tar.gz")
+        run_eval_batch(sources=[run], workers=workers, filter_spec=filter_spec, force=True)
+        fresh = score_run(run, instances)
+
+    missing = float("nan")
+    checks = []
+    for iid, claimed in submitted.items():
+        if filter_spec and iid not in fresh:
+            continue
+        # Use NaN rather than None for a missing fresh score so the comparison is
+        # well-defined: nothing compares as close to NaN, so the check simply fails.
+        recomputed = fresh.get(iid, missing)
+        checks.append(Check(iid, round(claimed, 4), round(recomputed, 4), _close(claimed, recomputed)))
+    return VerifyResult(1, checks)