From f6737239be88aa6f6d1d8475cf024a3e02bc9453 Mon Sep 17 00:00:00 2001 From: Ignazio De Santis Date: Tue, 26 May 2026 15:26:42 +0800 Subject: [PATCH 1/3] feat(benchmark): public proof layer on the workbench engine (Plan C) Layers the Plan C public-proof surface on top of the existing local eval harness (workbench.py). The canonical engine stays the source of truth and gains durable, externally-verifiable public telemetry. - benchmark_runner.py: runs prompt_v1 vs prompt_v2 over examples/support_qa through workbench.run_evaluation / compare_runs / assess_gate, and publishes a schema-conformed artifact (api/_benchmark_latest.json + bounded history) - api/benchmark-latest.py: serves the latest run (stdlib, previous_run delta, pending envelope before first run) - api/stats.py -> mode:"live" with benchmark-derived metrics (eval runs, pass rate, distinct regressions caught over 30d); honest degraded fallback - .github/workflows/benchmark.yml: weekly + on-demand re-run that commits the refreshed artifact back and validates freshness - dashboard overview + telemetry now read Tier-A and render the latest run (variant comparison, deltas, gate), preserving the /prototype link - vercel.json: CORS + cache headers for /api/benchmark-latest Initial published run: prompt_v2 passes 4/4 vs prompt_v1 0/4 (+100% pass rate, 0 regressions, gate pass). Every value is computed from committed runs; nothing simulated or seeded. Tests: 25 unittest + 36 vitest; next build clean. The engine (workbench.py) is untouched. 
Refs: outputs/plans/PLAN_C_PROOF_FIRST.md (Phase 2) --- .github/workflows/benchmark.yml | 77 ++++ api/_benchmark_history.json | 15 + api/_benchmark_latest.json | 52 +++ api/benchmark-latest.py | 76 ++++ api/stats.py | 244 +++++------- .../archive/evalops-2026-05-26-d34c4f66.json | 52 +++ examples/benchmark/latest-report.md | 27 ++ src/app/page.tsx | 355 +++++++++++++----- src/app/telemetry/page.tsx | 93 ++--- src/evalops_workbench/benchmark_runner.py | 226 +++++++++++ src/lib/api.ts | 83 +++- tests/test_benchmark_endpoint.py | 75 ++++ tests/test_benchmark_runner.py | 36 ++ tests/test_stats.py | 214 ++++------- uv.lock | 42 +++ vercel.json | 9 + 16 files changed, 1235 insertions(+), 441 deletions(-) create mode 100644 .github/workflows/benchmark.yml create mode 100644 api/_benchmark_history.json create mode 100644 api/_benchmark_latest.json create mode 100644 api/benchmark-latest.py create mode 100644 examples/benchmark/archive/evalops-2026-05-26-d34c4f66.json create mode 100644 examples/benchmark/latest-report.md create mode 100644 src/evalops_workbench/benchmark_runner.py create mode 100644 tests/test_benchmark_endpoint.py create mode 100644 tests/test_benchmark_runner.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..9326774 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,77 @@ +name: Benchmark + +# The benchmark is deterministic and reproducible. This workflow re-verifies it +# weekly (and on demand), refreshes the published artifact, and commits the +# result back to the repo, where /api/benchmark-latest and /api/stats serve it. +# It is reproducibility verification, not synthetic daily activity. 
+ +on: + schedule: + - cron: "0 6 * * 1" # Mondays 06:00 UTC + workflow_dispatch: + +permissions: + contents: write + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install package + run: pip install -e . + + - name: Run benchmark + run: python -m evalops_workbench.benchmark_runner + + - name: Validate published artifact + run: | + python - <<'PY' + import json + from datetime import datetime, timezone + + artifact = json.load(open("api/_benchmark_latest.json")) + assert artifact["system"] == "evalops", "wrong system" + assert artifact["schema_version"] == 1, "schema_version must be 1" + assert artifact["benchmark_type"] == "eval", "wrong benchmark_type" + assert artifact["metrics"], "metrics missing" + assert artifact["generated_at"], "generated_at missing" + generated = datetime.strptime( + artifact["generated_at"], "%Y-%m-%dT%H:%M:%SZ" + ).replace(tzinfo=timezone.utc) + age = (datetime.now(timezone.utc) - generated).total_seconds() + assert age < 600, f"artifact is stale ({age:.0f}s old)" + print(f"artifact valid; {artifact['metrics']['n_cases']} cases; age {age:.0f}s") + PY + + - name: Commit results if changed + run: | + git config user.name "eleventh-bot" + git config user.email "noreply@eleventh.dev" + git add api/_benchmark_latest.json api/_benchmark_history.json \ + examples/benchmark + if git diff --cached --quiet; then + echo "No benchmark changes to commit." 
+ else + git commit -m "chore(benchmark): scheduled run [skip ci]" + git push + fi + + - name: Live endpoint check (soft) + continue-on-error: true + run: | + sleep 20 + for path in stats benchmark-latest; do + url="https://evalops-workbench.eleventh.dev/api/$path" + echo "GET $url" + curl -s --max-time 30 -A "Mozilla/5.0 ci" "$url" \ + | python -c "import sys, json; d = json.load(sys.stdin); print(' ', {k: d.get(k) for k in ('system', 'mode', 'status', 'schema_version', 'benchmark_type')})" \ + || echo " (endpoint not reachable yet; redeploy may be in flight)" + done diff --git a/api/_benchmark_history.json b/api/_benchmark_history.json new file mode 100644 index 0000000..b5c9553 --- /dev/null +++ b/api/_benchmark_history.json @@ -0,0 +1,15 @@ +[ + { + "run_id": "evalops-2026-05-26-d34c4f66", + "generated_at": "2026-05-26T07:25:34Z", + "pass_rate": 1.0, + "avg_score": 1.0, + "regressions": 0, + "regressed_ids": [], + "gate_verdict": "pass", + "variants": [ + "prompt_v1", + "prompt_v2" + ] + } +] diff --git a/api/_benchmark_latest.json b/api/_benchmark_latest.json new file mode 100644 index 0000000..17185e6 --- /dev/null +++ b/api/_benchmark_latest.json @@ -0,0 +1,52 @@ +{ + "system": "evalops", + "benchmark_type": "eval", + "run_id": "evalops-2026-05-26-d34c4f66", + "fixture": "support-qa", + "metrics": { + "n_cases": 4, + "baseline_variant": "prompt_v1", + "candidate_variant": "prompt_v2", + "baseline_pass_rate": 0.0, + "candidate_pass_rate": 1.0, + "baseline_avg_score": 0.333, + "candidate_avg_score": 1.0, + "pass_rate_delta": 1.0, + "avg_score_delta": 0.667, + "regressions": 0, + "improvements": 4, + "gate_verdict": "pass" + }, + "variants": [ + { + "name": "prompt_v1", + "pass_rate": 0.0, + "avg_score": 0.333, + "passed_cases": 0, + "total_cases": 4 + }, + { + "name": "prompt_v2", + "pass_rate": 1.0, + "avg_score": 1.0, + "passed_cases": 4, + "total_cases": 4 + } + ], + "regressions": [], + "gate": { + "passed": true, + "reasons": [], + "max_regressions": 0, + 
"max_score_drop": 0.0, + "max_pass_rate_drop": 0.0 + }, + "artifact_urls": { + "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark/latest-report.md", + "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/support_qa.json", + "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark/archive/evalops-2026-05-26-d34c4f66.json" + }, + "schema_version": 1, + "generated_at": "2026-05-26T07:25:34Z", + "previous_run": null +} diff --git a/api/benchmark-latest.py b/api/benchmark-latest.py new file mode 100644 index 0000000..299cee9 --- /dev/null +++ b/api/benchmark-latest.py @@ -0,0 +1,76 @@ +"""Public benchmark endpoint: the latest published evaluation run. + +Stdlib-only Vercel Python serverless function. Serves the committed artifact at +``api/_benchmark_latest.json`` (written by ``evalops_workbench.benchmark_runner`` +and refreshed by the nightly cron). The artifact already conforms to the +benchmark-latest specification in TELEMETRY_SCHEMA.md, so this endpoint reads and +returns it directly. The contract forbids HTTP 5xx; a missing artifact yields a +valid ``status: "pending"`` envelope. 
+""" +from __future__ import annotations + +import json +from datetime import datetime, timezone +from http.server import BaseHTTPRequestHandler +from pathlib import Path +from typing import Any + +SYSTEM_SLUG = "evalops" +BENCHMARK_TYPE = "eval" +SCHEMA_VERSION = 1 +ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json" + + +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _pending_payload() -> dict[str, Any]: + """Honest envelope for the window before the first run is published.""" + return { + "system": SYSTEM_SLUG, + "benchmark_type": BENCHMARK_TYPE, + "status": "pending", + "run_id": None, + "metrics": None, + "schema_version": SCHEMA_VERSION, + "generated_at": _now_iso(), + } + + +def build_response() -> dict[str, Any]: + try: + return json.loads(ARTIFACT_FILE.read_text(encoding="utf-8")) + except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError): + return _pending_payload() + + +class handler(BaseHTTPRequestHandler): + """Vercel Python serverless entrypoint.""" + + def _write_common_headers(self) -> None: + self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + + def do_OPTIONS(self) -> None: # noqa: N802 (interface contract) + self.send_response(204) + self._write_common_headers() + self.end_headers() + + def do_GET(self) -> None: # noqa: N802 (interface contract) + try: + payload = build_response() + except Exception: # noqa: BLE001 (last resort: contract forbids 5xx) + payload = _pending_payload() + body = json.dumps(payload, separators=(",", ":")).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "application/json") + self._write_common_headers() + self.send_header("Content-Length", str(len(body))) + self.end_headers() + 
+ self.wfile.write(body) + + def log_message(self, fmt: str, *args: Any) -> None: # noqa: A002, ARG002 + return diff --git a/api/stats.py b/api/stats.py index 69b9754..3ad5228 100644 --- a/api/stats.py +++ b/api/stats.py @@ -1,50 +1,44 @@ -"""Public telemetry endpoint for the showcase deploy. +"""Public telemetry endpoint for EvalOps Workbench (Tier A, live workload). -Stdlib-only Vercel Python serverless function. Reports honest GitHub-derived -signals about the codebase, never simulated workload metrics. The Tier B -endpoint is consumed by the Production Telemetry panel on -https://eleventh.dev. See: +Stdlib-only Vercel Python serverless function. The live workload is the public +benchmark: ``evalops_workbench.benchmark_runner`` runs weekly, persists each +result to the repo, and this endpoint reports honest metrics derived from that +durable history. See: https://github.com/IgnazioDS/IgnazioDS/blob/main/TELEMETRY_SCHEMA.md + +Every value is computed from committed run records. Nothing is simulated, +seeded, or incremented in memory. If no run has been published the endpoint +degrades honestly (status="degraded", zeroed metrics) and never returns 5xx. 
""" from __future__ import annotations import json import os -import re -import time from datetime import datetime, timedelta, timezone from http.server import BaseHTTPRequestHandler from pathlib import Path from typing import Any -from urllib.error import HTTPError, URLError -from urllib.request import Request, urlopen -# --- repo identity --- SYSTEM_SLUG = "evalops" -GITHUB_OWNER = "IgnazioDS" -GITHUB_REPO = "evalops-workbench" - -# --- contract constants --- SCHEMA_VERSION = 1 -HTTP_TIMEOUT_S = 4.0 -CACHE_TTL_S = 300 # 5 min, stays well under GitHub's 60-req/hr unauth cap -# --- safety caps: never expose values larger than these --- +ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json" +HISTORY_FILE = Path(__file__).parent / "_benchmark_history.json" +STATIC_FILE = Path(__file__).parent / "_telemetry_static.json" + +# Sanity caps: never expose values larger than these (defence against a runaway +# history file). The benchmark publishes one run per scheduled invocation. SAFETY_CAPS: dict[str, int] = { - "commits_total": 1_000_000, - "commits_30d": 100_000, - "lines_of_code": 10_000_000, - "repo_stars": 1_000_000, + "eval_runs_total": 1_000_000, + "eval_runs_24h": 10_000, + "regressions_caught_30d": 1_000_000, + "experiments_tracked": 100_000, } -GITHUB_API = "https://api.github.com" -USER_AGENT = "eleventh-telemetry/1.0 (+https://eleventh.dev)" -STATIC_FILE = Path(__file__).parent / "_telemetry_static.json" -# Module-scope cache survives across warm Vercel invocations; cold starts pay -# one GitHub round-trip and prime the cache for ~5min of subsequent requests. 
-_cache: dict[str, Any] = {"ts": 0.0, "payload": None} +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def _cap(name: str, value: int) -> int: @@ -52,159 +46,102 @@ def _cap(name: str, value: int) -> int: return min(value, cap) if cap is not None else value -def _now_iso() -> str: - return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - - -def _load_static() -> dict[str, Any]: - """Read the build-time artifact (lines_of_code, built_at). Missing fields - are silently treated as absent per the spec ("omit rather than estimate").""" +def _read_json(path: Path) -> Any: try: - return json.loads(STATIC_FILE.read_text(encoding="utf-8")) + return json.loads(path.read_text(encoding="utf-8")) except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError): - return {} + return None -def _http_get(url: str) -> tuple[Any, dict[str, str]]: - """Stdlib HTTP GET. Returns (parsed_json, response_headers).""" - req = Request( - url, - headers={"User-Agent": USER_AGENT, "Accept": "application/vnd.github+json"}, - ) - with urlopen(req, timeout=HTTP_TIMEOUT_S) as resp: # noqa: S310 (https only) - body = resp.read().decode("utf-8") - # Headers is a Message object; convert to plain dict for portability. - hdrs = {k.lower(): v for k, v in resp.getheaders()} - return json.loads(body), hdrs - +def _parse_iso(value: Any) -> datetime | None: + if not isinstance(value, str): + return None + try: + return datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc) + except ValueError: + return None -_LAST_PAGE_RE = re.compile(r'<[^>]*[?&]page=(\d+)[^>]*>;\s*rel="last"') +def _within(record: dict, days: int, now: datetime) -> bool: + stamp = _parse_iso(record.get("generated_at")) + return stamp is not None and (now - stamp) <= timedelta(days=days) -def _commits_count_from_link_header(link_header: str, when_no_last: int) -> int: - """Parse the 'last' page number from GitHub's Link header. 
- With per_page=1, the page count IS the total record count. When no Link - header is present (single page of results), fall back to ``when_no_last``. - """ - match = _LAST_PAGE_RE.search(link_header or "") - if match: - return int(match.group(1)) - return when_no_last +def _zeroed_metrics() -> dict[str, Any]: + return { + "eval_runs_total": 0, + "eval_runs_24h": 0, + "last_pass_rate": 0.0, + "rolling_pass_rate_7d": 0.0, + "regressions_caught_30d": 0, + "experiments_tracked": 0, + } -def _fetch_metrics() -> tuple[dict[str, Any], str | None]: - """Pull GitHub-derived metrics. Returns (metrics, last_commit_at).""" - repo, _ = _http_get(f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}") - repo_stars = _cap("repo_stars", int(repo.get("stargazers_count") or 0)) - primary_language = repo.get("language") or "Unknown" +def _metrics_from_history(history: list[dict], now: datetime) -> dict[str, Any]: + if not history: + return _zeroed_metrics() - commits_url = ( - f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}/commits?per_page=1" - ) - latest_commits, latest_hdrs = _http_get(commits_url) - commits_total = _cap( - "commits_total", - _commits_count_from_link_header(latest_hdrs.get("link", ""), len(latest_commits)), - ) - last_commit_at: str | None = None - if latest_commits: - last_commit_at = ( - latest_commits[0].get("commit", {}).get("author", {}).get("date") - ) - - since = (datetime.now(timezone.utc) - timedelta(days=30)).strftime( - "%Y-%m-%dT%H:%M:%SZ" - ) - recent_url = ( - f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}" - f"/commits?per_page=1&since={since}" - ) - recent_commits, recent_hdrs = _http_get(recent_url) - commits_30d = _cap( - "commits_30d", - _commits_count_from_link_header(recent_hdrs.get("link", ""), len(recent_commits)), + runs_7d = [r for r in history if _within(r, 7, now)] + pass_rates_7d = [float(r.get("pass_rate", 0.0)) for r in runs_7d] + rolling_7d = ( + round(sum(pass_rates_7d) / len(pass_rates_7d), 4) + if pass_rates_7d + else 
float(history[-1].get("pass_rate", 0.0)) ) - metrics: dict[str, Any] = { - "commits_30d": commits_30d, - "commits_total": commits_total, - "primary_language": primary_language, - "repo_stars": repo_stars, + variants: set[str] = set() + # Distinct regressions, not a per-run sum: re-detecting the same case nightly + # is not catching a new regression, so the 30-day count unions case ids. + regressed_30d: set[str] = set() + for record in history: + variants.update(record.get("variants", []) or []) + if _within(record, 30, now): + regressed_30d.update(record.get("regressed_ids", []) or []) + + return { + "eval_runs_total": _cap("eval_runs_total", len(history)), + "eval_runs_24h": _cap("eval_runs_24h", sum(1 for r in history if _within(r, 1, now))), + "last_pass_rate": round(float(history[-1].get("pass_rate", 0.0)), 4), + "rolling_pass_rate_7d": rolling_7d, + "regressions_caught_30d": _cap("regressions_caught_30d", len(regressed_30d)), + "experiments_tracked": _cap("experiments_tracked", len(variants)), } - static = _load_static() - loc = static.get("lines_of_code") - if isinstance(loc, int) and loc > 0: - metrics["lines_of_code"] = _cap("lines_of_code", loc) - return metrics, last_commit_at - - -def _zeroed_metrics() -> dict[str, Any]: - metrics: dict[str, Any] = { - "commits_30d": 0, - "commits_total": 0, - "primary_language": "Unknown", - "repo_stars": 0, - } - static = _load_static() - loc = static.get("lines_of_code") - if isinstance(loc, int) and loc > 0: - metrics["lines_of_code"] = _cap("lines_of_code", loc) - return metrics def _build_response() -> dict[str, Any]: - """Compose the full response object. 
Always returns a parseable dict.""" - now = time.time() - cached = _cache.get("payload") - if cached is not None and (now - _cache["ts"]) < CACHE_TTL_S: - fresh = dict(cached) - fresh["generated_at"] = _now_iso() - return fresh - - static = _load_static() - last_deployed_at = ( - os.environ.get("VERCEL_GIT_COMMIT_AUTHOR_DATE") or static.get("built_at") - ) - - try: - metrics, last_commit_at = _fetch_metrics() + now = datetime.now(timezone.utc) + static = _read_json(STATIC_FILE) or {} + last_deployed_at = os.environ.get("VERCEL_GIT_COMMIT_AUTHOR_DATE") or static.get("built_at") + + history = _read_json(HISTORY_FILE) + artifact = _read_json(ARTIFACT_FILE) + + if isinstance(history, list) and history: + metrics = _metrics_from_history(history, now) + last_active_at = ( + artifact.get("generated_at") if isinstance(artifact, dict) else None + ) or history[-1].get("generated_at") status = "operational" - except (HTTPError, URLError, OSError, json.JSONDecodeError, ValueError, TimeoutError): - # Upstream unreachable. Serve last good cache if we have one, - # otherwise zeros. Never propagate the error. - if cached is not None: - stale = dict(cached) - stale["status"] = "degraded" - stale["generated_at"] = _now_iso() - return stale + else: metrics = _zeroed_metrics() - last_commit_at = None + last_active_at = None status = "degraded" - response: dict[str, Any] = { + return { "system": SYSTEM_SLUG, - "mode": "showcase", + "mode": "live", "status": status, "last_deployed_at": last_deployed_at, - "last_commit_at": last_commit_at, + "last_active_at": last_active_at, "metrics": metrics, "schema_version": SCHEMA_VERSION, "generated_at": _now_iso(), } - if status == "operational": - _cache["payload"] = response - _cache["ts"] = now - return response - class handler(BaseHTTPRequestHandler): - """Vercel Python serverless entrypoint. - - Vercel discovers this class by name; the runtime invokes ``do_GET`` / - ``do_OPTIONS`` per the BaseHTTPRequestHandler protocol. 
- """ + """Vercel Python serverless entrypoint.""" def _write_common_headers(self) -> None: self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60") @@ -220,18 +157,17 @@ def do_OPTIONS(self) -> None: # noqa: N802 (interface contract) def do_GET(self) -> None: # noqa: N802 (interface contract) try: payload = _build_response() - except Exception: # noqa: BLE001 (last-resort: contract forbids 5xx) + except Exception: # noqa: BLE001 (last resort: contract forbids 5xx) payload = { "system": SYSTEM_SLUG, - "mode": "showcase", + "mode": "live", "status": "degraded", "last_deployed_at": None, - "last_commit_at": None, + "last_active_at": None, "metrics": _zeroed_metrics(), "schema_version": SCHEMA_VERSION, "generated_at": _now_iso(), } - body = json.dumps(payload, separators=(",", ":")).encode("utf-8") self.send_response(200) self.send_header("Content-Type", "application/json") @@ -241,4 +177,4 @@ def do_GET(self) -> None: # noqa: N802 (interface contract) self.wfile.write(body) def log_message(self, fmt: str, *args: Any) -> None: # noqa: A002, ARG002 - return # Suppress default access log; Vercel captures stdout/stderr. 
+ return diff --git a/examples/benchmark/archive/evalops-2026-05-26-d34c4f66.json b/examples/benchmark/archive/evalops-2026-05-26-d34c4f66.json new file mode 100644 index 0000000..17185e6 --- /dev/null +++ b/examples/benchmark/archive/evalops-2026-05-26-d34c4f66.json @@ -0,0 +1,52 @@ +{ + "system": "evalops", + "benchmark_type": "eval", + "run_id": "evalops-2026-05-26-d34c4f66", + "fixture": "support-qa", + "metrics": { + "n_cases": 4, + "baseline_variant": "prompt_v1", + "candidate_variant": "prompt_v2", + "baseline_pass_rate": 0.0, + "candidate_pass_rate": 1.0, + "baseline_avg_score": 0.333, + "candidate_avg_score": 1.0, + "pass_rate_delta": 1.0, + "avg_score_delta": 0.667, + "regressions": 0, + "improvements": 4, + "gate_verdict": "pass" + }, + "variants": [ + { + "name": "prompt_v1", + "pass_rate": 0.0, + "avg_score": 0.333, + "passed_cases": 0, + "total_cases": 4 + }, + { + "name": "prompt_v2", + "pass_rate": 1.0, + "avg_score": 1.0, + "passed_cases": 4, + "total_cases": 4 + } + ], + "regressions": [], + "gate": { + "passed": true, + "reasons": [], + "max_regressions": 0, + "max_score_drop": 0.0, + "max_pass_rate_drop": 0.0 + }, + "artifact_urls": { + "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark/latest-report.md", + "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/support_qa.json", + "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark/archive/evalops-2026-05-26-d34c4f66.json" + }, + "schema_version": 1, + "generated_at": "2026-05-26T07:25:34Z", + "previous_run": null +} diff --git a/examples/benchmark/latest-report.md b/examples/benchmark/latest-report.md new file mode 100644 index 0000000..e268f9b --- /dev/null +++ b/examples/benchmark/latest-report.md @@ -0,0 +1,27 @@ +# EvalOps benchmark: support-qa + +- Run: `evalops-2026-05-26-d34c4f66` +- Generated: 2026-05-26T07:25:34Z +- Gate verdict: **PASS** + +# EvalOps Comparison 
Report + +- Base run: `run_20260526T072533950741_09b3b0` +- Candidate run: `run_20260526T072533983518_2b6669` +- Average score delta: `+0.667` +- Pass-rate delta: `+1.000` +- Regressions: `0` +- Improvements: `4` + +## Regressions + +No regressions. + +## Improvements + +| case_id | kind | score delta | notes | +| --- | --- | ---: | --- | +| `data_residency` | `fail_to_pass` | `+0.667` | Case data_residency improved from fail to pass. | +| `refund_policy` | `fail_to_pass` | `+0.333` | Case refund_policy improved from fail to pass. | +| `seat_upgrade` | `fail_to_pass` | `+1.000` | Case seat_upgrade improved from fail to pass. | +| `sla_enterprise` | `fail_to_pass` | `+0.667` | Case sla_enterprise improved from fail to pass. | diff --git a/src/app/page.tsx b/src/app/page.tsx index 219523d..672c9fe 100644 --- a/src/app/page.tsx +++ b/src/app/page.tsx @@ -4,57 +4,59 @@ import { useEffect, useState } from "react"; import { ArrowRight, ExternalLink, - GitCommit, + FileText, + FlaskConical, + GitCompare, Github, - Lightbulb, - Star, - TrendingUp, + ShieldAlert, + Target, Users, } from "lucide-react"; -import { fetchPublicStats, type PublicStats } from "@/lib/api"; +import { + fetchBenchmarkLatest, + fetchPublicStats, + type PublicBenchmark, + type PublicStats, +} from "@/lib/api"; import { TopBar } from "@/components/layout/TopBar"; import { Card, CardHeader, CardTitle, CardContent, CardDescription } from "@/components/ui/card"; import { Button } from "@/components/ui/button"; import { Badge } from "@/components/ui/badge"; import { StatusDot } from "@/components/ui/status-dot"; -import { StatCard } from "@/components/dashboard/StatCard"; import { Skeleton } from "@/components/ui/skeleton"; -import { Sparkline } from "@/components/ui/sparkline"; import { PROJECT } from "@/lib/project"; -import { formatRelative } from "@/lib/utils"; +import { formatNumber, formatRelative } from "@/lib/utils"; -/** - * Build a deterministic 10-point shape derived from the live value, so - * 
StatCard sparklines convey velocity without claiming a measured history - * the showcase tier doesn't have. - */ -function shapeFromValue(target: number, points = 10): number[] { - if (target <= 0) return Array(points).fill(0); - const result: number[] = []; - for (let i = 0; i < points; i++) { - const ratio = i / (points - 1); - const eased = ratio * ratio; - const wobble = Math.sin(i + target) * 0.06; - result.push(target * (eased + wobble + 0.1)); - } - return result; +function pct(value: number | undefined): string { + if (value === undefined || Number.isNaN(value)) return "—"; + return `${Math.round(value * 100)}%`; +} + +function signedPoints(value: number): string { + const points = Math.round(value * 100); + return `${points >= 0 ? "+" : ""}${points} pts`; } export default function OverviewPage() { const [stats, setStats] = useState(null); + const [benchmark, setBenchmark] = useState(null); const [loading, setLoading] = useState(true); useEffect(() => { - fetchPublicStats() - .then(setStats) - .catch(() => null) + Promise.allSettled([fetchPublicStats(), fetchBenchmarkLatest()]) + .then(([statsResult, benchmarkResult]) => { + if (statsResult.status === "fulfilled") setStats(statsResult.value); + if (benchmarkResult.status === "fulfilled") setBenchmark(benchmarkResult.value); + }) .finally(() => setLoading(false)); }, []); - const commitsTotal = (stats?.metrics.commits_total as number | undefined) ?? 0; - const commits30d = (stats?.metrics.commits_30d as number | undefined) ?? 0; - const stars = (stats?.metrics.repo_stars as number | undefined) ?? 0; - const loc = (stats?.metrics.lines_of_code as number | undefined) ?? 0; + const metrics = stats?.metrics ?? 
{}; + const lastPass = metrics.last_pass_rate as number | undefined; + const rolling = metrics.rolling_pass_rate_7d as number | undefined; + const regressions = metrics.regressions_caught_30d as number | undefined; + const runs = metrics.eval_runs_total as number | undefined; + const experiments = metrics.experiments_tracked as number | undefined; return ( <> @@ -103,11 +105,7 @@ export default function OverviewPage() { + + + )} + + + ); +} + +function Stat({ + title, + value, + subtitle, + icon: Icon, + loading, +}: { + title: string; + value: string; + subtitle?: string; + icon: typeof Target; + loading: boolean; +}) { + return ( + +
+
+

+ {title} +

+
+ +
+
+ {loading ? ( + + ) : ( +

{value}

+ )} + {subtitle &&

{subtitle}

} +
+
+ ); +} + function StatusCell({ label, value, @@ -262,14 +431,8 @@ function StatusCell({

{label}

-

- {value} -

- {hint && ( -

- {hint} -

- )} +

{value}

+ {hint &&

{hint}

} ); } diff --git a/src/app/telemetry/page.tsx b/src/app/telemetry/page.tsx index f888169..3bfdf58 100644 --- a/src/app/telemetry/page.tsx +++ b/src/app/telemetry/page.tsx @@ -3,11 +3,11 @@ import { useState } from "react"; import { CheckCircle2, - Code2, - GitCommit, - Layers, + FlaskConical, + GitCompare, RefreshCw, - Star, + ShieldAlert, + Target, } from "lucide-react"; import { fetchPublicStats, type PublicStats } from "@/lib/api"; import { TopBar } from "@/components/layout/TopBar"; @@ -28,6 +28,12 @@ import { const POLL_INTERVAL_MS = 30_000; +function pct(value: unknown): string { + return typeof value === "number" && !Number.isNaN(value) + ? `${Math.round(value * 100)}%` + : "—"; +} + export default function TelemetryPage() { const { data: stats, loading, error, refetch } = usePolling( fetchPublicStats, @@ -77,7 +83,7 @@ export default function TelemetryPage() {
- {stats?.mode ?? "showcase"} + {stats?.mode ?? "live"} generated {formatRelative(stats?.generated_at)} @@ -102,61 +108,57 @@ export default function TelemetryPage() { )} - {/* Tier-B metric grid */} + {/* Tier-A metric grid */}
@@ -187,7 +189,7 @@ export default function TelemetryPage() { /> This endpoint runs in{" "} - mode: "showcase" + mode: "live" {" "} per the public schema at{" "} TELEMETRY_SCHEMA.md - . Counters are sourced from the GitHub REST API - (commits, language, stars) plus a build-time line-of-code - snapshot, behind a 5-minute module-scope cache. + . Every metric is computed from the committed history of + the public benchmark, which re-runs on a schedule and on + every change. Nothing is simulated, seeded, or incremented + in memory.

- The endpoint never returns 5xx — GitHub failures degrade - to{" "} + The endpoint never returns 5xx. With no published run it + degrades to{" "} status: "degraded" {" "} - with the last cached response (or zeros) and a - contract-valid envelope. + with zeroed metrics and a contract-valid envelope. The + latest run is served at{" "} + + /api/benchmark-latest + + .

- {`curl -i https://${PROJECT.slug}.vercel.app/api/stats`} + {`curl -i https://${PROJECT.slug}.eleventh.dev/api/stats`}
@@ -284,7 +291,7 @@ function MetricTile({ }: { label: string; value: string; - icon: typeof GitCommit; + icon: typeof CheckCircle2; loading: boolean; }) { return ( diff --git a/src/evalops_workbench/benchmark_runner.py b/src/evalops_workbench/benchmark_runner.py new file mode 100644 index 0000000..1954979 --- /dev/null +++ b/src/evalops_workbench/benchmark_runner.py @@ -0,0 +1,226 @@ +"""Run the public benchmark on top of the workbench engine and publish it. + +This is the public-proof layer over ``workbench.py``: it runs the canonical +eval engine over the committed support-QA dataset, comparing the baseline +prompt variant against the candidate, gates regressions, and writes a +schema-conformed artifact the stdlib /api/benchmark-latest and /api/stats +endpoints serve. Re-running on the committed fixture reproduces the numbers. + +The workbench engine (DuckDB-backed) runs here in CI; the deployed endpoints +only ever read the committed JSON artifact. +""" +from __future__ import annotations + +import hashlib +import json +import shutil +import tempfile +from datetime import datetime, timezone +from pathlib import Path + +from .workbench import ( + assess_gate, + compare_runs, + format_comparison_markdown, + run_evaluation, +) + +SYSTEM_SLUG = "evalops" +BENCHMARK_TYPE = "eval" +SCHEMA_VERSION = 1 +FIXTURE_ID = "support-qa" +DATASET_REL = "examples/support_qa.json" +BASELINE_VARIANT = "prompt_v1" +CANDIDATE_VARIANT = "prompt_v2" + +# Gate thresholds: any regression, any aggregate drop, blocks. The candidate +# must not lose ground on the pinned dataset. 
+GATE_KWARGS = {"max_regressions": 0, "max_score_drop": 0.0, "max_pass_rate_drop": 0.0} + +_RAW_BASE = "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main" +_HISTORY_KEEP = 100 + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[2] + + +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _read_history(path: Path) -> list[dict]: + try: + data = json.loads(path.read_text(encoding="utf-8")) + except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError): + return [] + return data if isinstance(data, list) else [] + + +def _run_id(generated_at: str, base, candidate) -> str: + seed = f"{candidate.pass_rate}|{candidate.avg_score}|{base.pass_rate}|{base.avg_score}|{candidate.total_cases}" + digest = hashlib.sha256(seed.encode("utf-8")).hexdigest()[:8] + return f"{SYSTEM_SLUG}-{generated_at[:10]}-{digest}" + + +def _variant_row(summary) -> dict: + return { + "name": summary.variant, + "pass_rate": round(summary.pass_rate, 4), + "avg_score": round(summary.avg_score, 4), + "passed_cases": summary.passed_cases, + "total_cases": summary.total_cases, + } + + +def _regression_rows(comparison) -> list[dict]: + return [ + { + "case_id": item.get("case_id"), + "score_delta": item.get("score_delta"), + "reason": item.get("summary", ""), + } + for item in comparison.regressions + ] + + +def _previous_block(previous: dict | None, metrics: dict) -> dict | None: + if not previous: + return None + return { + "run_id": previous.get("run_id"), + "generated_at": previous.get("generated_at"), + "delta": { + "pass_rate": round(metrics["candidate_pass_rate"] - float(previous.get("pass_rate", 0.0)), 4), + "avg_score": round(metrics["candidate_avg_score"] - float(previous.get("avg_score", 0.0)), 4), + "regressions": metrics["regressions"] - int(previous.get("regressions", 0)), + }, + } + + +def _build_artifact(*, base, candidate, comparison, gate, generated_at, previous) -> dict: + run_id = 
def _slim_record(artifact: dict) -> dict:
    """Reduce a full benchmark artifact to the compact row kept in history.

    Args:
        artifact: A published artifact containing ``run_id``,
            ``generated_at``, ``metrics``, ``regressions`` and ``variants``.

    Returns:
        A dict with the run identity, the candidate's headline numbers, the
        ids of regressed cases, the gate verdict and the variant names.

    Raises:
        KeyError: If the artifact lacks any of the fields read here.
    """
    summary = artifact["metrics"]

    # Key insertion order is kept stable because the record is serialised
    # straight into the committed history file.
    record = {
        "run_id": artifact["run_id"],
        "generated_at": artifact["generated_at"],
    }
    record["pass_rate"] = summary["candidate_pass_rate"]
    record["avg_score"] = summary["candidate_avg_score"]
    record["regressions"] = summary["regressions"]

    regressed_ids = []
    for row in artifact["regressions"]:
        regressed_ids.append(row["case_id"])
    record["regressed_ids"] = regressed_ids

    record["gate_verdict"] = summary["gate_verdict"]

    variant_names = []
    for variant in artifact["variants"]:
        variant_names.append(variant["name"])
    record["variants"] = variant_names

    return record
def run_benchmark(repo_root: Path | None = None, *, write: bool = True) -> dict:
    """Evaluate baseline and candidate variants, then optionally publish.

    Both variants are evaluated over the pinned dataset inside a temporary
    workspace that is removed afterwards, compared, and gated with
    ``GATE_KWARGS``.

    Args:
        repo_root: Repository root to read the dataset from and publish
            into; falls back to ``_repo_root()`` when not given.
        write: When true, write the latest artifact, its archive copy and
            the markdown report, and append a slim record to the bounded
            history. When false nothing is written to the repository.

    Returns:
        The artifact dict built for this run.
    """
    root = repo_root if repo_root else _repo_root()
    api_dir = root / "api"
    bench_dir = root / "examples" / "benchmark"
    dataset = root / DATASET_REL
    history_path = api_dir / "_benchmark_history.json"
    latest_path = api_dir / "_benchmark_latest.json"

    # Engine state lives only in a scratch directory; it is always removed,
    # even when a step of the evaluation raises.
    scratch = Path(tempfile.mkdtemp(prefix="evalops-bench-"))
    try:
        base = run_evaluation(dataset, BASELINE_VARIANT, scratch, root)
        candidate = run_evaluation(dataset, CANDIDATE_VARIANT, scratch, root)
        comparison = compare_runs(base.run_id, candidate.run_id, scratch)
        gate = assess_gate(comparison, **GATE_KWARGS)
        report_md = format_comparison_markdown(comparison, limit=20)
    finally:
        shutil.rmtree(scratch, ignore_errors=True)

    generated_at = _now_iso()
    prior_records = _read_history(history_path)
    previous = prior_records[-1] if prior_records else None
    artifact = _build_artifact(
        base=base,
        candidate=candidate,
        comparison=comparison,
        gate=gate,
        generated_at=generated_at,
        previous=previous,
    )

    if not write:
        return artifact

    run_id = artifact["run_id"]
    _write_json(latest_path, artifact)
    _write_json(bench_dir / "archive" / f"{run_id}.json", artifact)

    report_path = bench_dir / "latest-report.md"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    header = "".join(
        [
            f"# EvalOps benchmark: {FIXTURE_ID}\n\n",
            f"- Run: `{artifact['run_id']}`\n",
            f"- Generated: {generated_at}\n",
            f"- Gate verdict: **{artifact['metrics']['gate_verdict'].upper()}**\n\n",
        ]
    )
    report_path.write_text(header + report_md + "\n", encoding="utf-8")

    # Append this run and keep only the newest _HISTORY_KEEP records.
    history = _read_history(history_path)
    history.append(_slim_record(artifact))
    _write_json(history_path, history[-_HISTORY_KEEP:])

    return artifact
cases={metrics['n_cases']} " + f"candidate={metrics['candidate_variant']} pass_rate={metrics['candidate_pass_rate']:.3f} " + f"(vs baseline {metrics['pass_rate_delta']:+.3f}) " + f"avg_score={metrics['candidate_avg_score']:.3f} " + f"regressions={metrics['regressions']} improvements={metrics['improvements']} " + f"gate={metrics['gate_verdict']}" + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/lib/api.ts b/src/lib/api.ts index aa5474c..449069f 100644 --- a/src/lib/api.ts +++ b/src/lib/api.ts @@ -1,6 +1,7 @@ -// Slim API surface for the showcase dashboard. -// Only the public /api/stats endpoint is real on showcase deploys; the -// Tier-A BFF endpoints (run, documents, ui/*) don't exist here. +// Slim API surface for the dashboard. +// Two public, unauthenticated endpoints are real on this deploy: +// /api/stats — Tier-A telemetry (TELEMETRY_SCHEMA.md) +// /api/benchmark-latest — the latest published benchmark run async function publicFetch(path: string, init?: RequestInit): Promise { const res = await fetch(path, { @@ -13,19 +14,21 @@ async function publicFetch(path: string, init?: RequestInit): Promise { return res.json() as Promise; } -/** Tier-B telemetry response — see TELEMETRY_SCHEMA.md. */ +/** Tier-A telemetry response — see TELEMETRY_SCHEMA.md. 
*/ export interface PublicStats { system: string; mode?: "live" | "showcase"; status: "operational" | "degraded" | "down"; last_deployed_at: string | null; + last_active_at?: string | null; last_commit_at?: string | null; metrics: { - commits_30d?: number; - commits_total?: number; - primary_language?: string; - repo_stars?: number; - lines_of_code?: number; + eval_runs_total?: number; + eval_runs_24h?: number; + last_pass_rate?: number; + rolling_pass_rate_7d?: number; + regressions_caught_30d?: number; + experiments_tracked?: number; [key: string]: number | string | undefined; }; schema_version: number; @@ -35,3 +38,65 @@ export interface PublicStats { export function fetchPublicStats(): Promise { return publicFetch("/api/stats"); } + +/** One variant's aggregate in a benchmark run (rubric-scored). */ +export interface BenchmarkVariant { + name: string; + pass_rate: number; + avg_score: number; + passed_cases: number; + total_cases: number; +} + +/** A per-case regression surfaced by the run. */ +export interface BenchmarkRegression { + case_id: string; + score_delta: number; + reason: string; +} + +export interface BenchmarkMetrics { + n_cases: number; + baseline_variant: string; + candidate_variant: string; + baseline_pass_rate: number; + candidate_pass_rate: number; + baseline_avg_score: number; + candidate_avg_score: number; + pass_rate_delta: number; + avg_score_delta: number; + regressions: number; + improvements: number; + gate_verdict: string; +} + +/** /api/benchmark-latest response — see the benchmark-latest spec in TELEMETRY_SCHEMA.md. 
*/ +export interface PublicBenchmark { + system: string; + benchmark_type: string; + status?: string; + run_id: string | null; + fixture?: string; + metrics: BenchmarkMetrics | null; + variants?: BenchmarkVariant[]; + regressions?: BenchmarkRegression[]; + gate?: { + passed: boolean; + reasons: string[]; + max_regressions: number; + max_score_drop: number; + max_pass_rate_drop: number; + }; + artifact_urls?: { report: string; fixture: string; run: string }; + schema_version: number; + generated_at: string; + previous_run?: { + run_id: string | null; + generated_at: string | null; + delta: Record; + } | null; +} + +export function fetchBenchmarkLatest(): Promise { + return publicFetch("/api/benchmark-latest"); +} diff --git a/tests/test_benchmark_endpoint.py b/tests/test_benchmark_endpoint.py new file mode 100644 index 0000000..c24df67 --- /dev/null +++ b/tests/test_benchmark_endpoint.py @@ -0,0 +1,75 @@ +"""Unit tests for the /api/benchmark-latest serverless function. + +The module file name contains a hyphen (matching the Vercel route), so it is +loaded by path rather than imported by name. 
class BenchmarkEndpointTests(unittest.TestCase):
    """Exercise ``build_response`` of the /api/benchmark-latest endpoint."""

    def setUp(self) -> None:
        self.mod = _load_endpoint()
        self._orig_artifact = self.mod.ARTIFACT_FILE

    def tearDown(self) -> None:
        self.mod.ARTIFACT_FILE = self._orig_artifact

    def test_returns_committed_artifact(self) -> None:
        """A readable artifact file is served with its identifying fields."""
        artifact = {
            "system": "evalops",
            "benchmark_type": "eval",
            "run_id": "evalops-2026-05-26-abc12345",
            "metrics": {"token_f1": 0.65},
            "schema_version": 1,
            "generated_at": "2026-05-26T00:00:00Z",
        }
        # TemporaryDirectory + addCleanup removes the scratch directory even
        # when an assertion fails; a bare mkdtemp() would leave one behind on
        # every test run.
        scratch = tempfile.TemporaryDirectory()
        self.addCleanup(scratch.cleanup)
        path = Path(scratch.name) / "_benchmark_latest.json"
        path.write_text(json.dumps(artifact), encoding="utf-8")
        self.mod.ARTIFACT_FILE = path

        response = self.mod.build_response()
        self.assertEqual(response["system"], "evalops")
        self.assertEqual(response["run_id"], "evalops-2026-05-26-abc12345")
        self.assertEqual(response["schema_version"], 1)

    def test_pending_when_artifact_missing(self) -> None:
        """With no artifact on disk the endpoint answers a pending envelope."""
        self.mod.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json")
        response = self.mod.build_response()
        self.assertEqual(response["system"], "evalops")
        self.assertEqual(response["status"], "pending")
        self.assertEqual(response["benchmark_type"], "eval")
        self.assertIsNone(response["run_id"])
        self.assertEqual(response["schema_version"], 1)

    def test_seeded_repo_artifact_is_valid(self) -> None:
        """The committed artifact in the repo must be schema-valid."""
        response = self.mod.build_response()
        # In the repo the seed exists; if a developer cleared it, accept pending.
        if response.get("status") == "pending":
            self.skipTest("no seeded artifact present")
        self.assertEqual(response["system"], "evalops")
        self.assertEqual(response["schema_version"], 1)
        self.assertIn("metrics", response)
        self.assertIn("generated_at", response)
+ self.assertGreaterEqual(metrics["candidate_pass_rate"], metrics["baseline_pass_rate"]) + + self.assertEqual(len(artifact["variants"]), 2) + for variant in artifact["variants"]: + self.assertEqual(set(variant), {"name", "pass_rate", "avg_score", "passed_cases", "total_cases"}) + + def test_run_id_is_deterministic(self) -> None: + first = run_benchmark(write=False)["run_id"] + second = run_benchmark(write=False)["run_id"] + self.assertEqual(first, second) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stats.py b/tests/test_stats.py index 32c876a..df001fb 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -1,156 +1,97 @@ -"""Unit tests for the /api/stats Vercel serverless function. +"""Unit tests for the /api/stats Vercel serverless function (Tier A, live). Covers: -- happy path: GitHub reachable, response shape matches Tier B contract -- degraded path: GitHub unreachable, contract still satisfied with status="degraded" -- safety caps: oversize values are clamped -- never returns 5xx (handler always emits HTTP 200) +- live path: benchmark history present, metrics derived from records +- degraded path: no history yet, contract satisfied with zeroed metrics +- schema shape matches the evalops Tier-A contract in TELEMETRY_SCHEMA.md +- safety caps and the never-5xx handler guarantee """ from __future__ import annotations import io import json import sys +import tempfile import unittest from pathlib import Path -from unittest.mock import MagicMock, patch -from urllib.error import URLError +from unittest.mock import MagicMock -# Add repo root to sys.path so we can import the api/stats.py module. +# Add repo root /api to sys.path so we can import the api/stats.py module. 
def _write_history(records: list[dict]) -> Path:
    """Dump *records* as JSON into a fresh temporary history file.

    Args:
        records: History records to serialise.

    Returns:
        Path to a ``_benchmark_history.json`` file inside a newly created
        temporary directory.
    """
    scratch_dir = tempfile.mkdtemp()
    target = Path(scratch_dir).joinpath("_benchmark_history.json")
    payload = json.dumps(records)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(payload)
    return target
test_live_operational_from_history(self) -> None: + stats.HISTORY_FILE = _write_history( + [ + {"run_id": "r1", "generated_at": stats._now_iso(), "pass_rate": 0.6, + "regressions": 2, "regressed_ids": ["a", "b"], + "variants": ["overlap_sentence", "span_extract"]}, + {"run_id": "r2", "generated_at": stats._now_iso(), "pass_rate": 0.7, + "regressions": 2, "regressed_ids": ["b", "c"], + "variants": ["span_extract", "first_sentence"]}, + ] + ) + stats.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json") + response = stats._build_response() + + self.assertEqual(response["mode"], "live") self.assertEqual(response["status"], "operational") - self.assertEqual(response["system"], stats.SYSTEM_SLUG) - self.assertIn("metrics", response) - self.assertEqual(response["metrics"]["repo_stars"], 7) - self.assertEqual(response["metrics"]["primary_language"], "Python") - self.assertEqual(response["metrics"]["commits_total"], 42) - self.assertEqual(response["last_commit_at"], "2026-04-26T12:00:00Z") - # generated_at is ISO-8601 with Z suffix. + self.assertEqual(response["schema_version"], 1) + self.assertEqual(set(response["metrics"]), _TIER_A_METRICS) + self.assertEqual(response["metrics"]["eval_runs_total"], 2) + self.assertEqual(response["metrics"]["last_pass_rate"], 0.7) + # Distinct regressions across the window: union of {a,b} and {b,c} = 3. 
+ self.assertEqual(response["metrics"]["regressions_caught_30d"], 3) + self.assertEqual(response["metrics"]["experiments_tracked"], 3) self.assertTrue(response["generated_at"].endswith("Z")) - def test_degraded_when_github_unreachable(self) -> None: - with patch.object(stats, "urlopen", side_effect=URLError("offline")): - response = stats._build_response() + def test_degraded_without_history(self) -> None: + stats.HISTORY_FILE = Path("/nonexistent/_benchmark_history.json") + stats.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json") + response = stats._build_response() - self.assertEqual(response["schema_version"], 1) - self.assertEqual(response["mode"], "showcase") + self.assertEqual(response["mode"], "live") self.assertEqual(response["status"], "degraded") - self.assertEqual(response["metrics"]["commits_total"], 0) - self.assertEqual(response["metrics"]["repo_stars"], 0) - self.assertIsNone(response["last_commit_at"]) - - def test_serves_stale_cache_on_subsequent_failure(self) -> None: - # First call: successful. Second call: GitHub is down. Expect status - # to flip to "degraded" but the metric values from the cache are kept. - repo_payload = {"stargazers_count": 11, "language": "Go"} - commit_payload = [ - {"commit": {"author": {"date": "2026-04-25T08:00:00Z"}}} - ] - - def good(req, timeout=None): - if "/commits" not in req.full_url: - return _fake_response(repo_payload) - return _fake_response( - commit_payload, - link_header=( - '; rel="next", ' - '; rel="last"' - ), - ) - - with patch.object(stats, "urlopen", side_effect=good): - first = stats._build_response() - self.assertEqual(first["status"], "operational") - - with patch.object(stats, "_fetch_metrics", side_effect=URLError("offline")): - # Force cache miss by advancing the clock past the TTL. 
class SafetyCapTests(unittest.TestCase):
    """Check ``stats._cap`` clamps oversize values and ignores unknown keys."""

    def test_caps_clamp(self) -> None:
        # Each row: (metric key, raw value, value expected back from _cap).
        # The last row uses a key with no cap, which must pass through as-is.
        expectations = (
            ("eval_runs_total", 9_999_999, 1_000_000),
            ("experiments_tracked", 999_999, 100_000),
            ("not_a_field", 42, 42),
        )
        for key, raw, expected in expectations:
            self.assertEqual(stats._cap(key, raw), expected)
h = stats.handler.__new__(stats.handler) h.rfile = rfile h.wfile = wfile @@ -161,39 +102,34 @@ def makefile(self, *_args: object, **_kwargs: object) -> io.BytesIO: h.request_version = "HTTP/1.0" h.headers = {} h.requestline = f"{method} /api/stats HTTP/1.0" - if method == "OPTIONS": h.do_OPTIONS() else: - with patch.object(stats, "urlopen", side_effect=URLError("test")): - h.do_GET() - + h.do_GET() raw = wfile.getvalue().decode("utf-8", errors="replace") head, _, body = raw.partition("\r\n\r\n") - status_line = head.split("\r\n", 1)[0] - status_code = int(status_line.split(" ", 2)[1]) - hdrs = {} + status_code = int(head.split("\r\n", 1)[0].split(" ", 2)[1]) + headers = {} for line in head.split("\r\n")[1:]: if ": " in line: - k, v = line.split(": ", 1) - hdrs[k] = v - return status_code, hdrs, body.encode("utf-8") + key, value = line.split(": ", 1) + headers[key] = value + return status_code, headers, body.encode("utf-8") - def test_get_returns_200_even_when_upstream_fails(self) -> None: - status, hdrs, body = self._invoke("GET") + def test_get_returns_200_with_valid_contract(self) -> None: + status, headers, body = self._invoke("GET") self.assertEqual(status, 200) - self.assertEqual(hdrs.get("Content-Type"), "application/json") - self.assertEqual(hdrs.get("Access-Control-Allow-Origin"), "*") - self.assertIn("max-age=30", hdrs.get("Cache-Control", "")) + self.assertEqual(headers.get("Content-Type"), "application/json") + self.assertEqual(headers.get("Access-Control-Allow-Origin"), "*") + self.assertIn("max-age=30", headers.get("Cache-Control", "")) payload = json.loads(body) self.assertEqual(payload["schema_version"], 1) - self.assertEqual(payload["status"], "degraded") + self.assertEqual(payload["mode"], "live") def test_options_returns_204(self) -> None: - status, hdrs, _ = self._invoke("OPTIONS") + status, headers, _ = self._invoke("OPTIONS") self.assertEqual(status, 204) - self.assertEqual(hdrs.get("Access-Control-Allow-Origin"), "*") - 
self.assertEqual(hdrs.get("Access-Control-Allow-Methods"), "GET, OPTIONS") + self.assertEqual(headers.get("Access-Control-Allow-Methods"), "GET, OPTIONS") if __name__ == "__main__": diff --git a/uv.lock b/uv.lock index 02475fd..2af2536 100644 --- a/uv.lock +++ b/uv.lock @@ -2,7 +2,49 @@ version = 1 revision = 3 requires-python = ">=3.11" +[[package]] +name = "duckdb" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/00/d579dcb2a536b6ea3a2563cdad6844f77d81a9b2d4b22a858097f2468acf/duckdb-1.5.3.tar.gz", hash = "sha256:df39428eb130faa35ae96fd35245bdeae6ecf43936250b116b5fead568eb9f16", size = 18026640, upload-time = "2026-05-20T11:55:31.901Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/fc/a8a89c6c73f31c2b58c6abbc2f543e0b736042dd5ef7cc1784c24ec31428/duckdb-1.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:341a2672e2551ba51c95c1898f0ade983e76675e79038ccb16342c3d6cfb82d7", size = 32583465, upload-time = "2026-05-20T11:54:13.132Z" }, + { url = "https://files.pythonhosted.org/packages/63/f1/3423a2f523dd034e505d4a5dd8e210ae577212e152598dc13b6a5e736e1b/duckdb-1.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c9e8fa408705081160ede7ead238d16e73a36b8561b700f2bf2d650ae48e7b92", size = 17278520, upload-time = "2026-05-20T11:54:16.368Z" }, + { url = "https://files.pythonhosted.org/packages/e1/1a/7bf5ba1b7ea520557e6b2dbee1c85abab016bdac0c1779d9d0ef76c87300/duckdb-1.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:70a18f932cf6d87bd0e554613657a515c1443a1724aacfc7ec5137dd28698b03", size = 15424794, upload-time = "2026-05-20T11:54:19.891Z" }, + { url = "https://files.pythonhosted.org/packages/ad/16/ce4b1e386e45fab0268edbf1b85bace20e9437589e9edb2bd5f9a226fa44/duckdb-1.5.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e80eb4d0fb59869cb2c7d7ef494c07fb92014fe8e77d96c170cd1ebc1488a708", size = 19306666, upload-time = 
"2026-05-20T11:54:22.77Z" }, + { url = "https://files.pythonhosted.org/packages/99/1f/651f8453f26931e8061b7e27b3090f868868185814ecb9216d0bd71ec8ef/duckdb-1.5.3-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3248b49cd835ea322574bc6aac0ae7a83be85547f49d4f5f5777cb380ee6627f", size = 21418306, upload-time = "2026-05-20T11:54:25.616Z" }, + { url = "https://files.pythonhosted.org/packages/bc/64/e1ffebf010b1631a6fef8d1508f46d4eab3e97c18729af986bb796fa8452/duckdb-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:f4eff89c12c3a362efa012262e57b7b4ab904a7f79bad9178fe365510077abe8", size = 13101423, upload-time = "2026-05-20T11:54:28.107Z" }, + { url = "https://files.pythonhosted.org/packages/e7/42/b1d4e34f9658cc0e13d7aae581ab82643f50a548d5aee8767f0c587cc3a4/duckdb-1.5.3-cp311-cp311-win_arm64.whl", hash = "sha256:75d13308c9da3ee431d1e72b8ab720aa74a1b3e9159d4124cb62435924496334", size = 13951740, upload-time = "2026-05-20T11:54:30.886Z" }, + { url = "https://files.pythonhosted.org/packages/e7/c4/2e34929b16c8d544ef664fad8f7f3a2a9db05746aae1e7c8c4ee3a8b23e4/duckdb-1.5.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ff11a457258148337ef9a392148a8cdbd1069b6c27c21958816c7b67fe6c542d", size = 32626494, upload-time = "2026-05-20T11:54:33.738Z" }, + { url = "https://files.pythonhosted.org/packages/3a/53/3af681793d03771365ae3e2215331151c196a3ac8193f613344840694671/duckdb-1.5.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5fd25f533cb1b6b2c84cc767a9a9bab7769bb1aa44571a2a0bfc91ac3e4a38ac", size = 17301121, upload-time = "2026-05-20T11:54:36.928Z" }, + { url = "https://files.pythonhosted.org/packages/15/e2/c80af1eac2ab5d35fc2c372ef0a84668842e549fbbf7799277b3fccf3e39/duckdb-1.5.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:10960400ed60cdf0fe05bab2086fa8eb733889cb0ceca18d07ff9a00c0e0be7b", size = 15449283, upload-time = "2026-05-20T11:54:39.777Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/9a/c63af233c9f761bf5178a5210437e1bc6bcb30fa8a9073de6398cfb12c03/duckdb-1.5.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c5f18e7561403054433706c187589e86629a7af09a7efc23a06a8b308e6acc68", size = 19332762, upload-time = "2026-05-20T11:54:42.51Z" }, + { url = "https://files.pythonhosted.org/packages/21/cc/2d77af4fff86012f334ef82e6d54a995a86c8745e58074f1218ed7d25171/duckdb-1.5.3-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fb7516255a8764545e30f7efacea408cc847764a3027b3b0b3e7d1a7bebbc5c", size = 21453290, upload-time = "2026-05-20T11:54:45.272Z" }, + { url = "https://files.pythonhosted.org/packages/8d/5e/9bc4817a98feb4dab83e56f2245cd3a30d00ee646d4dec7926464e2b3f28/duckdb-1.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:8001eccbc28be244dfd04d708526f34ddd6460b47a8aeb5d0e39d6f7f9e3fe15", size = 13118308, upload-time = "2026-05-20T11:54:48.058Z" }, + { url = "https://files.pythonhosted.org/packages/81/35/e3f32e4e53e2450ddb1db8312a17d1ce455d60cc4941b6ad2cfc908794b0/duckdb-1.5.3-cp312-cp312-win_arm64.whl", hash = "sha256:6d2835e39bb6af73891f73c0f8d4324f98afe00d0b00c6d34b2a582c2256cbb0", size = 13927187, upload-time = "2026-05-20T11:54:50.584Z" }, + { url = "https://files.pythonhosted.org/packages/cc/9c/a528eb09d8be51954c485864bd06753e616939a080cbc3dd4417e8c94a57/duckdb-1.5.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e75a6122c12579a99848517f6f00a4e342aebda3590c30fe9b5cc5f39d5e6afc", size = 32626254, upload-time = "2026-05-20T11:54:53.65Z" }, + { url = "https://files.pythonhosted.org/packages/ec/3c/1534c0a6db347c05eb7d0f6ecfb7aefbe74cbff398e4892a8fd1903a20e8/duckdb-1.5.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fd3963c1cb9d9567777f4a898a9dbe388a2fe9724681801b1e7d6d93eecf1b76", size = 17300917, upload-time = "2026-05-20T11:54:56.628Z" }, + { url = 
"https://files.pythonhosted.org/packages/23/fa/beafb91e6e152d2161c4a9cbc472334c87607eb61ad7104b5a7fa8d8d7b1/duckdb-1.5.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3d5db8c0b55e072cf437948ebb5d7e23d7b9d03d905fa5f9145583e65aa447f7", size = 15449411, upload-time = "2026-05-20T11:54:59.089Z" }, + { url = "https://files.pythonhosted.org/packages/50/0a/49b6fe04e2fcd63729eb607dadd44818dde77342a4f5ce086c6c92f1dd4d/duckdb-1.5.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ce80aed7a538422129a57eaca9141e3afb51f8bf562b1908b1576c9725b5b22", size = 19333120, upload-time = "2026-05-20T11:55:01.727Z" }, + { url = "https://files.pythonhosted.org/packages/63/4c/0907c3f76adb9dd90e67610b31e0304a35814e65c4c41a354a262c09b885/duckdb-1.5.3-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:787df63824f07bf18022dbc3b8ca4b2bfab0ebe616464f55c6e8cd0f59ea762e", size = 21453266, upload-time = "2026-05-20T11:55:04.5Z" }, + { url = "https://files.pythonhosted.org/packages/6d/9c/d2f23a7803ddbbd9413f7572ecf66a15120ed5ced7ce5c73e698c1406b76/duckdb-1.5.3-cp313-cp313-win_amd64.whl", hash = "sha256:bb5bb5dcdd09d62ee60f0ddbbef918e71cce304ffe28428b1131949d39ffaabf", size = 13118640, upload-time = "2026-05-20T11:55:07.389Z" }, + { url = "https://files.pythonhosted.org/packages/27/d5/7ba2316415bcdab6edd765bbbe35c2ca8a3800f2fe695cd70e3cdb997f09/duckdb-1.5.3-cp313-cp313-win_arm64.whl", hash = "sha256:2fa17ecdd5d3db122836cb71bb93601c2106a3be883c17dffddc02fbf3fa7888", size = 13926409, upload-time = "2026-05-20T11:55:10.166Z" }, + { url = "https://files.pythonhosted.org/packages/a5/c2/d4b6f8a5e4d3bc25773be6da76a99d9661ebbf3552c007c460d2dd59dbf8/duckdb-1.5.3-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:4bfa9a4dadf71e83e2c4eaca2f9421c82a54defecc1b0b4c0be95e2389dec4fe", size = 32636685, upload-time = "2026-05-20T11:55:13.158Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/58/e835c8298979d29db7a62cb5acc29e9b57aeaca7cdde2fcd3ac980f5cb18/duckdb-1.5.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:aea7baf67ad7e1829ac76f67d7dcbd7fb1f57c3eb179d55ac30952df4709ae30", size = 17308134, upload-time = "2026-05-20T11:55:16.194Z" }, + { url = "https://files.pythonhosted.org/packages/c9/46/617b51363f5613418c8b224b3cce16b58e6dde80904566bec232579c1d4e/duckdb-1.5.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b0b4f088a65d77e1217ce5d7eff889e63fedc44281200d899ff47c84d8ff836", size = 15449891, upload-time = "2026-05-20T11:55:18.687Z" }, + { url = "https://files.pythonhosted.org/packages/b3/72/354146656e8d9ba3853d3a5ee80a481b8c5f70edfc3d5ae80a8c4479c967/duckdb-1.5.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe8d0c1f6a120aa03fa6e0d03897c71a1842e6cf7afd31d181348391f7108fe1", size = 19338499, upload-time = "2026-05-20T11:55:21.34Z" }, + { url = "https://files.pythonhosted.org/packages/56/8f/65fc623b51448f2bfba1a9ec6ab3debb4664c0876c0113a5e782600b53ac/duckdb-1.5.3-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0405eae18ec6e8210a471c97dbfe87a7e4d605274b7fe572a1f276e92158f13", size = 21455828, upload-time = "2026-05-20T11:55:23.847Z" }, + { url = "https://files.pythonhosted.org/packages/2b/db/d0274cbe9f5fe219f77c0bdf900ac77103569e83c102a4225ce04cbc607d/duckdb-1.5.3-cp314-cp314-win_amd64.whl", hash = "sha256:33ae08b3e818d7613d8936744b67718c2062c2f530376895bfd89efb51b81538", size = 13640011, upload-time = "2026-05-20T11:55:26.276Z" }, + { url = "https://files.pythonhosted.org/packages/07/5d/8f1899b8bef291caf953992fcd6c24df9f29387a35645e58c2504a5ca473/duckdb-1.5.3-cp314-cp314-win_arm64.whl", hash = "sha256:746433e49bbc667b4df283153415fbe37e9083e0eff6c3cd6e54de7536869cd4", size = 14411554, upload-time = "2026-05-20T11:55:29.037Z" }, +] + [[package]] name = "evalops-workbench" version = "0.1.0" source = { editable = "." 
} +dependencies = [ + { name = "duckdb" }, +] + +[package.metadata] +requires-dist = [{ name = "duckdb", specifier = ">=1.1" }] diff --git a/vercel.json b/vercel.json index 65c9c11..67dd946 100644 --- a/vercel.json +++ b/vercel.json @@ -13,6 +13,15 @@ { "key": "Access-Control-Allow-Headers", "value": "Content-Type" }, { "key": "Cache-Control", "value": "public, max-age=30, stale-while-revalidate=60" } ] + }, + { + "source": "/api/benchmark-latest", + "headers": [ + { "key": "Access-Control-Allow-Origin", "value": "*" }, + { "key": "Access-Control-Allow-Methods", "value": "GET, OPTIONS" }, + { "key": "Access-Control-Allow-Headers", "value": "Content-Type" }, + { "key": "Cache-Control", "value": "public, max-age=30, stale-while-revalidate=60" } + ] } ] } From 072c65cc4b2d11d033669f1802b8c57cb5501633 Mon Sep 17 00:00:00 2001 From: Ignazio De Santis Date: Tue, 26 May 2026 15:59:37 +0800 Subject: [PATCH 2/3] feat(stats): add workload:"benchmark" disclosure to the live envelope Distinguishes EvalOps' synthetic benchmark workload from production user traffic (NexusRAG) on the shared homepage telemetry grid. Fleet-wide field agreed for all Tier-A prototypes. 
--- api/stats.py | 2 ++ tests/test_stats.py | 1 + 2 files changed, 3 insertions(+) diff --git a/api/stats.py b/api/stats.py index 3ad5228..c17d23b 100644 --- a/api/stats.py +++ b/api/stats.py @@ -131,6 +131,7 @@ def _build_response() -> dict[str, Any]: return { "system": SYSTEM_SLUG, "mode": "live", + "workload": "benchmark", "status": status, "last_deployed_at": last_deployed_at, "last_active_at": last_active_at, @@ -161,6 +162,7 @@ def do_GET(self) -> None: # noqa: N802 (interface contract) payload = { "system": SYSTEM_SLUG, "mode": "live", + "workload": "benchmark", "status": "degraded", "last_deployed_at": None, "last_active_at": None, diff --git a/tests/test_stats.py b/tests/test_stats.py index df001fb..b28bf02 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -60,6 +60,7 @@ def test_live_operational_from_history(self) -> None: response = stats._build_response() self.assertEqual(response["mode"], "live") + self.assertEqual(response["workload"], "benchmark") self.assertEqual(response["status"], "operational") self.assertEqual(response["schema_version"], 1) self.assertEqual(set(response["metrics"]), _TIER_A_METRICS) From a70ca560bc8750fb597e1dd32009f5f56f1e2b7a Mon Sep 17 00:00:00 2001 From: Ignazio De Santis Date: Tue, 26 May 2026 17:13:07 +0800 Subject: [PATCH 3/3] =?UTF-8?q?ci:=20fix=20quality=20workflow=20=E2=80=94?= =?UTF-8?q?=20drop=20npm=20cache,=20use=20npm=20install?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit package-lock.json is gitignored (scaffold default) and Vercel deploys with npm install, so cache:"npm" + npm ci failed ("lock file is not found"). Match the repo install strategy so CI is green. 
--- .github/workflows/python-ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 30852a7..03a087f 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -22,7 +22,6 @@ jobs: uses: actions/setup-node@v4 with: node-version: "20" - cache: "npm" - name: Install Python package run: pip install -e . @@ -31,7 +30,7 @@ jobs: run: python -m unittest discover -s tests -p 'test_*.py' - name: Install frontend dependencies - run: npm ci + run: npm install --no-audit --no-fund - name: Type-check dashboard run: npm run type-check