From 1f5519b4f028e1cedd4c8691f948c7a0ee5d9b74 Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 19 Jun 2026 15:35:32 +1000 Subject: [PATCH 1/2] fix(eval): let server idle reaper own sandbox session lifecycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The eval harness leaked sandbox session containers. Each agent run that calls `panda execute` makes the server create a persistent session container. The provider tried to clean these up per-run by scraping session ids out of the agent transcript with regexes (_SESSION_IN_ARGS/_SESSION_IN_OUTPUT/_LIST_CMD). That scraping was broken: _SESSION_IN_OUTPUT matched the literal `session_id`, but panda actually prints `[session] (ttl: …)` on the CLI route (pkg/cli/execute.go:113) and `[session] id= ttl=… …` on the MCP route (pkg/tool/execute_python.go:209). Neither matches, so a session was only torn down if a later command reused its id in args. Every single-`execute` run leaked its container; they piled up to the max_sessions cap and the host tapped out at low concurrency. Fix: stop scraping logs and let the server's idle reaper own session lifecycle. The reaper already destroys idle session containers (pkg/sandbox/session.go: cleanupLoop -> cleanupExpired -> cleanupCallback), never reaps a session with an active execution (activeExecs guard in cleanupExpired), and bumps lastUsed on each use (unmarkExecuting) so an actively-used multi-turn session is safe. - _panda_env.py: set a short 5m sandbox.sessions.ttl so leaked single-shot sessions self-reap fast (well longer than any inter-turn gap, far shorter than the 30m default); lower max_sessions 150 -> 100. - provider.py: delete the log-scraping teardown, its regexes, the call site, the destroyed_sandbox_sessions metadata field, and the now-unused _server_base helper / yaml / urllib imports. purge_sessions (server-shutdown final sweep) is kept. --- tests/eval/promptfoo/provider.py | 56 ++------------------------------ tests/eval/scripts/_panda_env.py | 21 ++++++++---- 2 files changed, 16 insertions(+), 61 deletions(-) diff --git a/tests/eval/promptfoo/provider.py b/tests/eval/promptfoo/provider.py index d9200a06..2dfb3c47 100644 --- a/tests/eval/promptfoo/provider.py +++ b/tests/eval/promptfoo/provider.py @@ -13,14 +13,10 @@ import asyncio import json import os -import re import sys -import urllib.request sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # tests/eval on path -import yaml - from config.settings import DEFAULT_AGENT_MODEL, DEFAULT_AGENT_ROUTE from harness.subject import OpencodeSubject from harness.trace import TOOLS_MARKER @@ -87,54 +83,6 @@ def _graded_output(trace) -> str: return "\n".join(lines) -# Panda sandbox session ids are 12-char hex. We only count an id as THIS agent's if it -# appears next to a session keyword in the agent's own commands, or as a `session_id` -# in a command's response — and never from `session list` output, which shows OTHER -# agents' sessions too. -_SESSION_IN_ARGS = re.compile(r"session[^\n]{0,40}?\b([0-9a-f]{12})\b", re.IGNORECASE) -_SESSION_IN_OUTPUT = re.compile(r"session_id[\"'\s:=]{1,5}([0-9a-f]{12})", re.IGNORECASE) -_LIST_CMD = re.compile(r"session(s\b|\s+list)", re.IGNORECASE) - - -def _server_base(subject) -> str: - """The base URL of the panda server THIS run talked to: the CLI route resolves it - from PANDA_CONFIG (the harden scratch config), else the subject's mcp_url.""" - cfg_path = os.environ.get("PANDA_CONFIG", "") - if cfg_path and os.path.exists(cfg_path): - try: - with open(cfg_path) as f: - cfg = yaml.safe_load(f) or {} - base = (cfg.get("server") or {}).get("base_url") - if base: - return str(base).rstrip("/") - except Exception: # noqa: BLE001 - fall through to the default - pass - return str(subject.settings.mcp_url).rstrip("/") - - -def _teardown_agent_sessions(subject, trace) -> list[str]: - """Destroy the sandbox sessions THIS agent created/used, now that its run is over. - Per-agent by construction: only ids from this run's own transcript — concurrent - agents' sessions (or a human's) are never touched. Best-effort; already-gone is fine.""" - ids: set[str] = set() - for t in trace.tool_calls: - ids.update(_SESSION_IN_ARGS.findall(t.arguments or "")) - if not _LIST_CMD.search(t.arguments or ""): - ids.update(_SESSION_IN_OUTPUT.findall(t.output or "")) - if not ids: - return [] - base = _server_base(subject) - destroyed = [] - for sid in sorted(ids): - req = urllib.request.Request(f"{base}/api/v1/sessions/{sid}", method="DELETE") - try: - with urllib.request.urlopen(req, timeout=10): # noqa: S310 - destroyed.append(sid) - except Exception: # noqa: BLE001 - not a session / already gone - continue - return destroyed - - def call_api(prompt, options, context): cfg = (options or {}).get("config", {}) or {} # The first turn is the rendered prompt; extra turns come from the `followups` var @@ -151,7 +99,8 @@ def call_api(prompt, options, context): retried = True trace = asyncio.run(subject.run(prompts)) subject.flush() # push the run's Langfuse trace before promptfoo moves to the next - destroyed = _teardown_agent_sessions(subject, trace) # this agent's sandboxes only + # Sandbox sessions are reaped server-side by the idle TTL (pkg/sandbox/session.go); + # no per-run teardown here. A leaked single-shot session self-reaps once idle. return { "output": _graded_output(trace), "tokenUsage": { @@ -170,7 +119,6 @@ def call_api(prompt, options, context): "trace_id": trace.trace_id, "trace_url": subject.trace_url(trace.trace_id), "session_id": subject.session_id, - "destroyed_sandbox_sessions": destroyed, "retried": retried, "input_tokens": trace.input_tokens, "output_tokens": trace.output_tokens, diff --git a/tests/eval/scripts/_panda_env.py b/tests/eval/scripts/_panda_env.py index 98c5a8ce..33392346 100644 --- a/tests/eval/scripts/_panda_env.py +++ b/tests/eval/scripts/_panda_env.py @@ -55,9 +55,16 @@ def write_scratch_config(port: int, *, source: Path | None = None) -> Path: sb["network"] = "ethpandaops-panda-harden" sb["host_shared_path"] = str(shared) # Eval runs many agents at once (questions x phrasings x subjects, -j concurrent); - # the default 50-session cap can starve them. The sessions are torn down after - # every measure (purge_sessions), so a high cap costs nothing when idle. - sb.setdefault("sessions", {})["max_sessions"] = 150 + # the server's own idle reaper (pkg/sandbox/session.go) owns session lifecycle, so a + # leaked single-shot session self-reaps once it's idle past the TTL. A short 5m TTL + # keeps leaked containers from piling up; it's comfortably longer than any inter-turn + # gap in a multi-turn run (an actively-used session's lastUsed is bumped per call and + # is never reaped mid-execution), yet far shorter than the 30m default. With reaping + # the cap no longer needs to be huge — 100 sits well above run concurrency (~24) plus + # a TTL-window backlog. + sessions = sb.setdefault("sessions", {}) + sessions["ttl"] = "5m" + sessions["max_sessions"] = 100 cfg["storage"] = {"base_dir": str(storage), "cache_dir": str(cache)} cfg.setdefault("observability", {})["metrics_enabled"] = False @@ -122,10 +129,10 @@ def _wait_ready(self) -> None: def purge_sessions(port: int) -> int: """Destroy all remaining sandbox sessions on the SCRATCH server, called only as it - shuts down — its containers would otherwise outlive it as orphans (per-agent - teardown in the provider handles the during-run case; this is the final sweep of a - dying server, never a server anyone else uses). Best-effort: a dead/unreachable - server just means nothing to purge.""" + shuts down — its containers would otherwise outlive it as orphans (the server's idle + reaper handles the during-run case via TTL; this is the final sweep of a dying server, + never a server anyone else uses). Best-effort: a dead/unreachable server just means + nothing to purge.""" base = f"http://localhost:{port}/api/v1/sessions" try: with urllib.request.urlopen(base, timeout=10) as r: # noqa: S310 From f65384b4b62188aa8cbb6cdce75723e196f1e74a Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 19 Jun 2026 16:31:16 +1000 Subject: [PATCH 2/2] fix(eval): deterministic per-test sandbox cleanup via owner attribution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The eval leaked sandbox session containers. The prior fix leaned on the server's idle TTL reaper alone — lazy GC that lets a finished test's sandboxes linger up to the TTL. Replace that with deterministic per-test cleanup, done server-authoritatively (not by scraping the agent log). Sessions are already owner-scoped server-side (CreateSession/ListSessions/ DestroySession key on an owner id). The owner is authOwnerID(r): the GitHub user id, or "" when unauthenticated — so an unauthenticated server (local mode, the eval) couldn't ask "which sessions are mine". - server: authOwnerID falls back to the caller attribution (X-Panda-On-Behalf-Of) when there's no auth user. Backward-compatible (authed -> GitHubID; unauth+attribution -> the value; neither -> "") and a genuine improvement: caller-scoped sessions in local mode. - eval: mint a unique owner id per worker process (set into PANDA_ON_BEHALF_OF at import so the host serve inherits it; passed -e into the docker serve). Each worker's panda CLI calls now carry that attribution. - eval: after each test, GET /api/v1/sessions then DELETE each, both carrying the worker's attribution header. Workers run tests serially, so this tears down exactly the just-finished test's sandboxes. Returns destroyed_sandbox_sessions for debugging. - the short 5m TTL + max_sessions stay as a BACKSTOP for crashed/missed workers; comments updated to make primary-vs-backstop clear. Adds a unit test for authOwnerID's attribution fallback. --- pkg/server/api.go | 10 ++-- pkg/server/runtime_attribution_test.go | 12 +++++ tests/eval/agent/opencode_agent.py | 17 +++++++ tests/eval/promptfoo/provider.py | 64 +++++++++++++++++++++++++- tests/eval/scripts/_panda_env.py | 17 +++---- 5 files changed, 106 insertions(+), 14 deletions(-) diff --git a/pkg/server/api.go b/pkg/server/api.go index 8d412c77..f87195a2 100644 --- a/pkg/server/api.go +++ b/pkg/server/api.go @@ -883,12 +883,14 @@ func runtimeExecutionID(ctx context.Context) string { } func authOwnerID(r *http.Request) string { - user := auth.GetAuthUser(r.Context()) - if user == nil { - return "" + if user := auth.GetAuthUser(r.Context()); user != nil { + return fmt.Sprintf("%d", user.GitHubID) } - return fmt.Sprintf("%d", user.GitHubID) + // Fall back to caller attribution so sessions are owner-scoped even when + // the server runs unauthenticated (e.g. local mode, the eval harness). + // Empty when no attribution is present, preserving prior behavior. + return attribution.FromContext(r.Context()) } func parseOptionalInt(r *http.Request, key string) (int, error) { diff --git a/pkg/server/runtime_attribution_test.go b/pkg/server/runtime_attribution_test.go index 6bbd6bc5..18fd3e33 100644 --- a/pkg/server/runtime_attribution_test.go +++ b/pkg/server/runtime_attribution_test.go @@ -93,3 +93,15 @@ func TestRuntimeCallbacksInheritExecutionAttribution(t *testing.T) { require.Equal(t, "discord:sam", inherited, "runtime callbacks must inherit the spawning execution's attribution") } + +func TestAuthOwnerIDAttributionFallback(t *testing.T) { + // No auth user: fall back to the caller attribution so sessions are owner-scoped + // even on an unauthenticated server (local mode, the eval harness). + withAttr := httptest.NewRequest(http.MethodGet, "/api/v1/sessions", nil) + withAttr = withAttr.WithContext(attribution.WithValue(withAttr.Context(), "eval-worker-abc123")) + require.Equal(t, "eval-worker-abc123", authOwnerID(withAttr)) + + // Neither auth user nor attribution: empty owner, preserving prior behavior. + bare := httptest.NewRequest(http.MethodGet, "/api/v1/sessions", nil) + require.Equal(t, "", authOwnerID(bare)) +} diff --git a/tests/eval/agent/opencode_agent.py b/tests/eval/agent/opencode_agent.py index d979f740..9c3b6193 100644 --- a/tests/eval/agent/opencode_agent.py +++ b/tests/eval/agent/opencode_agent.py @@ -41,6 +41,20 @@ # Langfuse session. In CI that's the GitHub run id; locally a per-process uuid. _LANGFUSE_SESSION_ID = os.environ.get("GITHUB_RUN_ID") or uuid.uuid4().hex +# Caller-attribution env var the panda CLI forwards as the X-Panda-On-Behalf-Of +# header (pkg/attribution/attribution.go: const EnvVar). The server scopes sandbox +# sessions to this value (authOwnerID falls back to it when unauthenticated), so a +# per-worker identity lets the eval tear down exactly this worker's sessions. +_ATTRIBUTION_ENVVAR = "PANDA_ON_BEHALF_OF" + +# A unique owner id for THIS worker process. Each promptfoo worker runs its tests +# serially against its own opencode serve, so "this worker's sessions" == "the +# just-finished test's sessions". Set into the environment at import so the host +# serve inherits it; the docker serve is passed it explicitly (see _docker_serve). +# An externally-provided attribution (a real chat-agent test) is honored if set. +os.environ.setdefault(_ATTRIBUTION_ENVVAR, f"eval-worker-{uuid.uuid4().hex[:12]}") +EVAL_WORKER_OWNER_ID = os.environ[_ATTRIBUTION_ENVVAR] + SYSTEM_PROMPT_MCP = "You are an ethpandaops agent. You have access to panda via its MCP tools." SYSTEM_PROMPT_CLI = "You are an ethpandaops agent. You have access to the panda CLI." @@ -307,6 +321,9 @@ def _docker_serve( "-p", f"127.0.0.1:{port}:{port}", "--add-host=host.docker.internal:host-gateway", "-e", "OPENCODE_GO_API_KEY", # pass through (value stays out of the arg list) + # Caller attribution so the panda CLI inside carries X-Panda-On-Behalf-Of; + # the value is in os.environ (set at import), passed through by name. + "-e", _ATTRIBUTION_ENVVAR, "-v", f"{panda_dir}:/opt/pandabin:ro", "-v", f"{workdir / 'opencode.json'}:/work/opencode.json:ro", "-v", f"{workdir / 'auth.json'}:/root/.local/share/opencode/auth.json", diff --git a/tests/eval/promptfoo/provider.py b/tests/eval/promptfoo/provider.py index 2dfb3c47..9672fc3a 100644 --- a/tests/eval/promptfoo/provider.py +++ b/tests/eval/promptfoo/provider.py @@ -14,13 +14,22 @@ import json import os import sys +import urllib.request + +import yaml sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # tests/eval on path +from agent.opencode_agent import EVAL_WORKER_OWNER_ID from config.settings import DEFAULT_AGENT_MODEL, DEFAULT_AGENT_ROUTE from harness.subject import OpencodeSubject from harness.trace import TOOLS_MARKER +# The HTTP header the panda CLI forwards the attribution env var as +# (pkg/attribution/attribution.go: const Header). The server reads it to scope +# sandbox sessions to the caller, so teardown carries it to find + delete ours. +_ATTRIBUTION_HEADER = "X-Panda-On-Behalf-Of" + _SUBJECTS: dict[tuple, OpencodeSubject] = {} @@ -33,6 +42,54 @@ def _subject(cfg: dict) -> OpencodeSubject: return _SUBJECTS[key] +def _server_base(subject) -> str: + """The base URL of the panda server THIS run talked to: the CLI route resolves it + from PANDA_CONFIG (the harden scratch config), else the subject's mcp_url.""" + cfg_path = os.environ.get("PANDA_CONFIG", "") + if cfg_path and os.path.exists(cfg_path): + try: + with open(cfg_path) as f: + cfg = yaml.safe_load(f) or {} + base = (cfg.get("server") or {}).get("base_url") + if base: + return str(base).rstrip("/") + except Exception: # noqa: BLE001 - fall through to the default + pass + return str(subject.settings.mcp_url).rstrip("/") + + +def _teardown_worker_sessions(subject) -> list[str]: + """Destroy the sandbox sessions owned by THIS worker, now its just-finished test is + done. Deterministic and server-authoritative: the agent's panda CLI carries + X-Panda-On-Behalf-Of=, so the server scopes those sessions to this worker + (authOwnerID's attribution fallback). We ask the server for this worker's sessions and + delete them, carrying the same header so DestroySession's owner check passes. Tests run + serially within a worker, so this targets exactly the test that just finished; the short + TTL reaper is only a backstop for crashed/missed workers. Best-effort throughout.""" + base = _server_base(subject) + headers = {_ATTRIBUTION_HEADER: EVAL_WORKER_OWNER_ID} + try: + req = urllib.request.Request(f"{base}/api/v1/sessions", headers=headers) # noqa: S310 + with urllib.request.urlopen(req, timeout=10) as r: # noqa: S310 + sessions = (json.loads(r.read().decode()) or {}).get("sessions") or [] + except Exception: # noqa: BLE001 - server gone / unreachable -> nothing to tear down + return [] + destroyed: list[str] = [] + for s in sessions: + sid = s.get("session_id") + if not sid: + continue + req = urllib.request.Request( # noqa: S310 + f"{base}/api/v1/sessions/{sid}", method="DELETE", headers=headers + ) + try: + with urllib.request.urlopen(req, timeout=10): # noqa: S310 + destroyed.append(sid) + except Exception: # noqa: BLE001 - already gone / not ours is fine + continue + return destroyed + + def _followups(vars_: dict) -> list: """Decode the `followups` var. It's JSON-encoded by the config (so promptfoo doesn't expand a raw list into a test matrix); tolerate a raw list too, for safety.""" @@ -99,8 +156,10 @@ def call_api(prompt, options, context): retried = True trace = asyncio.run(subject.run(prompts)) subject.flush() # push the run's Langfuse trace before promptfoo moves to the next - # Sandbox sessions are reaped server-side by the idle TTL (pkg/sandbox/session.go); - # no per-run teardown here. A leaked single-shot session self-reaps once idle. + # Primary cleanup: deterministically tear down THIS worker's sandbox sessions now the + # test is done (owner-scoped via attribution, server-authoritative). The server's short + # idle TTL (pkg/sandbox/session.go) is only a backstop for crashed/missed workers. + destroyed = _teardown_worker_sessions(subject) return { "output": _graded_output(trace), "tokenUsage": { @@ -119,6 +178,7 @@ def call_api(prompt, options, context): "trace_id": trace.trace_id, "trace_url": subject.trace_url(trace.trace_id), "session_id": subject.session_id, + "destroyed_sandbox_sessions": destroyed, "retried": retried, "input_tokens": trace.input_tokens, "output_tokens": trace.output_tokens, diff --git a/tests/eval/scripts/_panda_env.py b/tests/eval/scripts/_panda_env.py index 33392346..5a7727e3 100644 --- a/tests/eval/scripts/_panda_env.py +++ b/tests/eval/scripts/_panda_env.py @@ -54,14 +54,15 @@ def write_scratch_config(port: int, *, source: Path | None = None) -> Path: sb["image"] = "ethpandaops-panda-sandbox:latest" sb["network"] = "ethpandaops-panda-harden" sb["host_shared_path"] = str(shared) - # Eval runs many agents at once (questions x phrasings x subjects, -j concurrent); - # the server's own idle reaper (pkg/sandbox/session.go) owns session lifecycle, so a - # leaked single-shot session self-reaps once it's idle past the TTL. A short 5m TTL - # keeps leaked containers from piling up; it's comfortably longer than any inter-turn - # gap in a multi-turn run (an actively-used session's lastUsed is bumped per call and - # is never reaped mid-execution), yet far shorter than the 30m default. With reaping - # the cap no longer needs to be huge — 100 sits well above run concurrency (~24) plus - # a TTL-window backlog. + # Eval runs many agents at once (questions x phrasings x subjects, -j concurrent). + # Primary cleanup is per-test and owner-scoped (provider.py tears down each worker's + # sessions when its test finishes). This short TTL is only a BACKSTOP for crashed or + # missed workers: the server's idle reaper (pkg/sandbox/session.go) self-reaps a leaked + # session once it's idle past the TTL. 5m is comfortably longer than any inter-turn gap + # in a multi-turn run (an actively-used session's lastUsed is bumped per call and is + # never reaped mid-execution), yet far shorter than the 30m default. With both the + # per-test delete and the reaper, the cap no longer needs to be huge — 100 sits well + # above run concurrency (~24) plus a TTL-window backlog. sessions = sb.setdefault("sessions", {}) sessions["ttl"] = "5m" sessions["max_sessions"] = 100