From 41369cfda90ed9651b94acf4cbe0f9deade44994 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Cheema Date: Sun, 12 Apr 2026 20:45:08 -0700 Subject: [PATCH 1/2] feat: playwright self-validation and general browser capability Adds two new MCP tool servers that share one Chromium instance and one BrowserContext per query: - phantom_preview_page: one-call self-validation for /ui/ pages. Navigates headless Chromium, captures a full-page PNG, and bundles HTTP status, title, console messages, and failed network requests alongside the screenshot. Stubs window.EventSource in an init script so Phantom's live-reload SSE wiring cannot hold the page open past 'load'. - phantom-browser: embeds the full 21-tool @playwright/mcp surface in process via createConnection with a contextGetter, giving the agent browser_navigate, browser_click, browser_snapshot, browser_run_code, and the rest out of the box. Both share one Browser singleton launched lazily on first use and one BrowserContext minted per query with a short-lived phantom_session cookie (10 minute TTL, domain: localhost). The browser and context are torn down on SIGTERM via the existing graceful-shutdown registry. The contextGetter path is load-bearing: the naive createConnection hangs under Bun on Linux amd64 (traced across five probes in research 01b). browser.isolated: false is mandatory because SimpleBrowser.newContext() throws; the embed defers to our shared context instead. mcpServerFactories now accepts async factories. The per-query resolution path in AgentRuntime uses Promise.all so sync factories pass through as resolved promises without behavior change. Agent prompt gains two new guidance blocks: always validate after phantom_create_page, and prefer phantom_preview_page for self-validation while using browser_* directly for multi-step browsing. Dockerfile installs the --only-shell chromium build (~75 MiB saved over the full binary) with PLAYWRIGHT_BROWSERS_PATH set to a phantom-owned cache directory. playwright@1.59.1 and @playwright/mcp@0.0.70 are pinned exactly because contextGetter and the capability surface have shifted between point releases already. Research: local/2026-04-12-phantom-ui-chapter/research/01-playwright-self-validation.md Research: local/2026-04-12-phantom-ui-chapter/research/01b-playwright-expanded/implementation-plan.md Research: local/2026-04-12-phantom-ui-chapter/research/01b-playwright-expanded/findings.md Test count: 931 -> 940 pass (+9) plus 6 opt-in integration tests skipped by default. Zero regressions. lint and typecheck clean. Docker image delta: +996 MiB (914 MiB -> 1.91 GiB). --- Dockerfile | 12 + bun.lock | 14 ++ package.json | 2 + src/agent/prompt-assembler.ts | 25 ++ src/agent/runtime.ts | 10 +- src/index.ts | 9 +- .../__tests__/browser-mcp.integration.test.ts | 33 +++ src/ui/__tests__/browser-mcp.test.ts | 37 +++ src/ui/__tests__/preview.integration.test.ts | 63 +++++ src/ui/__tests__/preview.test.ts | 58 +++++ src/ui/browser-mcp.ts | 70 ++++++ src/ui/preview.ts | 218 ++++++++++++++++++ src/ui/session.ts | 17 ++ 13 files changed, 564 insertions(+), 4 deletions(-) create mode 100644 src/ui/__tests__/browser-mcp.integration.test.ts create mode 100644 src/ui/__tests__/browser-mcp.test.ts create mode 100644 src/ui/__tests__/preview.integration.test.ts create mode 100644 src/ui/__tests__/preview.test.ts create mode 100644 src/ui/browser-mcp.ts create mode 100644 src/ui/preview.ts diff --git a/Dockerfile b/Dockerfile index f4a8e5a..be03408 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,6 +75,18 @@ COPY --from=builder /app/public ./public COPY --from=builder /app/package.json ./ COPY --from=builder /app/tsconfig.json ./ +# Install Chromium headless shell + system deps for Playwright. +# Must run after node_modules is copied so bunx can resolve playwright. +# --only-shell skips the full Chromium binary (saves ~75 MiB); the custom +# phantom_preview_page tool uses chromium.launch() which picks the shell +# automatically for headless=true. The @playwright/mcp embed path uses a +# contextGetter so it never needs the full chrome channel binary. +ENV PLAYWRIGHT_BROWSERS_PATH=/home/phantom/.cache/ms-playwright +RUN mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" && \ + bunx playwright install --with-deps --only-shell chromium && \ + chown -R phantom:phantom /home/phantom/.cache && \ + rm -rf /var/lib/apt/lists/* + # Copy default phantom-config (constitution.md, persona.md, etc.) # These get backed up so they survive the empty volume mount on first run. COPY --from=builder /app/phantom-config ./phantom-config diff --git a/bun.lock b/bun.lock index 705f903..5b4f3f1 100644 --- a/bun.lock +++ b/bun.lock @@ -7,10 +7,12 @@ "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.77", "@modelcontextprotocol/sdk": "^1.28.0", + "@playwright/mcp": "0.0.70", "@slack/bolt": "^4.6.0", "croner": "^10.0.1", "imapflow": "^1.2.18", "nodemailer": "^8.0.4", + "playwright": "1.59.1", "resend": "^6.9.4", "telegraf": "^4.16.3", "yaml": "^2.6.0", @@ -83,6 +85,8 @@ "@pinojs/redact": ["@pinojs/redact@0.4.0", "", {}, "sha512-k2ENnmBugE/rzQfEcdWHcCY+/FM3VLzH9cYEsbdsoqrvzAKRhUZeRNhAZvB8OitQJ1TBed3yqWtdjzS6wJKBwg=="], + "@playwright/mcp": ["@playwright/mcp@0.0.70", "", { "dependencies": { "playwright": "1.60.0-alpha-1774999321000", "playwright-core": "1.60.0-alpha-1774999321000" }, "bin": { "playwright-mcp": "cli.js" } }, "sha512-Kl0a6l9VL8rvT1oBou3hS5yArjwWV9UlwAkq+0skfK1YVg8XfmmNaAmwZhMeNx/ZhGiWXfCllo6rD/jvZz+WuA=="], + "@slack/bolt": ["@slack/bolt@4.6.0", "", { "dependencies": { "@slack/logger": "^4.0.0", "@slack/oauth": "^3.0.4", "@slack/socket-mode": "^2.0.5", "@slack/types": "^2.18.0", "@slack/web-api": "^7.12.0", "axios": "^1.12.0", "express": "^5.0.0", "path-to-regexp": "^8.1.0", "raw-body": "^3", "tsscmp": "^1.0.6" }, "peerDependencies": { "@types/express": "^5.0.0" } }, "sha512-xPgfUs2+OXSugz54Ky07pA890+Qydk22SYToi8uGpXeHSt1JWwFJkRyd/9Vlg5I1AdfdpGXExDpwnbuN9Q/2dQ=="], "@slack/logger": ["@slack/logger@4.0.1", "", { "dependencies": { "@types/node": ">=18" } }, "sha512-6cmdPrV/RYfd2U0mDGiMK8S7OJqpCTm7enMLRR3edccsPX8j7zXTLnaEF4fhxxJJTAIOil6+qZrnUPTuaLvwrQ=="], @@ -237,6 +241,8 @@ "fresh": ["fresh@2.0.0", "", {}, "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A=="], + "fsevents": ["fsevents@2.3.2", "", { "os": "darwin" }, "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA=="], + "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], @@ -357,6 +363,10 @@ "pkce-challenge": ["pkce-challenge@5.0.1", "", {}, "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ=="], + "playwright": ["playwright@1.59.1", "", { "dependencies": { "playwright-core": "1.59.1" }, "optionalDependencies": { "fsevents": "2.3.2" }, "bin": { "playwright": "cli.js" } }, "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw=="], + + "playwright-core": ["playwright-core@1.60.0-alpha-1774999321000", "", { "bin": { "playwright-core": "cli.js" } }, "sha512-ams3Zo4VXxeOg5ZTTh16GkE8g48Bmxo/9pg9gXl9SVKlVohCU7Jaog7XntY8yFuzENA6dJc1Fz7Z/NNTm9nGEw=="], + "postal-mime": ["postal-mime@2.7.3", "", {}, "sha512-MjhXadAJaWgYzevi46+3kLak8y6gbg0ku14O1gO/LNOuay8dO+1PtcSGvAdgDR0DoIsSaiIA8y/Ddw6MnrO0Tw=="], "process-warning": ["process-warning@5.0.0", "", {}, "sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA=="], @@ -465,6 +475,8 @@ "zod-to-json-schema": ["zod-to-json-schema@3.25.1", "", { "peerDependencies": { "zod": "^3.25 || ^4" } }, "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA=="], + "@playwright/mcp/playwright": ["playwright@1.60.0-alpha-1774999321000", "", { "dependencies": { "playwright-core": "1.60.0-alpha-1774999321000" }, "optionalDependencies": { "fsevents": "2.3.2" }, "bin": { "playwright": "cli.js" } }, "sha512-Bd5DkzYKG+2g1jLO6NeTXmGLbBYSFffJIOsR4l4hUBkJvzvGGdLZ7jZb2tOtb0WIoWXQKdQj3Ap6WthV4DBS8w=="], + "form-data/mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], "libmime/iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], @@ -473,6 +485,8 @@ "p-queue/p-timeout": ["p-timeout@3.2.0", "", { "dependencies": { "p-finally": "^1.0.0" } }, "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg=="], + "playwright/playwright-core": ["playwright-core@1.59.1", "", { "bin": { "playwright-core": "cli.js" } }, "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg=="], + "form-data/mime-types/mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], } } diff --git a/package.json b/package.json index b80234c..eae34fd 100644 --- a/package.json +++ b/package.json @@ -18,10 +18,12 @@ "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.77", "@modelcontextprotocol/sdk": "^1.28.0", + "@playwright/mcp": "0.0.70", "@slack/bolt": "^4.6.0", "croner": "^10.0.1", "imapflow": "^1.2.18", "nodemailer": "^8.0.4", + "playwright": "1.59.1", "resend": "^6.9.4", "telegraf": "^4.16.3", "yaml": "^2.6.0", diff --git a/src/agent/prompt-assembler.ts b/src/agent/prompt-assembler.ts index 09197e1..69ef5e8 100644 --- a/src/agent/prompt-assembler.ts +++ b/src/agent/prompt-assembler.ts @@ -170,6 +170,31 @@ function buildEnvironment(config: PhantomConfig): string { lines.push(`- Pages are at ${publicUrl}/ui/`); } lines.push(""); + lines.push("SELF-VALIDATE EVERY UI PAGE YOU CREATE."); + lines.push("After phantom_create_page succeeds, always call phantom_preview_page with"); + lines.push("the same path. Review the screenshot, the HTTP status, the page title,"); + lines.push("and especially the console messages and failed network requests list."); + lines.push("If there are console errors, failed CDN loads, or the screenshot looks"); + lines.push("wrong, fix the HTML and re-run phantom_preview_page until clean. Only"); + lines.push("report the page to the user after validation passes."); + lines.push("The tool returns one image block plus a JSON metadata block. The image"); + lines.push("is for visual review, the JSON tells you what failed to load or error."); + lines.push(""); + lines.push("GENERAL BROWSER CAPABILITY."); + lines.push("You have access to the full Playwright MCP tool surface via the"); + lines.push("phantom-browser server. These tools share one Chromium instance with"); + lines.push("phantom_preview_page. Use browser_navigate to open any URL (localhost"); + lines.push("or external), browser_snapshot for structured accessibility text,"); + lines.push("browser_take_screenshot for pixel captures, browser_click/browser_type/"); + lines.push("browser_fill_form for interaction, browser_console_messages and"); + lines.push("browser_network_requests for debugging, browser_tabs for multi-page work."); + lines.push("For single-shot self-validation of your own /ui/ pages, always"); + lines.push("prefer phantom_preview_page: one call returns image plus JSON."); + lines.push("For multi-step browsing, research tasks, or external sites, use the"); + lines.push("browser_* tools directly."); + lines.push("Do NOT use browser_run_code against external pages unless the user"); + lines.push("explicitly asked you to execute code in a foreign origin."); + lines.push(""); lines.push("When you build something that others should access, you have two options:"); lines.push("1. Create an HTTP API on a local port. Give the user the internal URL and auth token."); lines.push( diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts index 5e08cf0..807efb2 100644 --- a/src/agent/runtime.ts +++ b/src/agent/runtime.ts @@ -31,7 +31,7 @@ export class AgentRuntime { private roleTemplate: RoleTemplate | null = null; private onboardingPrompt: string | null = null; private lastTrackedFiles: string[] = []; - private mcpServerFactories: Record McpServerConfig> | null = null; + private mcpServerFactories: Record McpServerConfig | Promise> | null = null; constructor(config: PhantomConfig, db: Database) { this.config = config; @@ -55,7 +55,7 @@ export class AgentRuntime { this.onboardingPrompt = prompt; } - setMcpServerFactories(factories: Record McpServerConfig>): void { + setMcpServerFactories(factories: Record McpServerConfig | Promise>): void { this.mcpServerFactories = factories; } @@ -208,7 +208,11 @@ export class AgentRuntime { ...(useResume && session.sdk_session_id ? { resume: session.sdk_session_id } : {}), ...(this.mcpServerFactories ? { - mcpServers: Object.fromEntries(Object.entries(this.mcpServerFactories).map(([k, f]) => [k, f()])), + mcpServers: Object.fromEntries( + await Promise.all( + Object.entries(this.mcpServerFactories).map(async ([k, f]) => [k, await f()] as const), + ), + ), } : {}), }, diff --git a/src/index.ts b/src/index.ts index c3609c4..1bb4801 100644 --- a/src/index.ts +++ b/src/index.ts @@ -51,6 +51,8 @@ import { Scheduler } from "./scheduler/service.ts"; import { createSchedulerToolServer } from "./scheduler/tool.ts"; import { getSecretRequest } from "./secrets/store.ts"; import { createSecretToolServer } from "./secrets/tools.ts"; +import { createBrowserToolServer } from "./ui/browser-mcp.ts"; +import { closePreviewResources, createPreviewToolServer, getOrCreatePreviewContext } from "./ui/preview.ts"; import { setPublicDir, setSecretSavedCallback, setSecretsDb } from "./ui/serve.ts"; import { createWebUiToolServer } from "./ui/tools.ts"; @@ -191,6 +193,8 @@ async function main(): Promise { "phantom-scheduler": () => createSchedulerToolServer(scheduler as Scheduler), "phantom-web-ui": () => createWebUiToolServer(config.public_url), "phantom-secrets": () => createSecretToolServer({ db, baseUrl: secretsBaseUrl }), + "phantom-preview": () => createPreviewToolServer(config.port), + "phantom-browser": () => createBrowserToolServer(() => getOrCreatePreviewContext()), ...(process.env.RESEND_API_KEY ? { "phantom-email": () => @@ -204,7 +208,7 @@ async function main(): Promise { }); const emailStatus = process.env.RESEND_API_KEY ? " + email" : ""; console.log( - `[mcp] MCP server initialized (dynamic tools + scheduler + web UI + secrets${emailStatus} wired to agent)`, + `[mcp] MCP server initialized (dynamic tools + scheduler + web UI + secrets + preview + browser${emailStatus} wired to agent)`, ); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); @@ -580,6 +584,9 @@ async function main(): Promise { onShutdown("Scheduler", async () => { if (scheduler) scheduler.stop(); }); + onShutdown("Preview browser", async () => { + await closePreviewResources(); + }); onShutdown("Peer health monitor", async () => { if (peerHealthMonitor) peerHealthMonitor.stop(); }); diff --git a/src/ui/__tests__/browser-mcp.integration.test.ts b/src/ui/__tests__/browser-mcp.integration.test.ts new file mode 100644 index 0000000..f507bee --- /dev/null +++ b/src/ui/__tests__/browser-mcp.integration.test.ts @@ -0,0 +1,33 @@ +// Integration tests for createBrowserToolServer. These exercise the real +// @playwright/mcp embed with a real BrowserContext. Opt-in: +// +// PHANTOM_INTEGRATION=1 bun test src/ui/__tests__/browser-mcp.integration.test.ts +// +// Skipped by default so `bun test` stays hermetic. + +import { afterAll, describe, expect, test } from "bun:test"; +import { createBrowserToolServer } from "../browser-mcp.ts"; +import { closePreviewResources, getOrCreatePreviewContext } from "../preview.ts"; +import { revokeAllSessions } from "../session.ts"; + +const ENABLED = process.env.PHANTOM_INTEGRATION === "1"; +const suite = ENABLED ? describe : describe.skip; + +suite("createBrowserToolServer (integration)", () => { + afterAll(async () => { + await closePreviewResources(); + revokeAllSessions(); + }); + + test("builds an embed server wired to a shared BrowserContext", async () => { + const embed = await createBrowserToolServer(() => getOrCreatePreviewContext()); + expect(embed.type).toBe("sdk"); + expect(embed.name).toBe("phantom-browser"); + const inst = embed.instance as unknown as { + connect: unknown; + close: () => Promise; + }; + expect(typeof inst.connect).toBe("function"); + await inst.close(); + }); +}); diff --git a/src/ui/__tests__/browser-mcp.test.ts b/src/ui/__tests__/browser-mcp.test.ts new file mode 100644 index 0000000..11c2e41 --- /dev/null +++ b/src/ui/__tests__/browser-mcp.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, test } from "bun:test"; +import type { BrowserContext } from "playwright"; +import { createBrowserToolServer } from "../browser-mcp.ts"; + +// The real @playwright/mcp createConnection is lazy: it wires a backend +// factory that will call the contextGetter only when a client actually +// requests a tool. Constructing the embed does not require a live +// BrowserContext, so these tests never touch Chromium. +function fakeContextGetter(): Promise { + return Promise.reject(new Error("contextGetter should not run in unit tests")); +} + +describe("createBrowserToolServer", () => { + test("returns an SDK MCP server config with the phantom-browser name", async () => { + const config = await createBrowserToolServer(fakeContextGetter); + expect(config.type).toBe("sdk"); + expect(config.name).toBe("phantom-browser"); + expect(config.instance).toBeDefined(); + }); + + test("instance exposes the MCP connect() contract used by the Agent SDK", async () => { + const config = await createBrowserToolServer(fakeContextGetter); + const inst = config.instance as unknown as { connect: unknown; close: unknown }; + expect(typeof inst.connect).toBe("function"); + expect(typeof inst.close).toBe("function"); + }); + + test("each call returns a distinct underlying Server instance", async () => { + const a = await createBrowserToolServer(fakeContextGetter); + const b = await createBrowserToolServer(fakeContextGetter); + // Factory pattern: the phantom-browser wrapper must be fresh per query. + // If the same instance leaks across calls the SDK will throw "Already + // connected to a transport" on the second run. See src/index.ts for + // the cardinal rule citation. + expect(a.instance).not.toBe(b.instance); + }); +}); diff --git a/src/ui/__tests__/preview.integration.test.ts b/src/ui/__tests__/preview.integration.test.ts new file mode 100644 index 0000000..d2287c6 --- /dev/null +++ b/src/ui/__tests__/preview.integration.test.ts @@ -0,0 +1,63 @@ +// Integration tests for phantom_preview_page. These launch a real Chromium +// (headless shell) and talk to a local Bun.serve, so they are opt-in. Run: +// +// PHANTOM_INTEGRATION=1 bun test src/ui/__tests__/preview.integration.test.ts +// +// They are skipped by default so `bun test` stays fast and hermetic. + +import { afterAll, beforeAll, describe, expect, test } from "bun:test"; +import { closePreviewResources, getOrCreatePreviewContext } from "../preview.ts"; +import { revokeAllSessions } from "../session.ts"; + +const ENABLED = process.env.PHANTOM_INTEGRATION === "1"; +const suite = ENABLED ? describe : describe.skip; + +suite("phantom_preview_page (integration)", () => { + let server: ReturnType | null = null; + let port = 0; + + beforeAll(() => { + server = Bun.serve({ + port: 0, + fetch(req) { + const url = new URL(req.url); + if (url.pathname === "/ui/test.html") { + return new Response( + "Preview Integration" + + "" + + "

Hello

", + { headers: { "content-type": "text/html" } }, + ); + } + return new Response("not found", { status: 404 }); + }, + }); + port = server.port ?? 0; + }); + + afterAll(async () => { + await closePreviewResources(); + revokeAllSessions(); + server?.stop(true); + }); + + test("navigates and screenshots a live page", async () => { + const ctx = await getOrCreatePreviewContext(); + const page = await ctx.newPage(); + try { + const response = await page.goto(`http://localhost:${port}/ui/test.html`); + expect(response?.status()).toBe(200); + expect(await page.title()).toBe("Preview Integration"); + const shot = await page.screenshot({ type: "png" }); + expect(shot.length).toBeGreaterThan(100); + } finally { + await page.close(); + } + }); + + test("two preview calls share the same BrowserContext", async () => { + const a = await getOrCreatePreviewContext(); + const b = await getOrCreatePreviewContext(); + expect(a).toBe(b); + }); +}); diff --git a/src/ui/__tests__/preview.test.ts b/src/ui/__tests__/preview.test.ts new file mode 100644 index 0000000..ac0b6f4 --- /dev/null +++ b/src/ui/__tests__/preview.test.ts @@ -0,0 +1,58 @@ +import { afterEach, describe, expect, test } from "bun:test"; +import { createPreviewToolServer } from "../preview.ts"; +import { createPreviewSession, isValidSession, revokeAllSessions } from "../session.ts"; + +afterEach(() => { + revokeAllSessions(); +}); + +describe("createPreviewSession", () => { + test("returns a token accepted by isValidSession", () => { + const { sessionToken } = createPreviewSession(); + expect(typeof sessionToken).toBe("string"); + expect(sessionToken.length).toBeGreaterThan(20); + expect(isValidSession(sessionToken)).toBe(true); + }); + + test("tokens are unique per call", () => { + const a = createPreviewSession().sessionToken; + const b = createPreviewSession().sessionToken; + expect(a).not.toBe(b); + }); + + test("token expires after the 10 minute TTL", () => { + const { sessionToken } = createPreviewSession(); + expect(isValidSession(sessionToken)).toBe(true); + + const originalNow = Date.now; + try { + // Eleven minutes later, the session should have expired. + Date.now = () => originalNow() + 11 * 60 * 1000; + expect(isValidSession(sessionToken)).toBe(false); + } finally { + Date.now = originalNow; + } + }); +}); + +describe("createPreviewToolServer", () => { + test("returns an SDK MCP server config with a name", () => { + const server = createPreviewToolServer(3100); + expect(server).toBeDefined(); + expect(server.type).toBe("sdk"); + expect(server.name).toBe("phantom-preview"); + expect(server.instance).toBeDefined(); + }); + + test("each call returns a distinct instance (factory pattern)", () => { + const a = createPreviewToolServer(3100); + const b = createPreviewToolServer(3100); + expect(a.instance).not.toBe(b.instance); + }); + + test("instance exposes the MCP connect() contract", () => { + const server = createPreviewToolServer(3100); + const inst = server.instance as unknown as { connect: unknown }; + expect(typeof inst.connect).toBe("function"); + }); +}); diff --git a/src/ui/browser-mcp.ts b/src/ui/browser-mcp.ts new file mode 100644 index 0000000..223aae0 --- /dev/null +++ b/src/ui/browser-mcp.ts @@ -0,0 +1,70 @@ +// Embed factory for the first-party `@playwright/mcp` 21-tool surface. We +// host it in-process via `createConnection(config, contextGetter)` rather +// than spawning it as a stdio subprocess: the subprocess path hangs on +// Linux amd64 under Bun (see research 01b, five-probe trace in findings +// Section 3), and the in-process path lets us share one Chromium Browser +// and one BrowserContext with `phantom_preview_page`. +// +// Two load-bearing details: +// +// 1. `browser.isolated: false` is mandatory when using `contextGetter`. +// Playwright MCP's SimpleBrowser.newContext() throws +// ("Creating a new context is not supported in SimpleBrowserContextFactory") +// and the MCP backend only calls `newContext` when `isolated` is true. +// With `isolated: false`, the backend defers to our contextGetter and +// never touches its own context factory. +// +// 2. The return type of `createConnection` is the low-level +// `Server` class from `@modelcontextprotocol/sdk`, not the high-level +// `McpServer`. Both inherit `.connect(transport)` from `Protocol`, and +// the Agent SDK's `connectSdkMcpServer` only ever calls +// `.connect(transport)` on the stored instance. The declared +// `McpSdkServerConfigWithInstance.instance: McpServer` type is narrower +// than the runtime contract. We widen with a single +// `as unknown as McpServer` cast, which is the minimum-surface type +// escape hatch the CLAUDE.md standards allow for this exact case. +// See findings 01b Section 3.3 for the source citation. +// +// 3. `@playwright/mcp@0.0.70` ships a nested `playwright-core` under its +// own `node_modules`, so its `BrowserContext` type root is structurally +// identical to but nominally disjoint from the top-level `playwright` +// package's `BrowserContext`. At runtime they are the same object (the +// nested playwright-core is never constructed; we pass in our own +// context). We declare the `getContext` parameter against the top-level +// type (what every caller in this codebase has on hand) and widen the +// function reference to `unknown` at the `createConnection` boundary. +// Any change to this line should preserve that single-point widening. + +import type { McpSdkServerConfigWithInstance } from "@anthropic-ai/claude-agent-sdk"; +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { createConnection } from "@playwright/mcp"; +import type { BrowserContext } from "playwright"; + +// @playwright/mcp's createConnection declares contextGetter against its own +// nested playwright-core types. At runtime both roots are structurally the +// same BrowserContext; see header note 3 for the full explanation. +type AnyContextGetter = Parameters[1]; + +export async function createBrowserToolServer( + getContext: () => Promise, +): Promise { + const server = await createConnection( + { + browser: { + // Mandatory: see file header for the newContext() throw. + isolated: false, + }, + outputDir: "/tmp/phantom-browser-mcp-out", + imageResponses: "allow", + }, + getContext as unknown as AnyContextGetter, + ); + return { + type: "sdk" as const, + name: "phantom-browser", + // Structural widening: Server has the same .connect(transport) the + // Agent SDK calls on McpServer. See file header for the full + // justification and the source citation. + instance: server as unknown as McpServer, + }; +} diff --git a/src/ui/preview.ts b/src/ui/preview.ts new file mode 100644 index 0000000..6306a71 --- /dev/null +++ b/src/ui/preview.ts @@ -0,0 +1,218 @@ +// Custom in-process MCP tool server exposing `phantom_preview_page`: a +// one-call self-validation tool for Phantom's /ui/ pages. Navigates a +// headless Chromium to the page, captures a full-page PNG, and bundles HTTP +// status, title, console messages, and failed network requests alongside the +// screenshot so the agent can reason about the page in a single tool call. +// +// The underlying Chromium Browser and the per-query BrowserContext are +// module-level singletons. This mirrors the Phase 1 factory pattern used by +// `DynamicToolRegistry` and `Scheduler`: the MCP server wrapper is recreated +// per query (required so the SDK can attach a fresh transport each time), +// but the expensive resources it wraps are process-scoped and stay warm. +// +// Both `phantom-preview` and the embedded `@playwright/mcp` browser surface +// share `getOrCreatePreviewContext()` so cookies minted by the preview tool +// are visible to the broader `browser_*` tools within the same query. This +// is what lets the agent mix `phantom_preview_page` with `browser_click` and +// `browser_snapshot` against its own /ui/ pages without re-authenticating. + +import type { McpSdkServerConfigWithInstance } from "@anthropic-ai/claude-agent-sdk"; +import { createSdkMcpServer, tool } from "@anthropic-ai/claude-agent-sdk"; +import type { Browser, BrowserContext } from "playwright"; +import { chromium } from "playwright"; +import { z } from "zod"; +import { createPreviewSession } from "./session.ts"; + +let browser: Browser | null = null; +let browserPromise: Promise | null = null; +let currentContext: BrowserContext | null = null; +let currentContextPromise: Promise | null = null; + +const CHROMIUM_LAUNCH_ARGS = [ + // Required because the container runs as a non-root user and Chromium's + // default sandbox needs privileged setuid helpers we intentionally do not + // ship. The container boundary is our sandbox; Chromium does not need its + // own layer on top. + "--no-sandbox", + "--disable-setuid-sandbox", + // /dev/shm defaults to 64 MiB in containers, which is too small for + // Chromium's IPC shared memory. Fall back to /tmp (disk-backed). + "--disable-dev-shm-usage", +]; + +export async function getOrCreateBrowser(): Promise { + if (browser) return browser; + if (browserPromise) return browserPromise; + browserPromise = (async () => { + const b = await chromium.launch({ + headless: true, + args: CHROMIUM_LAUNCH_ARGS, + }); + browser = b; + browserPromise = null; + return b; + })(); + return browserPromise; +} + +export async function getOrCreatePreviewContext(): Promise { + if (currentContext) return currentContext; + if (currentContextPromise) return currentContextPromise; + currentContextPromise = (async () => { + const b = await getOrCreateBrowser(); + const ctx = await b.newContext(); + const { sessionToken } = createPreviewSession(); + // Scoped to localhost: external navigations initiated by any browser_* + // tool will not see this cookie, so the auth surface stays identical to + // the existing /ui/ cookie model. + await ctx.addCookies([ + { + name: "phantom_session", + value: sessionToken, + domain: "localhost", + path: "/", + httpOnly: true, + secure: false, + sameSite: "Strict", + }, + ]); + currentContext = ctx; + currentContextPromise = null; + return ctx; + })(); + return currentContextPromise; +} + +export async function closePreviewContext(): Promise { + const ctx = currentContext; + currentContext = null; + currentContextPromise = null; + if (ctx) { + try { + await ctx.close(); + } catch { + // Context may already be closed if the browser was torn down first. + } + } +} + +export async function closePreviewResources(): Promise { + await closePreviewContext(); + const b = browser; + browser = null; + browserPromise = null; + if (b) { + try { + await b.close(); + } catch { + // Swallow: we are shutting down, any error is terminal anyway. + } + } +} + +type ConsoleMessage = { type: string; text: string }; +type FailedRequest = { url: string; failure: string }; + +type PreviewSuccess = { + content: [{ type: "image"; data: string; mimeType: "image/png" }, { type: "text"; text: string }]; +}; + +type PreviewError = { + content: [{ type: "text"; text: string }]; + isError: true; +}; + +export function createPreviewToolServer(port: number): McpSdkServerConfigWithInstance { + const previewPageTool = tool( + "phantom_preview_page", + "Screenshot and validate a Phantom /ui/ page. Returns a PNG image " + + "block plus a JSON metadata block containing the HTTP status, page " + + "title, console messages, and failed network requests. Use this " + + "after phantom_create_page to verify the page rendered correctly " + + "before reporting success to the user.", + { + path: z.string().min(1).describe("Path under /ui/, e.g. 'dashboard.html' or 'reports/weekly.html'"), + viewport: z + .object({ + width: z.number().int().min(320).max(3840), + height: z.number().int().min(240).max(2160), + }) + .optional() + .describe("Viewport size in CSS pixels. Defaults to 1280x800."), + fullPage: z.boolean().optional().describe("Capture full scroll height. Defaults to true."), + }, + async (input): Promise => { + const ctx = await getOrCreatePreviewContext(); + const page = await ctx.newPage(); + try { + const viewport = input.viewport ?? { width: 1280, height: 800 }; + await page.setViewportSize(viewport); + // SSE EventSource connections created by Phantom's live-reload + // wiring will hold the page open indefinitely on 'load' and + // cause the tool to time out. Stub it before any page JS runs. + // Research 01 verified init scripts run before page JS. + await page.addInitScript(() => { + (window as unknown as { EventSource: undefined }).EventSource = undefined; + }); + const consoleMessages: ConsoleMessage[] = []; + page.on("console", (m) => { + consoleMessages.push({ type: m.type(), text: m.text() }); + }); + const failedRequests: FailedRequest[] = []; + page.on("requestfailed", (r) => { + failedRequests.push({ + url: r.url(), + failure: r.failure()?.errorText ?? "unknown", + }); + }); + const safePath = input.path.replace(/^\/+/, ""); + const url = `http://localhost:${port}/ui/${safePath}`; + const response = await page.goto(url, { waitUntil: "load", timeout: 15000 }); + const status = response?.status() ?? 0; + const title = await page.title(); + const shot = await page.screenshot({ + fullPage: input.fullPage !== false, + type: "png", + }); + return { + content: [ + { + type: "image" as const, + data: shot.toString("base64"), + mimeType: "image/png", + }, + { + type: "text" as const, + text: JSON.stringify({ status, title, consoleMessages, failedRequests }, null, 2), + }, + ], + }; + } catch (err: unknown) { + const message = err instanceof Error ? err.message : String(err); + return { + content: [ + { + type: "text" as const, + text: JSON.stringify({ error: message }), + }, + ], + isError: true as const, + }; + } finally { + // Always close the page, even on throw, so idle tabs never leak + // onto the shared context. The context lives for the whole query + // but tabs are per-call. + try { + await page.close(); + } catch { + // Page already closed by cleanup path. + } + } + }, + ); + + return createSdkMcpServer({ + name: "phantom-preview", + tools: [previewPageTool], + }); +} diff --git a/src/ui/session.ts b/src/ui/session.ts index d95299e..ccadd69 100644 --- a/src/ui/session.ts +++ b/src/ui/session.ts @@ -15,6 +15,7 @@ type MagicLink = { const SESSION_TTL_MS = 7 * 24 * 60 * 60 * 1000; // 7 days const MAGIC_LINK_TTL_MS = 10 * 60 * 1000; // 10 minutes +const PREVIEW_SESSION_TTL_MS = 10 * 60 * 1000; // 10 minutes const sessions = new Map(); const magicLinks = new Map(); @@ -40,6 +41,22 @@ export function createSession(): { sessionToken: string; magicToken: string } { return { sessionToken, magicToken }; } +// Mint a short-lived session for phantom_preview_page. The returned token is +// injected as a cookie into the Playwright BrowserContext so the preview tool +// can authenticate against /ui/ without minting a magic link. The short +// TTL bounds blast radius: if the cookie ever leaks out of the container, it +// self-destructs in ten minutes and cannot be refreshed. +export function createPreviewSession(): { sessionToken: string } { + const sessionToken = randomBytes(32).toString("base64url"); + const now = Date.now(); + sessions.set(sessionToken, { + token: sessionToken, + createdAt: now, + expiresAt: now + PREVIEW_SESSION_TTL_MS, + }); + return { sessionToken }; +} + export function isValidSession(token: string): boolean { const session = sessions.get(token); if (!session) return false; From 6a7147858a08f5fcce2b0fbe450c630ea0a8a8f4 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Cheema Date: Sun, 12 Apr 2026 21:32:32 -0700 Subject: [PATCH 2/2] fix: address review findings and Codex feedback on playwright PR Surgical fix pass on top of 41369cf in response to the internal review at local/2026-04-12-phantom-ui-chapter/review/01-playwright-review.md and Codex PR comments on #53. Fourteen acceptance criteria met. Preview correctness (src/ui/preview.ts): - F5: cookie path reverted from / to /ui, matching the magic-link posture in src/ui/serve.ts (buildPreviewCookie helper extracted). - F6 / Codex P1: rotate the preview session cookie on every warm getOrCreatePreviewContext call once Date.now() is inside the 2 minute safety margin before the 10 minute TTL. Uses addCookies in place so the cached BrowserContext stays the same instance. - F7: shuttingDown module flag set in closePreviewResources and guarded at the top of both getOrCreateBrowser and getOrCreatePreviewContext. Concurrent tool calls during SIGTERM now throw cleanly instead of spawning a resurrected Chromium. - F8: null currentContext in the page.close() catch block so the next call recreates a fresh context instead of handing back a dead one. - F9: replace window.EventSource = undefined with Reflect.deleteProperty(globalThis, "EventSource"). Covers the 'in' operator and drops a TypeScript widening. - Codex P2: try/finally pattern around browserPromise and currentContextPromise so a transient chromium.launch error no longer permanently poisons the cache. Subsequent calls retry cleanly. Browser MCP (src/ui/browser-mcp.ts): - F17: header note 3 rewritten to reflect the actual on-disk layout (top-level hoisted playwright-core@1.60.0-alpha vs. nested playwright/node_modules/playwright-core@1.59.1; no nested playwright-core under @playwright/mcp/node_modules). Cites the integration test that verifies the cross-version BrowserContext boundary end to end. Tests: - F1: preview.integration.test.ts rewritten to drive the real phantom_preview_page handler through a Client + InMemoryTransport pair. Asserts the bundled {image, text} result shape, base64 round-trip, status, title, console capture, and requestfailed capture. - F2 + F4: browser-mcp.integration.test.ts extended with a real Client.listTools() call asserting exactly 21 tools by name and a real browser_navigate round-trip across the cross-version BrowserContext boundary. - New preview-correctness.test.ts with 8 unit tests covering launch-failure recovery, shuttingDown, cookie rotation threshold, cookie path, and the F8 invariant. Uses spyOn(chromium, "launch") plus a fake Browser / BrowserContext so no real Chromium is spawned. Docs: - F3 + F11: Dockerfile image cost breakdown comment expanded to record the 996 MiB delta split: ~327 MB headless shell, ~91 MB fonts, ~500+ MB X11/GTK shared libs. Documents why --only-shell cannot trim more. Test count: 940 pass + 6 skip baseline -> 948 pass + 10 skip after (strictly greater, zero regressions). bun run lint and bun run typecheck both clean. PHANTOM_INTEGRATION=1 integration run on macOS arm64 passes 14/14 with real Chromium. Deferred per fix brief anti-patterns: F10 (v1.1 error-path screenshot), F11 (afterEach scope narrowing, pre-existing), F12 (graceful.ts tasks.reverse, out of scope). --- Dockerfile | 23 +- .../__tests__/browser-mcp.integration.test.ts | 118 +++++++++- src/ui/__tests__/preview-correctness.test.ts | 222 ++++++++++++++++++ src/ui/__tests__/preview.integration.test.ts | 137 +++++++++-- src/ui/browser-mcp.ts | 35 ++- src/ui/preview.ts | 118 +++++++--- 6 files changed, 580 insertions(+), 73 deletions(-) create mode 100644 src/ui/__tests__/preview-correctness.test.ts diff --git a/Dockerfile b/Dockerfile index be03408..e3b0e5e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,10 +77,25 @@ COPY --from=builder /app/tsconfig.json ./ # Install Chromium headless shell + system deps for Playwright. # Must run after node_modules is copied so bunx can resolve playwright. -# --only-shell skips the full Chromium binary (saves ~75 MiB); the custom -# phantom_preview_page tool uses chromium.launch() which picks the shell -# automatically for headless=true. The @playwright/mcp embed path uses a -# contextGetter so it never needs the full chrome channel binary. +# --only-shell skips the full Chromium binary (saves ~75 MiB off the full +# chrome channel); the custom phantom_preview_page tool uses +# chromium.launch() which picks the headless shell automatically for +# headless=true. The @playwright/mcp embed path uses a contextGetter so it +# never needs the full chrome channel binary. +# +# Image cost breakdown (verified on the built image vs. the pre-Playwright +# baseline, total delta roughly 996 MiB over the non-Playwright baseline): +# ~327 MB chromium_headless_shell-* binary at +# /home/phantom/.cache/ms-playwright/chromium_headless_shell-* +# ~91 MB /usr/share/fonts pulled by --with-deps (DejaVu, Liberation, +# Noto Core) +# ~500+ MB /usr/lib X11 / GTK / libasound / libnss3 / libcups / libatk +# and the other shared libraries apt-get pulls for Chromium +# +# --only-shell only affects the Chromium binary. The system deps are the +# dominant cost and cannot be trimmed without breaking Chromium's ability +# to start. If you are trying to shrink this image, the headless shell +# binary is the only safe target; the /usr/lib growth is load-bearing. ENV PLAYWRIGHT_BROWSERS_PATH=/home/phantom/.cache/ms-playwright RUN mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" && \ bunx playwright install --with-deps --only-shell chromium && \ diff --git a/src/ui/__tests__/browser-mcp.integration.test.ts b/src/ui/__tests__/browser-mcp.integration.test.ts index f507bee..c41f2ce 100644 --- a/src/ui/__tests__/browser-mcp.integration.test.ts +++ b/src/ui/__tests__/browser-mcp.integration.test.ts @@ -4,30 +4,124 @@ // PHANTOM_INTEGRATION=1 bun test src/ui/__tests__/browser-mcp.integration.test.ts // // Skipped by default so `bun test` stays hermetic. +// +// Two load-bearing invariants are enforced here: +// +// 1. The embed exposes exactly 21 tools. @playwright/mcp@0.0.70 is pinned +// specifically so the tool surface cannot drift silently; this assertion +// is the drift detector the pin was meant to anchor. +// +// 2. A real `browser_navigate` call succeeds against a BrowserContext +// minted by the preview tool. This is the end-to-end verification of +// the cross-version playwright-core boundary documented in +// src/ui/browser-mcp.ts note 3: the context is an instance from +// playwright-core@1.59.1 consumed by @playwright/mcp's hoisted +// playwright-core@1.60.0-alpha SimpleBrowser wrapper. -import { afterAll, describe, expect, test } from "bun:test"; +import { afterAll, beforeAll, describe, expect, test } from "bun:test"; +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js"; import { createBrowserToolServer } from "../browser-mcp.ts"; -import { closePreviewResources, getOrCreatePreviewContext } from "../preview.ts"; +import { __resetPreviewStateForTesting, closePreviewResources, getOrCreatePreviewContext } from "../preview.ts"; import { revokeAllSessions } from "../session.ts"; const ENABLED = process.env.PHANTOM_INTEGRATION === "1"; const suite = ENABLED ? describe : describe.skip; +const EXPECTED_TOOL_NAMES = [ + "browser_click", + "browser_close", + "browser_console_messages", + "browser_drag", + "browser_evaluate", + "browser_file_upload", + "browser_fill_form", + "browser_handle_dialog", + "browser_hover", + "browser_navigate", + "browser_navigate_back", + "browser_network_requests", + "browser_press_key", + "browser_resize", + "browser_run_code", + "browser_select_option", + "browser_snapshot", + "browser_tabs", + "browser_take_screenshot", + "browser_type", + "browser_wait_for", +]; + +type CallResult = { isError?: boolean; content: unknown }; + suite("createBrowserToolServer (integration)", () => { + let server: ReturnType | null = null; + let port = 0; + let client: Client | null = null; + let embed: Awaited> | null = null; + + beforeAll(async () => { + // Reset module-level preview state so running this file after any + // other test file that called closePreviewResources() still starts + // from a pristine state. Bun shares module instances across test + // files inside the same process. + __resetPreviewStateForTesting(); + server = Bun.serve({ + port: 0, + fetch(req) { + const url = new URL(req.url); + if (url.pathname === "/ui/test.html") { + return new Response( + "Browser MCP Integration" + + "

Hello

", + { headers: { "content-type": "text/html" } }, + ); + } + return new Response("not found", { status: 404 }); + }, + }); + port = server.port ?? 0; + + embed = await createBrowserToolServer(() => getOrCreatePreviewContext()); + const [serverTransport, clientTransport] = InMemoryTransport.createLinkedPair(); + const serverInstance = embed.instance as unknown as { + connect: (t: typeof serverTransport) => Promise; + close: () => Promise; + }; + await serverInstance.connect(serverTransport); + client = new Client({ name: "phantom-browser-integration", version: "1.0" }, { capabilities: {} }); + await client.connect(clientTransport); + }); + afterAll(async () => { + await client?.close(); + if (embed) { + const inst = embed.instance as unknown as { close: () => Promise }; + await inst.close(); + } await closePreviewResources(); revokeAllSessions(); + server?.stop(true); }); - test("builds an embed server wired to a shared BrowserContext", async () => { - const embed = await createBrowserToolServer(() => getOrCreatePreviewContext()); - expect(embed.type).toBe("sdk"); - expect(embed.name).toBe("phantom-browser"); - const inst = embed.instance as unknown as { - connect: unknown; - close: () => Promise; - }; - expect(typeof inst.connect).toBe("function"); - await inst.close(); + test("listTools returns exactly the 21-tool @playwright/mcp surface", async () => { + if (!client) throw new Error("client not initialized"); + const { tools } = await client.listTools(); + expect(tools).toHaveLength(21); + const names = tools.map((t) => t.name).sort(); + expect(names).toEqual([...EXPECTED_TOOL_NAMES].sort()); + }); + + test("browser_navigate succeeds across the cross-version BrowserContext boundary", async () => { + if (!client) throw new Error("client not initialized"); + const result = (await client.callTool({ + name: "browser_navigate", + arguments: { url: `http://localhost:${port}/ui/test.html` }, + })) as CallResult; + // A successful navigate returns content with no isError flag set. + // The exact content shape is @playwright/mcp's concern; we care only + // that the call did not land in the error branch. + expect(result.isError).toBeFalsy(); + expect(result.content).toBeDefined(); }); }); diff --git a/src/ui/__tests__/preview-correctness.test.ts b/src/ui/__tests__/preview-correctness.test.ts new file mode 100644 index 0000000..f493aa6 --- /dev/null +++ b/src/ui/__tests__/preview-correctness.test.ts @@ -0,0 +1,222 @@ +// Unit tests for the correctness paths in src/ui/preview.ts that we can +// drive without a real Chromium: cookie rotation at the 8 minute threshold, +// the shuttingDown flag, and the try/finally launch-failure recovery +// pattern. The happy end-to-end path is exercised by the opt-in integration +// test at preview.integration.test.ts which requires PHANTOM_INTEGRATION=1. +// +// We replace `chromium.launch` via spyOn so the module under test never +// actually spawns a browser process. The fake Browser and BrowserContext +// expose exactly the subset of the real API that preview.ts calls: nothing +// more, nothing less. + +import { afterEach, describe, expect, spyOn, test } from "bun:test"; +import type { Browser, BrowserContext } from "playwright"; +import { chromium } from "playwright"; +import { + __resetPreviewStateForTesting, + closePreviewResources, + getOrCreateBrowser, + getOrCreatePreviewContext, +} from "../preview.ts"; +import { getSessionCount, revokeAllSessions } from "../session.ts"; + +type Cookie = { name: string; value: string; domain: string; path: string }; +type FakeContext = BrowserContext & { __cookies: Cookie[]; __closed: boolean }; +type FakeBrowser = Browser & { __contexts: FakeContext[]; __closed: boolean }; + +function makeFakeContext(): FakeContext { + const state = { __cookies: [] as Cookie[], __closed: false }; + const ctx = { + ...state, + addCookies: async (cookies: Cookie[]) => { + // Match Playwright semantics: replace any cookie with the same + // name+domain+path in place, otherwise append. + for (const incoming of cookies) { + const existing = state.__cookies.findIndex( + (c) => c.name === incoming.name && c.domain === incoming.domain && c.path === incoming.path, + ); + if (existing >= 0) state.__cookies[existing] = incoming; + else state.__cookies.push(incoming); + } + }, + close: async () => { + state.__closed = true; + }, + } as unknown as FakeContext; + Object.defineProperty(ctx, "__cookies", { get: () => state.__cookies }); + Object.defineProperty(ctx, "__closed", { get: () => state.__closed }); + return ctx; +} + +function makeFakeBrowser(): FakeBrowser { + const contexts: FakeContext[] = []; + const state = { __closed: false }; + const b = { + newContext: async () => { + const ctx = makeFakeContext(); + contexts.push(ctx); + return ctx; + }, + close: async () => { + state.__closed = true; + }, + } as unknown as FakeBrowser; + Object.defineProperty(b, "__contexts", { get: () => contexts }); + Object.defineProperty(b, "__closed", { get: () => state.__closed }); + return b; +} + +afterEach(() => { + __resetPreviewStateForTesting(); + revokeAllSessions(); +}); + +describe("getOrCreateBrowser launch-failure recovery (Codex P2)", () => { + test("second call after a transient launch error succeeds", async () => { + const spy = spyOn(chromium, "launch"); + try { + let calls = 0; + spy.mockImplementation(async () => { + calls += 1; + if (calls === 1) throw new Error("transient resource error"); + return makeFakeBrowser(); + }); + + await expect(getOrCreateBrowser()).rejects.toThrow("transient resource error"); + // If the rejected promise were cached, this would re-reject with the + // same error instead of invoking launch a second time. + const b = await getOrCreateBrowser(); + expect(b).toBeDefined(); + expect(calls).toBe(2); + } finally { + spy.mockRestore(); + } + }); +}); + +describe("getOrCreatePreviewContext launch-failure recovery", () => { + test("clears currentContextPromise on failure so the next call retries", async () => { + const spy = spyOn(chromium, "launch"); + try { + let calls = 0; + spy.mockImplementation(async () => { + calls += 1; + if (calls === 1) throw new Error("chromium down"); + return makeFakeBrowser(); + }); + + await expect(getOrCreatePreviewContext()).rejects.toThrow("chromium down"); + const ctx = await getOrCreatePreviewContext(); + expect(ctx).toBeDefined(); + expect(calls).toBe(2); + } finally { + spy.mockRestore(); + } + }); +}); + +describe("shuttingDown flag (review F7)", () => { + test("getOrCreateBrowser throws after closePreviewResources", async () => { + await closePreviewResources(); + await expect(getOrCreateBrowser()).rejects.toThrow("preview subsystem is shutting down"); + }); + + test("getOrCreatePreviewContext throws after closePreviewResources", async () => { + await closePreviewResources(); + await expect(getOrCreatePreviewContext()).rejects.toThrow("preview subsystem is shutting down"); + }); +}); + +describe("cookie rotation (review F6, Codex P1)", () => { + test("cold create mints a fresh preview cookie on the new context", async () => { + const spy = spyOn(chromium, "launch"); + try { + const fakeBrowser = makeFakeBrowser(); + spy.mockImplementation(async () => fakeBrowser); + + const ctx = (await getOrCreatePreviewContext()) as FakeContext; + expect(ctx.__cookies).toHaveLength(1); + const cookie = ctx.__cookies[0]; + expect(cookie.name).toBe("phantom_session"); + expect(cookie.domain).toBe("localhost"); + // Fix 4: cookie path is scoped to /ui, matching the magic-link + // posture in src/ui/serve.ts. + expect(cookie.path).toBe("/ui"); + expect(typeof cookie.value).toBe("string"); + expect(cookie.value.length).toBeGreaterThan(20); + } finally { + spy.mockRestore(); + } + }); + + test("warm reuse inside the 8 minute window does not rotate", async () => { + const spy = spyOn(chromium, "launch"); + try { + spy.mockImplementation(async () => makeFakeBrowser()); + + const ctx1 = (await getOrCreatePreviewContext()) as FakeContext; + const firstToken = ctx1.__cookies[0].value; + const sessionsAfterFirst = getSessionCount(); + + const ctx2 = (await getOrCreatePreviewContext()) as FakeContext; + expect(ctx2).toBe(ctx1); + expect(ctx2.__cookies).toHaveLength(1); + expect(ctx2.__cookies[0].value).toBe(firstToken); + expect(getSessionCount()).toBe(sessionsAfterFirst); + } finally { + spy.mockRestore(); + } + }); + + test("warm reuse past the 8 minute threshold rotates the cookie in place", async () => { + const spy = spyOn(chromium, "launch"); + const originalNow = Date.now; + try { + spy.mockImplementation(async () => makeFakeBrowser()); + + const ctx1 = (await getOrCreatePreviewContext()) as FakeContext; + const firstToken = ctx1.__cookies[0].value; + expect(getSessionCount()).toBe(1); + + // Advance wall-clock past the 8 minute rotation threshold. The cached + // BrowserContext stays the same instance but the cookie value gets + // replaced in place via addCookies semantics. + Date.now = () => originalNow() + 9 * 60 * 1000; + + const ctx2 = (await getOrCreatePreviewContext()) as FakeContext; + expect(ctx2).toBe(ctx1); + expect(ctx2.__cookies).toHaveLength(1); + expect(ctx2.__cookies[0].value).not.toBe(firstToken); + // A second preview session was minted into the sessions map. + expect(getSessionCount()).toBe(2); + } finally { + Date.now = originalNow; + spy.mockRestore(); + } + }); +}); + +describe("page.close() error path clears currentContext (review F8)", () => { + // We cannot drive the real phantom_preview_page handler without a real + // browser, but we can verify the public invariant the fix establishes: + // once currentContext has been nulled (as the catch block does), the next + // getOrCreatePreviewContext() call returns a fresh context. The preview + // integration test covers the happy path end to end. + test("after currentContext is nulled, next call creates a fresh context", async () => { + const spy = spyOn(chromium, "launch"); + try { + spy.mockImplementation(async () => makeFakeBrowser()); + + const ctx1 = (await getOrCreatePreviewContext()) as FakeContext; + // Simulate the catch-block cleanup in the preview tool finally. + await closePreviewResources(); + __resetPreviewStateForTesting(); + + spy.mockImplementation(async () => makeFakeBrowser()); + const ctx2 = (await getOrCreatePreviewContext()) as FakeContext; + expect(ctx2).not.toBe(ctx1); + } finally { + spy.mockRestore(); + } + }); +}); diff --git a/src/ui/__tests__/preview.integration.test.ts b/src/ui/__tests__/preview.integration.test.ts index d2287c6..17ff554 100644 --- a/src/ui/__tests__/preview.integration.test.ts +++ b/src/ui/__tests__/preview.integration.test.ts @@ -4,26 +4,69 @@ // PHANTOM_INTEGRATION=1 bun test src/ui/__tests__/preview.integration.test.ts // // They are skipped by default so `bun test` stays fast and hermetic. +// +// Unlike the unit tests in preview.test.ts, these tests invoke the real +// phantom_preview_page handler through its MCP server instance via an +// InMemoryTransport pair and Client from @modelcontextprotocol/sdk. That +// means every load-bearing behavior the handler owns is exercised end to +// end: the SSE init-script stub, console-message capture, failed-request +// capture, the bundled image+text result shape, and the isError path. import { afterAll, beforeAll, describe, expect, test } from "bun:test"; -import { closePreviewResources, getOrCreatePreviewContext } from "../preview.ts"; +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js"; +import { + __resetPreviewStateForTesting, + closePreviewResources, + createPreviewToolServer, + getOrCreatePreviewContext, +} from "../preview.ts"; import { revokeAllSessions } from "../session.ts"; const ENABLED = process.env.PHANTOM_INTEGRATION === "1"; const suite = ENABLED ? describe : describe.skip; +type ToolContent = + | { type: "text"; text: string } + | { type: "image"; data: string; mimeType: string } + | { type: string; [k: string]: unknown }; + +type ToolResult = { + content: ToolContent[]; + isError?: boolean; +}; + +function asText(block: ToolContent | undefined): string { + if (!block || block.type !== "text") throw new Error("expected a text content block"); + return (block as { text: string }).text; +} + +function asImage(block: ToolContent | undefined): { data: string; mimeType: string } { + if (!block || block.type !== "image") throw new Error("expected an image content block"); + return block as { data: string; mimeType: string; type: "image" }; +} + suite("phantom_preview_page (integration)", () => { let server: ReturnType | null = null; let port = 0; + let client: Client | null = null; - beforeAll(() => { + beforeAll(async () => { + // Reset module-level preview state so running this file after any + // other test file that called closePreviewResources() still starts + // from a pristine state. Bun shares module instances across test + // files inside the same process. + __resetPreviewStateForTesting(); server = Bun.serve({ port: 0, fetch(req) { const url = new URL(req.url); if (url.pathname === "/ui/test.html") { + // The missing ' + "" + "

Hello

", { headers: { "content-type": "text/html" } }, @@ -33,29 +76,93 @@ suite("phantom_preview_page (integration)", () => { }, }); port = server.port ?? 0; + + // Wire a real MCP Client to the real preview tool server via an + // in-memory transport pair. This is the exact contract the Agent SDK + // uses at runtime; if the bundled result shape drifts, this test + // catches it. + const embed = createPreviewToolServer(port); + const [serverTransport, clientTransport] = InMemoryTransport.createLinkedPair(); + const serverInstance = embed.instance as unknown as { + connect: (t: typeof serverTransport) => Promise; + }; + await serverInstance.connect(serverTransport); + client = new Client({ name: "phantom-preview-integration", version: "1.0" }, { capabilities: {} }); + await client.connect(clientTransport); }); afterAll(async () => { + await client?.close(); await closePreviewResources(); revokeAllSessions(); server?.stop(true); }); - test("navigates and screenshots a live page", async () => { - const ctx = await getOrCreatePreviewContext(); - const page = await ctx.newPage(); - try { - const response = await page.goto(`http://localhost:${port}/ui/test.html`); - expect(response?.status()).toBe(200); - expect(await page.title()).toBe("Preview Integration"); - const shot = await page.screenshot({ type: "png" }); - expect(shot.length).toBeGreaterThan(100); - } finally { - await page.close(); + test("tools/list exposes phantom_preview_page", async () => { + if (!client) throw new Error("client not initialized"); + const { tools } = await client.listTools(); + expect(tools.map((t) => t.name)).toContain("phantom_preview_page"); + }); + + test("successful call returns image + text blocks with full metadata", async () => { + if (!client) throw new Error("client not initialized"); + const result = (await client.callTool({ + name: "phantom_preview_page", + arguments: { path: "test.html" }, + })) as ToolResult; + + expect(result.isError).toBeFalsy(); + expect(result.content).toHaveLength(2); + + const image = asImage(result.content[0]); + expect(image.mimeType).toBe("image/png"); + expect(image.data.length).toBeGreaterThan(100); + // Base64 round-trip sanity: decodes to a non-empty byte buffer. + expect(Buffer.from(image.data, "base64").length).toBeGreaterThan(100); + + const meta = JSON.parse(asText(result.content[1])) as { + status: number; + title: string; + consoleMessages: { type: string; text: string }[]; + failedRequests: { url: string; failure: string }[]; + }; + expect(meta.status).toBe(200); + expect(meta.title).toBe("Preview Integration"); + + // The inline