diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index da63102..b802285 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -12,16 +12,17 @@ { "name": "browse", "source": "./", - "description": "Automate web browser interactions using natural language. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications.", + "description": "Automate web browser interactions using natural language. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or coordinate multiple agents across tabs in one browser.", "version": "0.0.1", "author": { "name": "Browserbase" }, "category": "automation", - "keywords": ["browser", "automation", "web-scraping", "stagehand", "screenshots"], + "keywords": ["browser", "automation", "web-scraping", "stagehand", "screenshots", "multi-agent"], "strict": false, "skills": [ - "./skills/browser" + "./skills/browser", + "./skills/browser-swarm" ] }, { diff --git a/README.md b/README.md index a5d69e9..f2bd6b7 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ This plugin includes the following skills (see `skills/` for details): | Skill | Description | |-------|-------------| | [browser](skills/browser/SKILL.md) | Automate web browser interactions via CLI commands — supports remote Browserbase sessions with anti-bot stealth, CAPTCHA solving, and residential proxies | +| [browser-swarm](skills/browser-swarm/SKILL.md) | Coordinate multiple agents across separate tabs of one local Chrome using auto-connect, with target-bound CDP/Stagehand fallback guidance for true parallel tab ownership | | [browserbase-cli](skills/browserbase-cli/SKILL.md) | Use the official `bb` CLI for Browserbase Functions and platform API workflows including sessions, projects, contexts, extensions, fetch, and dashboard | | 
[functions](skills/functions/SKILL.md) | Deploy serverless browser automation to Browserbase cloud using the `bb` CLI | | [site-debugger](skills/site-debugger/SKILL.md) | Diagnose and fix failing browser automations — analyzes bot detection, selectors, timing, auth, and captchas, then generates a tested site playbook | diff --git a/skills/browser-swarm/SKILL.md b/skills/browser-swarm/SKILL.md new file mode 100644 index 0000000..7d54dbf --- /dev/null +++ b/skills/browser-swarm/SKILL.md @@ -0,0 +1,177 @@ +--- +name: browser-swarm +description: Coordinate multiple agents working in separate tabs of one local Chrome via Browserbase CLI auto-connect and deterministic Stagehand/Understudy page handles. Use for experimental same-browser multi-agent browsing, multi-tab task decomposition, /swarm-style workflows, or derisking whether a browser task can run in tandem across Gmail, expense tools, research sites, and other authenticated pages. +compatibility: "Requires Browserbase CLI (`bb browse`) or browse CLI, Chrome with remote debugging available, and `env local --auto-connect`. Use deterministic Stagehand/Understudy as the primary target-bound action layer until browse CLI exposes first-class target-scoped commands." +license: MIT +allowed-tools: Bash Agent +metadata: + author: browserbase + homepage: https://github.com/browserbase/skills +--- + +# Browser Swarm + +Run multiple browser workstreams in separate tabs of the same user-owned Chrome session. Prefer `bb browse`; if the installed CLI only exposes `browse`, use the same subcommands without the `bb browse` prefix. + +## Operating model + +- Use one Chrome instance, many tabs, and one CLI session per workstream. +- Always start with `env local --auto-connect`; this is the product path being exercised. +- Treat the run as experimental until every session reports `localSource: "attached-existing"` and the same `resolvedCdpUrl`/browser websocket. 
+- Use deterministic Stagehand/Understudy `Page` objects as the primary action layer. The CDP websocket is only the attach transport; workers should act through target-bound `page.deepLocator(...)`, `page.goto(...)`, `page.screenshot(...)`, etc. +- Tabs do not need OS focus when an agent holds a target-specific Understudy page handle. They do need careful ownership if commands are routed through the active page. +- Subagent creation is an orchestrator-level responsibility. Do not assume a spawned worker can recursively create more workers; if nested agents are unavailable, the top-level agent should spawn all workstream agents itself. +- Do not submit purchases, payments, expense reports, reservations, emails, or irreversible forms without explicit user approval. + +## Setup + +Check the CLI: + +```bash +which bb +bb browse --help +``` + +If `bb browse` is unavailable: + +```bash +which browse +browse --help +``` + +Ask the user to open Chrome with remote debugging enabled if needed. If Chrome shows an "Allow remote debugging?" prompt, the user must approve it before auto-connect sessions can inspect or control tabs. + +The installed CLI must include reliable auto-connect discovery. If `curl http://127.0.0.1:<port>/json/version` shows a debuggable browser but `status` still reports `localSource: "isolated-fallback"`, treat that as a CLI gap or stale CLI version and retest with the fixed/newer CLI before claiming the swarm works. 
+ +## Swarm workflow + +Create one named session per workstream: + +```bash +bb browse --session swarm-gmail env local --auto-connect +bb browse --session swarm-ramp env local --auto-connect +bb browse --session swarm-research env local --auto-connect +``` + +Verify all sessions are attached to the same browser: + +```bash +bb browse --session swarm-gmail status +bb browse --session swarm-ramp status +bb browse --session swarm-research status +bb browse --session swarm-gmail pages +``` + +Use the final `status` output as the source of truth; the immediate `env local --auto-connect` response may only report `localStrategy: "auto"`. Proceed only if every session reports an attached existing local browser and the same browser websocket. If any session reports `localSource: "isolated-fallback"`, a fallback reason, or a different websocket, stop and fix auto-connect before continuing. + +Create or identify one tab per workstream. Prefer `newpage` when claiming tabs because `open`/`goto` navigates the current active page and can race under parallel agents: + +```bash +bb browse --session swarm-gmail newpage https://mail.google.com/ +bb browse --session swarm-ramp newpage https://ramp.com/ +bb browse --session swarm-research newpage https://www.google.com/search?q=san+diego+restaurants +``` + +When target ownership matters, derive the HTTP origin from the shared browser websocket and list targets directly: + +```bash +curl -s http://127.0.0.1:9222/json/list | jq '.[] | {id, type, title, url}' +``` + +For low-risk reconnaissance, separate agents may use their named sessions to collect snapshots, page titles, screenshots, and status. For real concurrent mutation, give each agent an explicit target identity and require target-bound scripting instead of index-based tab switching. + +## Parallel agent contract + +The agent invoking this skill should be the top-level orchestrator. 
If the runtime exposes an Agent/subagent tool, spawn one worker per workstream from that top-level agent. If the skill is already running inside a spawned worker and no nested Agent tool is available, report that nested agents are unavailable and ask the parent/orchestrator to spawn the workers instead. + +When spawning workers, give each one: + +- The exact `--session` name it owns. +- The specific tab URL/title/targetId it owns. +- The user-visible limits, especially "do not submit" boundaries. +- The proof artifact it must return: status output, targetId, final URL/title, and screenshot path. + +Use wording like: + +```text +You own session swarm-gmail and targetId <targetId>. Stay in that tab. Do not use tab_switch by index. Use target-bound deterministic Stagehand/Understudy operations for mutations. Return evidence only; do not submit irreversible forms. +``` + +Do not substitute Browserbase Autonomous Agent sessions for Codex/Claude subagents unless the user explicitly asks for that product path; they are different execution models and do not prove editor-agent swarm orchestration. + +## Deterministic Understudy target binding + +Current browser CLIs can race when multiple agents rely on "current page" or `tab_switch <index>`. For robust tandem operation, attach Stagehand/Understudy to the shared Chrome websocket and operate on a specific `Page` selected by `targetId`. 
+ +Use the bundled helper for common deterministic commands: + +```bash +node skills/browser-swarm/scripts/understudy-target.mjs \ + --cdp-url "$CDP_URL" \ + list + +node skills/browser-swarm/scripts/understudy-target.mjs \ + --cdp-url "$CDP_URL" \ + --target-id "$TARGET_ID" \ + text body + +node skills/browser-swarm/scripts/understudy-target.mjs \ + --cdp-url "$CDP_URL" \ + --target-id "$TARGET_ID" \ + screenshot /tmp/swarm-tab.png +``` + +If testing against a local Stagehand checkout or fork, pass its built ESM entrypoint: + +```bash +node skills/browser-swarm/scripts/understudy-target.mjs \ + --stagehand-import /tmp/stagehand-pr-2049/packages/core/dist/esm/index.js \ + --cdp-url "$CDP_URL" \ + list +``` + +Direct Understudy scripting pattern: + +```js +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { cdpUrl }, +}); + +await stagehand.init(); +const page = stagehand.context.pages().find((candidate) => { + return candidate.targetId && candidate.targetId() === targetId; +}); + +if (!page) throw new Error(`No page found for targetId=${targetId}`); + +await page.deepLocator("input[name='q']").fill("receipt OR itinerary"); +await page.keyPress("Enter"); +const text = await page.deepLocator("body").innerText(); +``` + +Only use raw CDP or Playwright as a diagnostic fallback when Understudy lacks a needed primitive or when you are debugging browser attachment itself. They should not be the default product path for this skill. + +Prefer deterministic Understudy when agents must click, type, or extract in parallel. Do not rely on foreground focus for correctness. + +## Known gaps to report + +Report these as browse CLI gaps when they block a swarm: + +- Commands route through the active page instead of a claimed target/page. +- Parallel `open`/`goto` calls can navigate the same active tab; use `newpage` plus targetId ownership instead. 
+- `tab_switch ` is not stable under parallel agents and focuses the tab. +- There is no first-class `claim target` / targetId-scoped deterministic Understudy command surface yet. +- Chrome may require a remote-debugging approval prompt for each new attaching process. + +## Proof checklist + +A successful run should include: + +- Each session's `status` showing `localSource: "attached-existing"`. +- The shared `resolvedCdpUrl` or browser websocket for all sessions. +- A page list with the expected workstream tabs. +- Per-agent evidence: owned target/tab, final URL/title, and screenshot. +- Any gaps or CLI primitives needed before the workflow is safe to productize. diff --git a/skills/browser-swarm/scripts/understudy-target.mjs b/skills/browser-swarm/scripts/understudy-target.mjs new file mode 100755 index 0000000..c4c95aa --- /dev/null +++ b/skills/browser-swarm/scripts/understudy-target.mjs @@ -0,0 +1,258 @@ +#!/usr/bin/env node +import { writeFile } from "node:fs/promises"; +import path from "node:path"; +import { pathToFileURL } from "node:url"; + +function usage() { + return `Usage: + understudy-target.mjs --cdp-url list + understudy-target.mjs --cdp-url newpage [url] + understudy-target.mjs --cdp-url --target-id title + understudy-target.mjs --cdp-url --target-id text [selector] + understudy-target.mjs --cdp-url --target-id click + understudy-target.mjs --cdp-url --target-id fill + understudy-target.mjs --cdp-url --target-id press + understudy-target.mjs --cdp-url --target-id goto + understudy-target.mjs --cdp-url --target-id screenshot + +Options: + --cdp-url Browser-level CDP websocket. Also reads BROWSER_SWARM_CDP_URL or CDP_URL. + --target-id Chrome target id / Understudy page id to own. + --url-includes Select the first page whose URL includes text. + --title-includes Select the first page whose title includes text. + --stagehand-import Import specifier or path for @browserbasehq/stagehand. + --full-page Capture a full-page screenshot. 
+`; +} + +function parseArgs(argv) { + const opts = { + cdpUrl: process.env.BROWSER_SWARM_CDP_URL || process.env.CDP_URL, + stagehandImport: "@browserbasehq/stagehand", + fullPage: false, + }; + const positional = []; + + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i]; + if (arg === "--help" || arg === "-h") { + opts.help = true; + } else if (arg === "--full-page") { + opts.fullPage = true; + } else if (arg === "--cdp-url") { + opts.cdpUrl = argv[++i]; + } else if (arg === "--target-id" || arg === "--page-id") { + opts.targetId = argv[++i]; + } else if (arg === "--url-includes") { + opts.urlIncludes = argv[++i]; + } else if (arg === "--title-includes") { + opts.titleIncludes = argv[++i]; + } else if (arg === "--stagehand-import") { + opts.stagehandImport = argv[++i]; + } else if (arg.startsWith("--")) { + throw new Error(`Unknown option: ${arg}`); + } else { + positional.push(arg); + } + } + + opts.command = positional[0] || "list"; + opts.args = positional.slice(1); + return opts; +} + +async function importStagehand(spec) { + if (spec.startsWith("file:")) { + return import(spec); + } + if (spec.startsWith("/") || spec.startsWith(".")) { + return import(pathToFileURL(path.resolve(spec)).href); + } + return import(spec); +} + +async function pageInfo(page, index) { + let title = ""; + try { + title = await page.title(); + } catch { + title = ""; + } + return { + index, + targetId: page.targetId(), + url: page.url(), + title, + }; +} + +async function selectPage(context, opts) { + const pages = context.pages(); + + if (opts.targetId) { + const page = pages.find((candidate) => candidate.targetId() === opts.targetId); + if (!page) { + throw new Error(`No page found for targetId=${opts.targetId}`); + } + return page; + } + + if (opts.urlIncludes) { + const page = pages.find((candidate) => candidate.url().includes(opts.urlIncludes)); + if (!page) { + throw new Error(`No page URL includes ${JSON.stringify(opts.urlIncludes)}`); + } + return page; + } + + 
if (opts.titleIncludes) { + for (const page of pages) { + const info = await pageInfo(page); + if (info.title.includes(opts.titleIncludes)) return page; + } + throw new Error(`No page title includes ${JSON.stringify(opts.titleIncludes)}`); + } + + throw new Error("Pass --target-id, --url-includes, or --title-includes for this command"); +} + +async function main() { + const opts = parseArgs(process.argv.slice(2)); + if (opts.help) { + console.log(usage()); + return; + } + if (!opts.cdpUrl) { + throw new Error("Missing --cdp-url, BROWSER_SWARM_CDP_URL, or CDP_URL"); + } + + const { Stagehand } = await importStagehand(opts.stagehandImport); + const stagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + cdpUrl: opts.cdpUrl, + connectTimeoutMs: 10000, + }, + disablePino: true, + verbose: 0, + }); + + await stagehand.init(); + if (opts.command === "list") { + const pages = await Promise.all(stagehand.context.pages().map(pageInfo)); + console.log(JSON.stringify({ ok: true, pages }, null, 2)); + return; + } + + if (opts.command === "newpage") { + const page = await stagehand.context.newPage(opts.args[0] || "about:blank"); + console.log( + JSON.stringify( + { + ok: true, + command: opts.command, + result: await pageInfo(page), + }, + null, + 2, + ), + ); + return; + } + + const page = await selectPage(stagehand.context, opts); + let result; + + switch (opts.command) { + case "title": + result = await pageInfo(page); + break; + case "url": + result = { targetId: page.targetId(), url: page.url() }; + break; + case "goto": { + const [url] = opts.args; + if (!url) throw new Error("goto requires "); + await page.goto(url, { waitUntil: "domcontentloaded", timeoutMs: 30000 }); + result = await pageInfo(page); + break; + } + case "click": { + const [selector] = opts.args; + if (!selector) throw new Error("click requires "); + await page.deepLocator(selector).click(); + result = { clicked: true, selector }; + break; + } + case "fill": { + const [selector, 
...valueParts] = opts.args; + if (!selector || valueParts.length === 0) { + throw new Error("fill requires "); + } + const value = valueParts.join(" "); + await page.deepLocator(selector).fill(value); + result = { filled: true, selector, value }; + break; + } + case "type": { + const [selector, ...textParts] = opts.args; + if (!selector || textParts.length === 0) { + throw new Error("type requires "); + } + const text = textParts.join(" "); + await page.deepLocator(selector).type(text); + result = { typed: true, selector, text }; + break; + } + case "press": { + const [key] = opts.args; + if (!key) throw new Error("press requires "); + await page.keyPress(key); + result = { pressed: key }; + break; + } + case "text": { + const selector = opts.args[0] || "body"; + const text = await page.deepLocator(selector).innerText(); + result = { selector, text }; + break; + } + case "html": { + const selector = opts.args[0] || "body"; + const html = await page.deepLocator(selector).innerHtml(); + result = { selector, html }; + break; + } + case "screenshot": { + const [outPath] = opts.args; + if (!outPath) throw new Error("screenshot requires "); + const buffer = await page.screenshot({ fullPage: opts.fullPage }); + await writeFile(outPath, buffer); + result = { screenshot: outPath, fullPage: opts.fullPage }; + break; + } + default: + throw new Error(`Unknown command: ${opts.command}`); + } + + console.log( + JSON.stringify( + { + ok: true, + command: opts.command, + targetId: page.targetId(), + url: page.url(), + result, + }, + null, + 2, + ), + ); +} + +main() + .then(() => process.exit(0)) + .catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + });