From 3aad6d725f38f795f3d1d19d339063ecad4098ea Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 02:04:28 +0530 Subject: [PATCH 01/33] feat(init): one-command setup + zero-config Vite plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collapse Iris integration from a 6-step manual process into `npx @syrin/iris init` plus a dev-server restart, and add a Vite plugin that makes the whole React integration a single config line. @syrin/iris-vite-plugin (exposed as @syrin/iris/vite): - iris() with apply:'serve' so Vite drops it from `vite build` entirely — production-leak protection is structural, not a user-managed env gate - reuses the babel plugin to stamp data-iris-source, and injects a dev-only iris.connect() via transformIndexHtml (no entry-file edit needed) - options: port/session/token/sourceMapping/inject; non-default port is baked into the injected connect() url - integration test runs Vite's real resolveConfig to prove the plugin is filtered out of the build pipeline and present in serve iris init: - detects framework (Next/Vite/HTML), React major, and package manager - merges an iris entry into .mcp.json without clobbering, runs the dependency install, and for Vite patches the config with iris() - bail-to-manual safety: only auto-patches the obvious shape, otherwise prints the exact paste-in snippet rather than half-editing a build config; Next config wrap + layout mount are intentionally manual - pure modules (detect/mcp-config/vite-config/plan) + a thin injectable IO shell, fully unit-tested; --dry-run, --no-mcp, --no-install, --yes, --port Docs: getting-started.md now leads with the init fast path and recommends the Vite plugin. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/getting-started.md | 61 +++++- packages/iris/package.json | 5 + packages/iris/src/vite.ts | 3 + packages/iris/tsup.config.ts | 1 + packages/server/src/cli.test.ts | 39 ++++ packages/server/src/cli.ts | 66 ++++++ packages/server/src/init/detect.test.ts | 92 +++++++++ packages/server/src/init/detect.ts | 110 ++++++++++ packages/server/src/init/mcp-config.test.ts | 45 ++++ packages/server/src/init/mcp-config.ts | 57 +++++ packages/server/src/init/node-io.ts | 47 +++++ packages/server/src/init/plan.test.ts | 94 +++++++++ packages/server/src/init/plan.ts | 194 ++++++++++++++++++ packages/server/src/init/run.test.ts | 104 ++++++++++ packages/server/src/init/run.ts | 146 +++++++++++++ packages/server/src/init/snippets.ts | 61 ++++++ packages/server/src/init/vite-config.test.ts | 48 +++++ packages/server/src/init/vite-config.ts | 53 +++++ packages/vite-plugin/README.md | 45 ++++ packages/vite-plugin/package.json | 59 ++++++ .../vite-plugin/src/build.integration.test.ts | 49 +++++ packages/vite-plugin/src/index.test.ts | 67 ++++++ packages/vite-plugin/src/index.ts | 108 ++++++++++ packages/vite-plugin/tsconfig.json | 11 + pnpm-lock.yaml | 68 +++++- tsconfig.json | 3 +- 26 files changed, 1626 insertions(+), 10 deletions(-) create mode 100644 packages/iris/src/vite.ts create mode 100644 packages/server/src/init/detect.test.ts create mode 100644 packages/server/src/init/detect.ts create mode 100644 packages/server/src/init/mcp-config.test.ts create mode 100644 packages/server/src/init/mcp-config.ts create mode 100644 packages/server/src/init/node-io.ts create mode 100644 packages/server/src/init/plan.test.ts create mode 100644 packages/server/src/init/plan.ts create mode 100644 packages/server/src/init/run.test.ts create mode 100644 packages/server/src/init/run.ts create mode 100644 packages/server/src/init/snippets.ts create mode 100644 packages/server/src/init/vite-config.test.ts create mode 100644 
packages/server/src/init/vite-config.ts create mode 100644 packages/vite-plugin/README.md create mode 100644 packages/vite-plugin/package.json create mode 100644 packages/vite-plugin/src/build.integration.test.ts create mode 100644 packages/vite-plugin/src/index.test.ts create mode 100644 packages/vite-plugin/src/index.ts create mode 100644 packages/vite-plugin/tsconfig.json diff --git a/docs/getting-started.md b/docs/getting-started.md index c88c998..7c3a957 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -48,6 +48,31 @@ Everything is **dev-only** and **localhost-only**. It's tree-shaken out of produ --- +## Fastest path — `iris init` + +From your project root: + +```bash +npx @syrin/iris init +``` + +It detects your framework, package manager, and React version, then: + +- merges an `iris` entry into `.mcp.json` (never clobbering an existing one), +- installs `@syrin/iris` as a dev dependency, +- **Vite:** adds the `iris()` plugin to your config — which wires source mapping _and_ + `iris.connect()` for you, so there is nothing else to edit, +- **Next / other:** creates the dev component and prints the exact `withIris` / mount / connect + snippets to paste (it never half-edits a build config). + +Re-running is safe (already-done steps are skipped). Preview without writing via +`npx @syrin/iris init --dry-run`. Flags: `--port N`, `--no-mcp`, `--no-install`, `--yes`. + +Then restart your dev server and skip to [Step 4](#step-4--run-it--verify-the-connection). The +manual steps below explain what `init` sets up, if you prefer to wire it yourself. + +--- + ## Step 1 — Connect your coding agent (MCP) You don't start the server manually — your agent starts it via MCP. Add Iris to your agent's @@ -95,7 +120,27 @@ Then call `iris.connect()` once, in dev only. 
Where you put it depends on your f ### Vite + React -In your entry file (`src/main.tsx`): +**Recommended — the Vite plugin (one line, does everything).** Add `iris()` to your +`vite.config.ts`: + +```ts +import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react'; +import { iris } from '@syrin/iris/vite'; + +export default defineConfig({ + plugins: [react(), iris()], +}); +``` + +This injects `iris.connect()` for you _and_ handles React 19 source mapping (Step 3) — so there's +no entry-file edit and no separate Babel setup. `apply: 'serve'` means it's dropped from +`vite build` entirely, so it can never reach production. (This is exactly what `iris init` adds.) + +
+Prefer to wire it by hand instead of the plugin? + +In your entry file (`src/main.tsx`), call `connect()` in dev only: ```ts import { StrictMode } from 'react'; @@ -114,6 +159,11 @@ createRoot(document.getElementById('root')!).render( ); ``` +On React 19 you then also need the source-mapping Babel plugin from Step 3. The Vite plugin +above bundles both, which is why it's the recommended path. + +
+ ### Next.js Create a tiny client component and mount it in your root layout, dev-only: @@ -186,8 +236,10 @@ if (import.meta.env.DEV) installIrisReact(); // call before iris.connect() **React ≤ 18:** that's all — it uses React's dev `_debugSource`. -**React 19:** React removed `_debugSource`, so add the Babel plugin (also bundled in -`@syrin/iris`, at `@syrin/iris/babel`) to stamp the source onto elements in dev: +**React 19:** React removed `_debugSource`, so the source has to be stamped at build time. +**If you added the `iris()` Vite plugin in Step 2, this is already handled — skip ahead.** +Otherwise add the Babel plugin (also bundled in `@syrin/iris`, at `@syrin/iris/babel`) to stamp +the source onto elements in dev: ```ts // vite.config.ts @@ -355,8 +407,7 @@ Everything below comes from the single `@syrin/iris` install. | Stack | SDK connect | Source mapping | | ---------------------- | ------------------------------------------ | ------------------------------------------------------- | -| Vite + React 19 | `iris.connect()` in `main.tsx` (dev) | `install()` from `@syrin/iris` + `@syrin/iris/babel` | -| Vite + React ≤18 | same | `install()` from `@syrin/iris` (no plugin needed) | +| Vite + React (any) | `iris()` plugin (auto) — or `connect()` | `iris()` plugin handles it (incl. 
React 19) | | Next.js (app router) | `IrisDev` client component in layout (dev) | `@syrin/iris/next` (`withIris`) → component + file:line | | Vue / Svelte / vanilla | `iris.connect()` at boot (dev) | core works; framework adapters on the roadmap | diff --git a/packages/iris/package.json b/packages/iris/package.json index 8f7be63..2336abb 100644 --- a/packages/iris/package.json +++ b/packages/iris/package.json @@ -32,6 +32,10 @@ "types": "./dist/babel.d.ts", "default": "./dist/babel.js" }, + "./vite": { + "types": "./dist/vite.d.ts", + "default": "./dist/vite.js" + }, "./eslint": { "types": "./dist/eslint.d.ts", "default": "./dist/eslint.js" @@ -61,6 +65,7 @@ "@syrin/iris-react": "workspace:*", "@syrin/iris-server": "workspace:*", "@syrin/iris-test": "workspace:*", + "@syrin/iris-vite-plugin": "workspace:*", "@types/ws": "^8.5.13", "tsup": "^8.3.5", "typescript": "^5.6.0" diff --git a/packages/iris/src/vite.ts b/packages/iris/src/vite.ts new file mode 100644 index 0000000..94bfdfb --- /dev/null +++ b/packages/iris/src/vite.ts @@ -0,0 +1,3 @@ +// '@syrin/iris/vite' → the Vite plugin: dev-only source mapping + auto-injected connect() (bundled). 
+export { iris, IRIS_VITE_PLUGIN_NAME } from '@syrin/iris-vite-plugin'; +export type { IrisVitePlugin, IrisVitePluginOptions } from '@syrin/iris-vite-plugin'; diff --git a/packages/iris/tsup.config.ts b/packages/iris/tsup.config.ts index 3f0e175..df934f4 100644 --- a/packages/iris/tsup.config.ts +++ b/packages/iris/tsup.config.ts @@ -9,6 +9,7 @@ export default defineConfig({ test: 'src/test.ts', // './test' — declarative spec runner next: 'src/next.ts', // './next' — Next.js source mapping babel: 'src/babel.ts', // './babel' — React 19 babel plugin + vite: 'src/vite.ts', // './vite' — Vite plugin (source mapping + connect injection) eslint: 'src/eslint.ts', // './eslint' — require-signal-on-mutation rule cli: 'src/cli.ts', // bin — the `iris` CLI }, diff --git a/packages/server/src/cli.test.ts b/packages/server/src/cli.test.ts index 1bb30da..92cfa12 100644 --- a/packages/server/src/cli.test.ts +++ b/packages/server/src/cli.test.ts @@ -39,6 +39,45 @@ describe('parseCliArgs', () => { }); }); + it('init with no flags defaults to mcp + install on, no dry run, no port', () => { + expect(parseCliArgs(['init'], PORT)).toEqual({ + kind: 'init', + port: undefined, + mcp: true, + dryRun: false, + install: true, + }); + }); + + it('init --dry-run --no-mcp --no-install --port sets each flag', () => { + expect( + parseCliArgs(['init', '--dry-run', '--no-mcp', '--no-install', '--port', '4500'], PORT), + ).toEqual({ + kind: 'init', + port: 4500, + mcp: false, + dryRun: true, + install: false, + }); + }); + + it('init --yes is accepted', () => { + expect(parseCliArgs(['init', '--yes'], PORT)).toEqual({ + kind: 'init', + port: undefined, + mcp: true, + dryRun: false, + install: true, + }); + }); + + it('init rejects unknown flags', () => { + expect(parseCliArgs(['init', '--bogus'], PORT)).toEqual({ + kind: 'error', + message: CLI_USAGE, + }); + }); + it('stop returns stop result with quiet false', () => { expect(parseCliArgs(['stop'], PORT)).toEqual({ kind: 'stop', port: PORT, 
quiet: false }); }); diff --git a/packages/server/src/cli.ts b/packages/server/src/cli.ts index a9fe3fe..5ba2d92 100644 --- a/packages/server/src/cli.ts +++ b/packages/server/src/cli.ts @@ -5,15 +5,19 @@ import { start, startDaemon } from './index.js'; import { log } from './log.js'; import { readPid, isAlive, isRunning, removePid, spawnDaemon } from './daemon.js'; import { waitForDaemon, startMcpProxy, probeDaemon } from './mcp-proxy.js'; +import { runInit } from './init/run.js'; +import { buildNodeIo } from './init/node-io.js'; import type { StartOptions } from './index.js'; export const CLI_USAGE = `usage: + iris init [--yes] [--dry-run] [--port N] [--no-mcp] [--no-install] (wire Iris into the project in this directory) iris serve [--port N] [--drive ] [--headed] iris stop [--port N] [--quiet] iris status [--port N] iris drive [--headed] (foreground mode — for debugging) iris mcp [--port N] [--drive ] [--headed] (MCP stdio proxy — auto-starts daemon if needed)`; +const INIT_COMMAND = 'init'; const SERVE_COMMAND = 'serve'; const STOP_COMMAND = 'stop'; const STATUS_COMMAND = 'status'; @@ -25,8 +29,13 @@ const HEADED_FLAG = '--headed'; const PORT_FLAG = '--port'; const DRIVE_FLAG = '--drive'; const QUIET_FLAG = '--quiet'; +const DRY_RUN_FLAG = '--dry-run'; +const YES_FLAG = '--yes'; +const NO_MCP_FLAG = '--no-mcp'; +const NO_INSTALL_FLAG = '--no-install'; export type CliResult = + | { kind: 'init'; port: number | undefined; mcp: boolean; dryRun: boolean; install: boolean } | { kind: 'serve'; port: number; driveUrl?: string; headless: boolean } | { kind: 'stop'; port: number; quiet: boolean } | { kind: 'status'; port: number } @@ -98,6 +107,41 @@ function parseDriveSuffix(args: string[], port: number): DriveSuffix { return { kind: 'ok', port, driveUrl, headless }; } +type InitFlags = + | { kind: 'ok'; port: number | undefined; mcp: boolean; dryRun: boolean; install: boolean } + | { kind: 'error'; message: string }; + +function parseInitFlags(args: string[]): 
InitFlags { + let port: number | undefined; + let mcp = true; + let dryRun = false; + let install = true; + let i = 0; + while (i < args.length) { + const arg = args[i]; + if (arg === PORT_FLAG) { + i++; + const n = args[i]; + if (n === undefined) return { kind: 'error', message: CLI_USAGE }; + const parsed = parseInt(n, 10); + if (isNaN(parsed)) return { kind: 'error', message: CLI_USAGE }; + port = parsed; + } else if (arg === NO_MCP_FLAG) { + mcp = false; + } else if (arg === NO_INSTALL_FLAG) { + install = false; + } else if (arg === DRY_RUN_FLAG) { + dryRun = true; + } else if (arg === YES_FLAG) { + // Accepted for scripting/CI; init has no interactive prompts today. + } else { + return { kind: 'error', message: CLI_USAGE }; + } + i++; + } + return { kind: 'ok', port, mcp, dryRun, install }; +} + /** Pure CLI arg parser — exported for unit tests. argv = process.argv.slice(2). */ export function parseCliArgs(argv: string[], defaultPort: number): CliResult { if (argv.length === 0) return { kind: 'serve', port: defaultPort, headless: true }; @@ -105,6 +149,11 @@ export function parseCliArgs(argv: string[], defaultPort: number): CliResult { const [cmd, ...rest] = argv; switch (cmd) { + case INIT_COMMAND: { + const r = parseInitFlags(rest); + if (r.kind === 'error') return r; + return { kind: 'init', port: r.port, mcp: r.mcp, dryRun: r.dryRun, install: r.install }; + } case SERVE_COMMAND: { const r = parseServeFlags(rest, defaultPort); if (r.kind === 'error') return r; @@ -154,6 +203,20 @@ export function parseCliArgs(argv: string[], defaultPort: number): CliResult { } } +function handleInit(parsed: { + port: number | undefined; + mcp: boolean; + dryRun: boolean; + install: boolean; +}): void { + const cwd = process.cwd(); + const result = runInit( + { cwd, port: parsed.port, mcp: parsed.mcp, dryRun: parsed.dryRun, install: parsed.install }, + buildNodeIo(cwd), + ); + if (!result.ok) process.exit(1); +} + function handleServe(parsed: { port: number; driveUrl?: 
string; headless: boolean }): void { if (isRunning(parsed.port)) { log('iris_daemon_already_running', { port: parsed.port }); @@ -308,6 +371,9 @@ function main(): void { log('iris_usage_error', { message: parsed.message }); process.exit(1); break; + case 'init': + handleInit(parsed); + break; case 'serve': handleServe(parsed); break; diff --git a/packages/server/src/init/detect.test.ts b/packages/server/src/init/detect.test.ts new file mode 100644 index 0000000..c89b41b --- /dev/null +++ b/packages/server/src/init/detect.test.ts @@ -0,0 +1,92 @@ +import { describe, expect, it } from 'vitest'; +import { + detect, + parseMajor, + installCommand, + Framework, + PackageManager, + type DetectInput, +} from './detect.js'; + +function input(partial: Partial): DetectInput { + return { + pkg: partial.pkg ?? {}, + configFiles: partial.configFiles ?? new Set(), + lockfiles: partial.lockfiles ?? new Set(), + }; +} + +describe('parseMajor', () => { + it('reads the major from common range forms', () => { + expect(parseMajor('^19.0.0')).toBe(19); + expect(parseMajor('~18.2.1')).toBe(18); + expect(parseMajor('19.1.1')).toBe(19); + expect(parseMajor('>=18')).toBe(18); + }); + it('returns undefined for missing/garbage', () => { + expect(parseMajor(undefined)).toBeUndefined(); + expect(parseMajor('latest')).toBeUndefined(); + }); +}); + +describe('detect framework', () => { + it('detects next from the dependency', () => { + expect(detect(input({ pkg: { dependencies: { next: '15.0.0' } } })).framework).toBe( + Framework.NEXT, + ); + }); + it('detects next from a config file even without the dep listed', () => { + expect(detect(input({ configFiles: new Set(['next.config.mjs']) })).framework).toBe( + Framework.NEXT, + ); + }); + it('detects vite from the dependency', () => { + expect(detect(input({ pkg: { devDependencies: { vite: '^5.0.0' } } })).framework).toBe( + Framework.VITE, + ); + }); + it('falls back to html when no bundler is present', () => { + expect(detect(input({ pkg: { 
dependencies: { react: '^18' } } })).framework).toBe( + Framework.HTML, + ); + }); + it('prefers next over vite when both are present', () => { + expect(detect(input({ pkg: { dependencies: { next: '15', vite: '5' } } })).framework).toBe( + Framework.NEXT, + ); + }); +}); + +describe('detect source mapping need', () => { + it('flags React 19 as needing source mapping', () => { + const d = detect(input({ pkg: { dependencies: { react: '^19.0.0', vite: '5' } } })); + expect(d.reactMajor).toBe(19); + expect(d.needsSourceMapping).toBe(true); + }); + it('does not flag React 18', () => { + const d = detect(input({ pkg: { dependencies: { react: '^18.2.0', vite: '5' } } })); + expect(d.needsSourceMapping).toBe(false); + }); +}); + +describe('detect package manager', () => { + it('reads the lockfile', () => { + expect(detect(input({ lockfiles: new Set(['pnpm-lock.yaml']) })).packageManager).toBe( + PackageManager.PNPM, + ); + expect(detect(input({ lockfiles: new Set(['yarn.lock']) })).packageManager).toBe( + PackageManager.YARN, + ); + expect(detect(input({ lockfiles: new Set(['bun.lockb']) })).packageManager).toBe( + PackageManager.BUN, + ); + expect(detect(input({})).packageManager).toBe(PackageManager.NPM); + }); +}); + +describe('installCommand', () => { + it('renders the dev-install command per manager', () => { + expect(installCommand(PackageManager.PNPM, '@syrin/iris')).toBe('pnpm add -D @syrin/iris'); + expect(installCommand(PackageManager.NPM, '@syrin/iris')).toBe('npm i -D @syrin/iris'); + }); +}); diff --git a/packages/server/src/init/detect.ts b/packages/server/src/init/detect.ts new file mode 100644 index 0000000..f2b4b9e --- /dev/null +++ b/packages/server/src/init/detect.ts @@ -0,0 +1,110 @@ +/** + * Pure framework + toolchain detection for `iris init`. No filesystem access — callers pass in + * the parsed package.json and the set of config/lock filenames present in the project root. 
+ */ + +export const Framework = { + NEXT: 'next', + VITE: 'vite', + HTML: 'html', +} as const; +export type Framework = (typeof Framework)[keyof typeof Framework]; + +export const PackageManager = { + PNPM: 'pnpm', + YARN: 'yarn', + BUN: 'bun', + NPM: 'npm', +} as const; +export type PackageManager = (typeof PackageManager)[keyof typeof PackageManager]; + +export interface PackageJsonLike { + dependencies?: Record; + devDependencies?: Record; + peerDependencies?: Record; +} + +export interface DetectInput { + pkg: PackageJsonLike; + /** Basenames of config files present in the project root (e.g. 'next.config.mjs'). */ + configFiles: ReadonlySet; + /** Lockfile basenames present in the project root. */ + lockfiles: ReadonlySet; +} + +export interface Detection { + framework: Framework; + reactMajor: number | undefined; + /** React 19 dropped _debugSource, so it needs the build-time source-map stamp. */ + needsSourceMapping: boolean; + packageManager: PackageManager; +} + +const NEXT_CONFIGS = ['next.config.js', 'next.config.mjs', 'next.config.ts', 'next.config.cjs']; +const VITE_CONFIGS = ['vite.config.js', 'vite.config.ts', 'vite.config.mjs', 'vite.config.mts']; + +function depVersion(pkg: PackageJsonLike, name: string): string | undefined { + return pkg.dependencies?.[name] ?? pkg.devDependencies?.[name] ?? pkg.peerDependencies?.[name]; +} + +function hasAnyConfig(files: ReadonlySet, candidates: readonly string[]): boolean { + return candidates.some((c) => files.has(c)); +} + +/** Extract the leading major version from a semver range like "^19.0.0" or "19.1.1". */ +export function parseMajor(range: string | undefined): number | undefined { + if (range === undefined) return undefined; + const match = range.match(/(\d+)/); + if (match === null || match[1] === undefined) return undefined; + const major = parseInt(match[1], 10); + return isNaN(major) ? 
undefined : major; +} + +function detectPackageManager(lockfiles: ReadonlySet): PackageManager { + if (lockfiles.has('pnpm-lock.yaml')) return PackageManager.PNPM; + if (lockfiles.has('yarn.lock')) return PackageManager.YARN; + if (lockfiles.has('bun.lockb') || lockfiles.has('bun.lock')) return PackageManager.BUN; + return PackageManager.NPM; +} + +function detectFramework(input: DetectInput): Framework { + const { pkg, configFiles } = input; + if (depVersion(pkg, 'next') !== undefined || hasAnyConfig(configFiles, NEXT_CONFIGS)) { + return Framework.NEXT; + } + if (depVersion(pkg, 'vite') !== undefined || hasAnyConfig(configFiles, VITE_CONFIGS)) { + return Framework.VITE; + } + return Framework.HTML; +} + +export function detect(input: DetectInput): Detection { + const reactMajor = parseMajor(depVersion(input.pkg, 'react')); + return { + framework: detectFramework(input), + reactMajor, + needsSourceMapping: reactMajor !== undefined && reactMajor >= 19, + packageManager: detectPackageManager(input.lockfiles), + }; +} + +const INSTALL_ARGS: Record = { + [PackageManager.PNPM]: ['add', '-D'], + [PackageManager.YARN]: ['add', '-D'], + [PackageManager.BUN]: ['add', '-d'], + [PackageManager.NPM]: ['i', '-D'], +}; + +export interface InstallCommand { + command: string; + args: string[]; +} + +export function installCommandParts(pm: PackageManager, pkg: string): InstallCommand { + return { command: pm, args: [...INSTALL_ARGS[pm], pkg] }; +} + +export function installCommand(pm: PackageManager, pkg: string): string { + const { command, args } = installCommandParts(pm, pkg); + return `${command} ${args.join(' ')}`; +} diff --git a/packages/server/src/init/mcp-config.test.ts b/packages/server/src/init/mcp-config.test.ts new file mode 100644 index 0000000..78f7e77 --- /dev/null +++ b/packages/server/src/init/mcp-config.test.ts @@ -0,0 +1,45 @@ +import { describe, expect, it } from 'vitest'; +import { mergeMcpConfig, McpMergeStatus } from './mcp-config.js'; + +interface McpShape 
{ + mcpServers: Record; +} + +function parse(content: string): McpShape { + return JSON.parse(content) as McpShape; +} + +describe('mergeMcpConfig', () => { + it('creates a fresh config when none exists', () => { + const r = mergeMcpConfig(null, undefined); + expect(r.status).toBe(McpMergeStatus.APPLY); + const parsed = parse(r.content); + expect(parsed.mcpServers['iris']).toEqual({ command: 'npx', args: ['@syrin/iris', 'mcp'] }); + expect(r.content.endsWith('\n')).toBe(true); + }); + + it('bakes a port into the args when provided', () => { + const r = mergeMcpConfig(null, 4500); + expect(parse(r.content).mcpServers['iris']?.args).toEqual([ + '@syrin/iris', + 'mcp', + '--port', + '4500', + ]); + }); + + it('preserves other servers when adding iris', () => { + const existing = JSON.stringify({ mcpServers: { other: { command: 'x' } } }); + const r = mergeMcpConfig(existing, undefined); + const parsed = parse(r.content); + expect(parsed.mcpServers['other']).toEqual({ command: 'x' }); + expect(parsed.mcpServers['iris']).toBeDefined(); + }); + + it('never clobbers an existing iris entry (idempotent)', () => { + const existing = JSON.stringify({ mcpServers: { iris: { command: 'custom' } } }); + const r = mergeMcpConfig(existing, undefined); + expect(r.status).toBe(McpMergeStatus.ALREADY); + expect(r.content).toBe(existing); + }); +}); diff --git a/packages/server/src/init/mcp-config.ts b/packages/server/src/init/mcp-config.ts new file mode 100644 index 0000000..475f445 --- /dev/null +++ b/packages/server/src/init/mcp-config.ts @@ -0,0 +1,57 @@ +/** + * Pure merge of an Iris server entry into a project's `.mcp.json`. Never clobbers an existing + * `iris` entry — adoption must be idempotent and safe to re-run. 
+ */ + +const SERVER_KEY = 'iris'; +const NPX_COMMAND = 'npx'; +const IRIS_PACKAGE = '@syrin/iris'; +const MCP_SUBCOMMAND = 'mcp'; +const PORT_FLAG = '--port'; + +export const McpMergeStatus = { + APPLY: 'apply', + ALREADY: 'already', +} as const; +export type McpMergeStatus = (typeof McpMergeStatus)[keyof typeof McpMergeStatus]; + +export interface McpMergeResult { + status: McpMergeStatus; + /** Full file content to write (2-space JSON, trailing newline). Unchanged when `already`. */ + content: string; +} + +interface McpConfigShape { + mcpServers?: Record; + [key: string]: unknown; +} + +function irisServerEntry(port: number | undefined): Record { + const args = + port === undefined + ? [IRIS_PACKAGE, MCP_SUBCOMMAND] + : [IRIS_PACKAGE, MCP_SUBCOMMAND, PORT_FLAG, String(port)]; + return { command: NPX_COMMAND, args }; +} + +function parseConfig(existing: string | null): McpConfigShape { + if (existing === null || existing.trim().length === 0) return {}; + const parsed: unknown = JSON.parse(existing); + if (typeof parsed !== 'object' || parsed === null) return {}; + return parsed as McpConfigShape; +} + +export function mergeMcpConfig(existing: string | null, port: number | undefined): McpMergeResult { + const config = parseConfig(existing); + const servers = config.mcpServers ?? {}; + + if (Object.prototype.hasOwnProperty.call(servers, SERVER_KEY)) { + return { status: McpMergeStatus.ALREADY, content: existing ?? '' }; + } + + const merged: McpConfigShape = { + ...config, + mcpServers: { ...servers, [SERVER_KEY]: irisServerEntry(port) }, + }; + return { status: McpMergeStatus.APPLY, content: `${JSON.stringify(merged, null, 2)}\n` }; +} diff --git a/packages/server/src/init/node-io.ts b/packages/server/src/init/node-io.ts new file mode 100644 index 0000000..5a36068 --- /dev/null +++ b/packages/server/src/init/node-io.ts @@ -0,0 +1,47 @@ +/** + * The real (Node filesystem) implementation of `InitIo`. 
Kept separate from the pure runner so + * `runInit` stays testable with an in-memory IO. Prints to stdout — `init` is a one-shot CLI + * command, not the MCP stdio transport. + */ + +import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { spawnSync } from 'node:child_process'; +import type { InitIo } from './run.js'; + +export function buildNodeIo(cwd: string): InitIo { + const abs = (rel: string): string => join(cwd, rel); + return { + readFile(rel) { + const path = abs(rel); + if (!existsSync(path)) return null; + return readFileSync(path, 'utf8'); + }, + writeFile(rel, content) { + const path = abs(rel); + mkdirSync(dirname(path), { recursive: true }); + writeFileSync(path, content, 'utf8'); + }, + exists(rel) { + return existsSync(abs(rel)); + }, + rootFiles() { + return readdirSync(cwd).filter((name) => { + try { + return statSync(join(cwd, name)).isFile(); + } catch { + return false; + } + }); + }, + exec(command, args) { + // `shell: true` lets package-manager shims (pnpm.cmd, etc.) resolve on Windows; inherit + // stdio so the install's own progress is visible to the user. 
+ const result = spawnSync(command, [...args], { cwd, stdio: 'inherit', shell: true }); + return result.status === 0; + }, + print(line) { + process.stdout.write(`${line}\n`); + }, + }; +} diff --git a/packages/server/src/init/plan.test.ts b/packages/server/src/init/plan.test.ts new file mode 100644 index 0000000..d0492b8 --- /dev/null +++ b/packages/server/src/init/plan.test.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from 'vitest'; +import { buildPlan, StepStatus, type PlanInput } from './plan.js'; +import { Framework, PackageManager, type Detection } from './detect.js'; + +function detection(framework: Framework, reactMajor = 19): Detection { + return { + framework, + reactMajor, + needsSourceMapping: reactMajor >= 19, + packageManager: PackageManager.PNPM, + }; +} + +function input(partial: Partial): PlanInput { + return { + detection: partial.detection ?? detection(Framework.VITE), + mcpJson: partial.mcpJson ?? null, + viteConfig: partial.viteConfig ?? null, + nextConfigFile: partial.nextConfigFile ?? null, + nextIrisDevExists: partial.nextIrisDevExists ?? false, + options: partial.options ?? 
{ port: undefined, mcp: true, install: false }, + }; +} + +const VITE_SRC = `import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react'; +export default defineConfig({ plugins: [react()] }); +`; + +function step(plan: ReturnType, title: string) { + const s = plan.steps.find((x) => x.title === title); + if (s === undefined) throw new Error(`no step ${title}`); + return s; +} + +describe('buildPlan — Vite', () => { + it('writes .mcp.json and patches the vite config (no entry edit needed)', () => { + const plan = buildPlan(input({ viteConfig: { path: 'vite.config.ts', source: VITE_SRC } })); + expect(step(plan, 'MCP config').status).toBe(StepStatus.APPLY); + expect(step(plan, 'MCP config').write).toBeDefined(); + expect(step(plan, 'Vite plugin').status).toBe(StepStatus.APPLY); + expect(step(plan, 'Vite plugin').write?.content).toContain('@syrin/iris/vite'); + // The plugin injects connect(), so there is no separate entry-file step. + expect(plan.steps.some((s) => s.title.includes('entry'))).toBe(false); + }); + + it('bails to manual when there is no vite config file', () => { + const plan = buildPlan(input({ viteConfig: null })); + expect(step(plan, 'Vite plugin').status).toBe(StepStatus.MANUAL); + }); + + it('skips MCP under --no-mcp', () => { + const plan = buildPlan(input({ options: { port: undefined, mcp: false, install: false } })); + expect(step(plan, 'MCP config').status).toBe(StepStatus.SKIP); + }); + + it('makes install an exec step when install is enabled, manual otherwise', () => { + const off = buildPlan(input({ options: { port: undefined, mcp: true, install: false } })); + expect(step(off, 'Install dependency').status).toBe(StepStatus.MANUAL); + expect(step(off, 'Install dependency').exec).toBeUndefined(); + + const on = buildPlan(input({ options: { port: undefined, mcp: true, install: true } })); + const s = step(on, 'Install dependency'); + expect(s.status).toBe(StepStatus.APPLY); + expect(s.exec?.command).toBe('pnpm'); + 
expect(s.exec?.args).toEqual(['add', '-D', '@syrin/iris']); + }); +}); + +describe('buildPlan — Next', () => { + it('creates iris-dev.tsx and bails config + mount to manual', () => { + const plan = buildPlan( + input({ detection: detection(Framework.NEXT), nextConfigFile: 'next.config.mjs' }), + ); + expect(step(plan, 'IrisDev component').status).toBe(StepStatus.APPLY); + expect(step(plan, 'Next config (withIris)').status).toBe(StepStatus.MANUAL); + expect(step(plan, 'Mount IrisDev').status).toBe(StepStatus.MANUAL); + }); + + it('marks iris-dev.tsx already when it exists', () => { + const plan = buildPlan( + input({ detection: detection(Framework.NEXT), nextIrisDevExists: true }), + ); + expect(step(plan, 'IrisDev component').status).toBe(StepStatus.ALREADY); + }); +}); + +describe('buildPlan — HTML', () => { + it('gives an MCP write plus a manual connect snippet', () => { + const plan = buildPlan(input({ detection: detection(Framework.HTML, 0) })); + expect(step(plan, 'MCP config').status).toBe(StepStatus.APPLY); + expect(step(plan, 'Connect snippet').status).toBe(StepStatus.MANUAL); + }); +}); diff --git a/packages/server/src/init/plan.ts b/packages/server/src/init/plan.ts new file mode 100644 index 0000000..296ed14 --- /dev/null +++ b/packages/server/src/init/plan.ts @@ -0,0 +1,194 @@ +/** + * Pure assembly of the `iris init` action plan. Given the detection result and the relevant file + * contents, produce an ordered list of steps — each marked apply / manual / already / skip. The + * runner performs the `write` side-effects; this module decides *what* should happen. 
/** Package the install step adds to the host project. */
const IRIS_PACKAGE = '@syrin/iris';
/** Project-relative path of the MCP client config that the MCP step targets. */
const MCP_FILE = '.mcp.json';

/** Outcome assigned to each plan step. */
export const StepStatus = {
  APPLY: 'apply',
  MANUAL: 'manual',
  ALREADY: 'already',
  SKIP: 'skip',
} as const;
export type StepStatus = (typeof StepStatus)[keyof typeof StepStatus];

/** One entry of the plan: what it is, which file it concerns, and what should happen. */
export interface Step {
  title: string;
  target: string;
  status: StepStatus;
  /** Short description, or the snippet/command to show for a manual step. */
  detail: string;
  /** Present only when status is APPLY and a file must be written. */
  write?: { path: string; content: string };
  /** Present only when status is APPLY and a subprocess must run (the dependency install). */
  exec?: { command: string; args: string[]; fallback: string };
}

/** The ordered steps for one detected framework. */
export interface Plan {
  framework: Framework;
  steps: Step[];
}

/** Everything `buildPlan` needs; gathered by the caller so this module stays free of IO. */
export interface PlanInput {
  detection: Detection;
  /** Current `.mcp.json` content, or null if absent. */
  mcpJson: string | null;
  /** Discovered Vite config: its path + source, or null if none found. */
  viteConfig: { path: string; source: string } | null;
  /** Discovered Next config filename (e.g. 'next.config.mjs'), or null. */
  nextConfigFile: string | null;
  /** Whether app/iris-dev.tsx already exists. */
  nextIrisDevExists: boolean;
  options: { port: number | undefined; mcp: boolean; install: boolean };
}

/** Step for `.mcp.json`: skipped under `--no-mcp`, otherwise the result of merging an iris entry. */
function mcpStep(input: PlanInput): Step {
  const head = { title: 'MCP config', target: MCP_FILE };
  if (!input.options.mcp) {
    return { ...head, status: StepStatus.SKIP, detail: '--no-mcp' };
  }
  const merged = mergeMcpConfig(input.mcpJson, input.options.port);
  if (merged.status === McpMergeStatus.ALREADY) {
    return { ...head, status: StepStatus.ALREADY, detail: 'iris server already configured' };
  }
  return {
    ...head,
    status: StepStatus.APPLY,
    detail: 'add iris MCP server',
    write: { path: MCP_FILE, content: merged.content },
  };
}

/**
 * Step for the dependency install. With install disabled it is a manual step whose detail is the
 * command to run; with install enabled it carries an `exec` and keeps that command as fallback.
 */
function installStep(input: PlanInput): Step {
  const manager = input.detection.packageManager;
  const commandLine = installCommand(manager, IRIS_PACKAGE);
  const head = { title: 'Install dependency', target: 'package.json' };
  if (input.options.install) {
    const { command, args } = installCommandParts(manager, IRIS_PACKAGE);
    return {
      ...head,
      status: StepStatus.APPLY,
      detail: commandLine,
      exec: { command, args, fallback: commandLine },
    };
  }
  return { ...head, status: StepStatus.MANUAL, detail: commandLine };
}

/** Vite steps: patch the discovered config when the patcher can, otherwise print the snippet. */
function viteSteps(input: PlanInput): Step[] {
  const title = 'Vite plugin';
  const found = input.viteConfig;
  if (found === null) {
    return [{ title, target: 'vite.config', status: StepStatus.MANUAL, detail: VITE_MANUAL }];
  }
  const patch = patchViteConfig(found.source);
  switch (patch.kind) {
    case VitePatchKind.ALREADY:
      return [
        {
          title,
          target: found.path,
          status: StepStatus.ALREADY,
          detail: 'iris() already in plugins',
        },
      ];
    case VitePatchKind.MANUAL:
      return [
        {
          title,
          target: found.path,
          status: StepStatus.MANUAL,
          detail: `${patch.reason}\n\n${VITE_MANUAL}`,
        },
      ];
    default:
      return [
        {
          title,
          target: found.path,
          status: StepStatus.APPLY,
          detail: 'add iris() to plugins (also injects connect())',
          write: { path: found.path, content: patch.code },
        },
      ];
  }
}

/**
 * Next steps: create the dev-only component unless it exists; the config wrap and the layout
 * mount are always manual.
 */
function nextSteps(input: PlanInput): Step[] {
  const configTarget = input.nextConfigFile ?? 'next.config.mjs';

  let component: Step;
  if (input.nextIrisDevExists) {
    component = {
      title: 'IrisDev component',
      target: NEXT_IRIS_DEV_PATH,
      status: StepStatus.ALREADY,
      detail: 'file exists',
    };
  } else {
    component = {
      title: 'IrisDev component',
      target: NEXT_IRIS_DEV_PATH,
      status: StepStatus.APPLY,
      detail: 'create dev-only connect component',
      write: { path: NEXT_IRIS_DEV_PATH, content: NEXT_IRIS_DEV_FILE },
    };
  }

  const configWrap: Step = {
    title: 'Next config (withIris)',
    target: configTarget,
    status: StepStatus.MANUAL,
    detail: nextConfigManual(configTarget),
  };
  const layoutMount: Step = {
    title: 'Mount IrisDev',
    target: 'app/layout.tsx',
    status: StepStatus.MANUAL,
    detail: NEXT_LAYOUT_MANUAL,
  };
  return [component, configWrap, layoutMount];
}

/** Steps specific to the detected framework; anything that is not Vite or Next gets the HTML snippet. */
function frameworkSteps(input: PlanInput): Step[] {
  switch (input.detection.framework) {
    case Framework.VITE:
      return viteSteps(input);
    case Framework.NEXT:
      return nextSteps(input);
    default:
      return [
        {
          title: 'Connect snippet',
          target: 'index.html',
          status: StepStatus.MANUAL,
          detail: HTML_MANUAL,
        },
      ];
  }
}

/**
 * Assemble the ordered plan: the MCP step, the install step, then the framework-specific steps.
 * Decides what should happen only; it performs no writes and runs no commands.
 */
export function buildPlan(input: PlanInput): Plan {
  const shared: Step[] = [mcpStep(input), installStep(input)];
  return {
    framework: input.detection.framework,
    steps: [...shared, ...frameworkSteps(input)],
  };
}
/**
 * In-memory `InitIo` for tests.
 *
 * @param files - Seed files keyed by project-relative path.
 * @param execOk - Value every `exec` call returns (true = the subprocess "succeeded").
 * @returns The IO surface plus the recorded `written` files, printed `lines` and `execCalls`.
 */
function memoryIo(files: Record<string, string>, execOk = true): MemoryIo {
  const written: Record<string, string> = {};
  const lines: string[] = [];
  const execCalls: { command: string; args: readonly string[] }[] = [];
  return {
    written,
    lines,
    execCalls,
    // A write shadows the seeded file, as on a real filesystem; reading the seed first would
    // return stale content after `writeFile`.
    readFile: (p: string) => written[p] ?? files[p] ?? null,
    writeFile: (p: string, c: string) => {
      written[p] = c;
    },
    exists: (p: string) => p in files || p in written,
    // Root listing covers seeded and written files, consistent with `exists`.
    rootFiles: () =>
      [...new Set([...Object.keys(files), ...Object.keys(written)])].filter(
        (p) => !p.includes('/'),
      ),
    exec: (command: string, args: readonly string[]) => {
      execCalls.push({ command, args });
      return execOk;
    },
    print: (l: string) => lines.push(l),
  };
}
...OPTS, install: true }, io); + expect(io.execCalls).toEqual([{ command: 'pnpm', args: ['add', '-D', '@syrin/iris'] }]); + }); + + it('does not run the install in dry run', () => { + const io = memoryIo({ 'package.json': JSON.stringify({ devDependencies: { vite: '^5' } }) }); + runInit({ ...OPTS, install: true, dryRun: true }, io); + expect(io.execCalls).toHaveLength(0); + }); + + it('downgrades the install step to manual when it fails', () => { + const io = memoryIo( + { 'package.json': JSON.stringify({ devDependencies: { vite: '^5' } }) }, + false, + ); + const r = runInit({ ...OPTS, install: true }, io); + expect(io.lines.join('\n')).toContain('install failed — run manually'); + expect(r.manual).toBeGreaterThan(0); + }); + + it('creates app/iris-dev.tsx for a Next project', () => { + const io = memoryIo({ + 'package.json': JSON.stringify({ dependencies: { next: '15', react: '^19' } }), + 'next.config.mjs': 'export default {};\n', + }); + runInit(OPTS, io); + expect(io.written['app/iris-dev.tsx']).toContain('IrisDev'); + }); +}); diff --git a/packages/server/src/init/run.ts b/packages/server/src/init/run.ts new file mode 100644 index 0000000..54cd5e9 --- /dev/null +++ b/packages/server/src/init/run.ts @@ -0,0 +1,146 @@ +/** + * The impure shell for `iris init`: gather project files via an injected IO surface, build the + * plan (pure), optionally write the apply-steps, and print a human-readable report. All filesystem + * access goes through `InitIo` so the orchestration is unit-testable with an in-memory IO. 
const PACKAGE_JSON = 'package.json';
const MCP_FILE = '.mcp.json';
const NEXT_IRIS_DEV = 'app/iris-dev.tsx';
/** Vite config basenames probed in the project root; the first one present is used. */
const VITE_CONFIG_CANDIDATES = [
  'vite.config.ts',
  'vite.config.js',
  'vite.config.mjs',
  'vite.config.mts',
];
/** Next config basenames probed in the project root; the first one present is used. */
const NEXT_CONFIG_CANDIDATES = [
  'next.config.mjs',
  'next.config.js',
  'next.config.ts',
  'next.config.cjs',
];

export interface InitOptions {
  cwd: string;
  /** Forwarded to the plan (used when building the MCP config). */
  port: number | undefined;
  /** False skips the MCP config step. */
  mcp: boolean;
  /** True builds and reports the plan without writing files or running the install. */
  dryRun: boolean;
  /** True runs the dependency install; false leaves it as a manual step. */
  install: boolean;
}

export interface InitIo {
  /** Returns file content or null if it does not exist. Path is project-relative. */
  readFile(relPath: string): string | null;
  /** Writes content, creating parent directories. Path is project-relative. */
  writeFile(relPath: string, content: string): void;
  exists(relPath: string): boolean;
  /** Basenames present in the project root. */
  rootFiles(): readonly string[];
  /** Runs a subprocess to completion; returns true on exit code 0. */
  exec(command: string, args: readonly string[]): boolean;
  print(line: string): void;
}

export interface InitResult {
  /** False when init could not start (package.json missing or unparsable). */
  ok: boolean;
  /** Number of steps reported with APPLY status. */
  applied: number;
  /** Number of steps reported as MANUAL, including downgraded failures. */
  manual: number;
}

const STATUS_SYMBOL: Record<StepStatus, string> = {
  [StepStatus.APPLY]: '✓',
  [StepStatus.MANUAL]: '⚠',
  [StepStatus.ALREADY]: '·',
  [StepStatus.SKIP]: '–',
};

/** Return the first candidate contained in `files`, in candidate order, or null if none is. */
function firstPresent(files: ReadonlySet<string>, candidates: readonly string[]): string | null {
  for (const c of candidates) if (files.has(c)) return c;
  return null;
}

/**
 * Read everything the pure planner needs through `io`.
 *
 * @param pkg - The already-parsed package.json value; anything that is not a non-null object is
 *   replaced by an empty object before detection.
 */
function gatherPlanInput(options: InitOptions, io: InitIo, pkg: unknown): PlanInput {
  const rootFiles = new Set(io.rootFiles());
  const detectInput: DetectInput = {
    pkg: typeof pkg === 'object' && pkg !== null ? pkg : {},
    configFiles: rootFiles,
    lockfiles: rootFiles,
  };
  const detection = detect(detectInput);

  const vitePath = firstPresent(rootFiles, VITE_CONFIG_CANDIDATES);
  const viteSource = vitePath === null ? null : io.readFile(vitePath);
  // A config that is listed but unreadable is treated as "no config found".
  const viteConfig =
    vitePath !== null && viteSource !== null ? { path: vitePath, source: viteSource } : null;

  return {
    detection,
    mcpJson: io.readFile(MCP_FILE),
    viteConfig,
    nextConfigFile: firstPresent(rootFiles, NEXT_CONFIG_CANDIDATES),
    nextIrisDevExists: io.exists(NEXT_IRIS_DEV),
    options: { port: options.port, mcp: options.mcp, install: options.install },
  };
}

/** Closing line of the report: how to pick up the change for the detected framework. */
function restartHint(framework: Framework): string {
  if (framework === Framework.NEXT)
    return 'Restart `next dev`, then ask your agent: "List Iris sessions".';
  if (framework === Framework.VITE)
    return 'Restart `vite`, then ask your agent: "List Iris sessions".';
  return 'Reload your app on localhost, then ask your agent: "List Iris sessions".';
}

/**
 * Print the per-step report and count the outcomes.
 *
 * @param failed - Targets whose side effect failed; those steps are shown as manual.
 * @returns `ok: true` with the number of steps reported as applied and as manual.
 */
function report(plan: Plan, dryRun: boolean, failed: ReadonlySet<string>, io: InitIo): InitResult {
  io.print(dryRun ? 'iris init (dry run — no files written)' : 'iris init');
  io.print('');
  let applied = 0;
  let manual = 0;
  for (const s of plan.steps) {
    // A side effect that failed to apply is reported as a manual step with its fallback command.
    const downgraded = failed.has(s.target);
    const status = downgraded ? StepStatus.MANUAL : s.status;
    const detail =
      downgraded && s.exec !== undefined
        ? `install failed — run manually: ${s.exec.fallback}`
        : s.detail;
    io.print(`  [${STATUS_SYMBOL[status]}] ${s.title} → ${s.target}`);
    if (status === StepStatus.APPLY) applied++;
    if (status === StepStatus.MANUAL) {
      manual++;
      // Manual details can be multi-line snippets; indent every line under the step.
      for (const line of detail.split('\n')) io.print(`      ${line}`);
    } else if (detail.length > 0) {
      io.print(`      ${detail}`);
    }
  }
  io.print('');
  io.print(restartHint(plan.framework));
  return { ok: true, applied, manual };
}

/** Perform the apply-step side effects; return the targets whose side effect failed. */
function applyEffects(plan: Plan, io: InitIo): Set<string> {
  const failed = new Set<string>();
  for (const s of plan.steps) {
    if (s.status !== StepStatus.APPLY) continue;
    if (s.write !== undefined) io.writeFile(s.write.path, s.write.content);
    if (s.exec !== undefined && !io.exec(s.exec.command, s.exec.args)) failed.add(s.target);
  }
  return failed;
}

/**
 * Entry point for `iris init`: read package.json, build the plan, apply it unless this is a dry
 * run, and print the report.
 *
 * @returns `ok: false` (nothing applied) when package.json is missing or is not valid JSON;
 *   otherwise the counts produced by the report.
 */
export function runInit(options: InitOptions, io: InitIo): InitResult {
  const pkgRaw = io.readFile(PACKAGE_JSON);
  if (pkgRaw === null) {
    io.print('No package.json found. Run `iris init` from your project root.');
    return { ok: false, applied: 0, manual: 0 };
  }

  // A malformed package.json is a user-fixable condition, so report it instead of letting the
  // SyntaxError escape as a stack trace.
  let pkg: unknown;
  try {
    pkg = JSON.parse(pkgRaw);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    io.print(`Could not parse package.json (${reason}). Fix it, then re-run \`iris init\`.`);
    return { ok: false, applied: 0, manual: 0 };
  }

  const plan = buildPlan(gatherPlanInput(options, io, pkg));
  const failed = options.dryRun ? new Set<string>() : applyEffects(plan, io);
  return report(plan, options.dryRun, failed, io);
}
/** The Vite-config snippet printed when we can't safely auto-patch the config. */
export const VITE_MANUAL = `Add the Iris plugin to your Vite config:

  import { iris } from '@syrin/iris/vite';

  export default defineConfig({
    plugins: [react(), iris()],
  });

The plugin only applies during \`vite\` (dev) — it is dropped from \`vite build\`.`;

/**
 * Next.js config wrap — always printed (we never auto-rewrite next.config).
 *
 * @param configFile - Config filename to name in the instruction (e.g. 'next.config.mjs').
 * @returns The paste-in instruction text.
 */
export function nextConfigManual(configFile: string): string {
  return `Wrap your ${configFile} export with withIris (keeps SWC, dev-only):

  import { withIris } from '@syrin/iris/next';

  export default withIris(nextConfig);`;
}

/** The dev-only client component that connects Iris after hydration. */
export const NEXT_IRIS_DEV_FILE = `'use client';
import { useEffect } from 'react';

/** Dev-only: connect Iris + install the React adapter, after hydration. */
export function IrisDev() {
  useEffect(() => {
    if (process.env.NODE_ENV !== 'development') return;
    void import('@syrin/iris').then(({ iris, install }) => {
      install();
      iris.connect();
    });
  }, []);
  return null;
}
`;

/**
 * Mount instruction for the root layout. The snippet must name the element to mount inside and
 * the component to render; without the tags the ternary is not valid JSX.
 */
export const NEXT_LAYOUT_MANUAL = `Mount in your root layout (app/layout.tsx), dev-only:

  import { IrisDev } from './iris-dev';
  // inside <body>:
  {process.env.NODE_ENV === 'development' ? <IrisDev /> : null}`;
*/ +export const HTML_MANUAL = `Add a dev-gated module script at app boot: + + `; + +export const NEXT_IRIS_DEV_PATH = 'app/iris-dev.tsx'; diff --git a/packages/server/src/init/vite-config.test.ts b/packages/server/src/init/vite-config.test.ts new file mode 100644 index 0000000..3e2b983 --- /dev/null +++ b/packages/server/src/init/vite-config.test.ts @@ -0,0 +1,48 @@ +import { describe, expect, it } from 'vitest'; +import { patchViteConfig, VitePatchKind, VITE_IMPORT } from './vite-config.js'; + +const BASIC = `import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react'; + +export default defineConfig({ + plugins: [react()], +}); +`; + +describe('patchViteConfig', () => { + it('adds the import and iris() into the plugins array', () => { + const r = patchViteConfig(BASIC); + expect(r.kind).toBe(VitePatchKind.APPLY); + if (r.kind !== VitePatchKind.APPLY) return; + expect(r.code).toContain(VITE_IMPORT); + expect(r.code).toMatch(/plugins:\s*\[iris\(\),\s*react\(\)\]/); + }); + + it('places the import after the last existing import', () => { + const r = patchViteConfig(BASIC); + if (r.kind !== VitePatchKind.APPLY) throw new Error('expected apply'); + const importIdx = r.code.indexOf(VITE_IMPORT); + const exportIdx = r.code.indexOf('export default'); + expect(importIdx).toBeGreaterThan(0); + expect(importIdx).toBeLessThan(exportIdx); + }); + + it('is idempotent — already-patched configs are left alone', () => { + const r = patchViteConfig(BASIC); + if (r.kind !== VitePatchKind.APPLY) throw new Error('expected apply'); + expect(patchViteConfig(r.code).kind).toBe(VitePatchKind.ALREADY); + }); + + it('bails to manual when there is no plugins array', () => { + const r = patchViteConfig(`import { defineConfig } from 'vite'; +export default defineConfig({ server: { port: 3000 } }); +`); + expect(r.kind).toBe(VitePatchKind.MANUAL); + }); + + it('prepends the import when the config has none', () => { + const r = patchViteConfig('export default { plugins: [] 
export const VITE_IMPORT = "import { iris } from '@syrin/iris/vite';";
const IRIS_PLUGIN_CALL = 'iris()';
const IRIS_MARKER = '@syrin/iris/vite';
/** Matches the start of every `plugins: [` array literal (global, so occurrences can be counted). */
const PLUGINS_ARRAY = /plugins\s*:\s*\[/g;
/** Matches an ES import statement (used to place our import after the last one). */
const IMPORT_LINE = /^import\s.+from\s+['"][^'"]+['"];?\s*$/gm;

export const VitePatchKind = {
  APPLY: 'apply',
  ALREADY: 'already',
  MANUAL: 'manual',
} as const;
export type VitePatchKind = (typeof VitePatchKind)[keyof typeof VitePatchKind];

/** Result of a patch attempt: the new source, "nothing to do", or a reason to do it by hand. */
export type VitePatch =
  | { kind: typeof VitePatchKind.APPLY; code: string }
  | { kind: typeof VitePatchKind.ALREADY }
  | { kind: typeof VitePatchKind.MANUAL; reason: string };

const NO_PLUGINS_REASON = "couldn't find a `plugins: [...]` array to extend";
const AMBIGUOUS_PLUGINS_REASON =
  'found more than one `plugins: [...]` array, so the one to extend is ambiguous';

/**
 * Add the iris import directly after the last single-line import statement, or at the very top
 * when the source has none.
 */
function insertImport(source: string): string {
  const matches = [...source.matchAll(IMPORT_LINE)];
  const last = matches[matches.length - 1];
  if (last?.index === undefined) {
    return `${VITE_IMPORT}\n${source}`;
  }
  // The pattern's trailing `\s*` can swallow the newline after the statement; trim it so the new
  // import lands on the very next line and any following blank line stays where it was.
  const end = last.index + last[0].trimEnd().length;
  return `${source.slice(0, end)}\n${VITE_IMPORT}${source.slice(end)}`;
}

/**
 * Insert `iris()` as the first element of the plugins array.
 *
 * @param source - The config source.
 * @param afterBracket - Index just past the array's opening `[`.
 */
function insertPlugin(source: string, afterBracket: number): string {
  const rest = source.slice(afterBracket);
  let insertion = `${IRIS_PLUGIN_CALL}, `;
  if (/^\s*\]/.test(rest)) {
    // Empty array: no separator, so the result is `[iris()]` rather than `[iris(), ]`.
    insertion = IRIS_PLUGIN_CALL;
  } else if (/^[ \t]*\r?\n/.test(rest)) {
    // Elements start on the next line: omit the space to avoid trailing whitespace.
    insertion = `${IRIS_PLUGIN_CALL},`;
  }
  return `${source.slice(0, afterBracket)}${insertion}${rest}`;
}

/**
 * Conservatively patch a Vite config so that it imports and uses `iris()`.
 *
 * @param source - Full text of the Vite config file.
 * @returns `ALREADY` when the source already mentions `@syrin/iris/vite`; `MANUAL` with a reason
 *   when there is no `plugins: [` array or more than one (a nested plugins array could otherwise
 *   be patched by mistake); otherwise `APPLY` with the patched source.
 */
export function patchViteConfig(source: string): VitePatch {
  if (source.includes(IRIS_MARKER)) {
    return { kind: VitePatchKind.ALREADY };
  }
  const arrays = [...source.matchAll(PLUGINS_ARRAY)];
  const only = arrays[0];
  if (only?.index === undefined) {
    return { kind: VitePatchKind.MANUAL, reason: NO_PLUGINS_REASON };
  }
  if (arrays.length > 1) {
    return { kind: VitePatchKind.MANUAL, reason: AMBIGUOUS_PLUGINS_REASON };
  }
  const afterBracket = only.index + only[0].length;
  return { kind: VitePatchKind.APPLY, code: insertImport(insertPlugin(source, afterBracket)) };
}
The plugin does the whole +dev-time wiring for you: + +- **Source mapping** — stamps `data-iris-source="file:line:col"` on JSX host elements (via + [`@syrin/iris-babel-plugin`](https://www.npmjs.com/package/@syrin/iris-babel-plugin)) so + `iris_inspect` can report the component's source file — needed on React 19. +- **Auto-connect** — injects a dev-only `install(); iris.connect()` so you don't touch your entry file. +- **Production-safe by construction** — `apply: 'serve'` means Vite drops the plugin entirely from + `vite build`. There is no env gate to forget; instrumentation cannot reach a production bundle. + +Usually installed via the umbrella package and imported from its `/vite` subpath: + +```bash +npm i -D @syrin/iris +``` + +```ts +// vite.config.ts +import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react'; +import { iris } from '@syrin/iris/vite'; + +export default defineConfig({ + plugins: [react(), iris()], +}); +``` + +That is the entire integration — no entry-file edit, no Babel-plugin wiring, no env gating. +`npx @syrin/iris init` adds this line for you automatically in a Vite project. + +## Options + +```ts +iris({ + port, // bridge WebSocket port; baked into connect() only when non-default + session, // stable session label (defaults to the SDK's auto id) + token, // auth token forwarded to connect() when the bridge requires one + sourceMapping, // default true — stamp data-iris-source (harmless on React <=18) + inject, // default true — auto-inject iris.connect() +}); +``` + +MIT. diff --git a/packages/vite-plugin/package.json b/packages/vite-plugin/package.json new file mode 100644 index 0000000..70305e4 --- /dev/null +++ b/packages/vite-plugin/package.json @@ -0,0 +1,59 @@ +{ + "name": "@syrin/iris-vite-plugin", + "version": "0.5.0", + "private": true, + "description": "Vite plugin for Iris: dev-only source-map stamping plus auto-injected iris.connect(). 
apply:'serve' guarantees it never ships to production.", + "type": "module", + "license": "MIT", + "author": "Iris contributors", + "repository": { + "type": "git", + "url": "git+https://github.com/syrin-labs/iris.git", + "directory": "packages/vite-plugin" + }, + "homepage": "https://github.com/syrin-labs/iris#readme", + "bugs": "https://github.com/syrin-labs/iris/issues", + "keywords": [ + "vite-plugin", + "iris", + "react", + "devtools" + ], + "sideEffects": false, + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "default": "./dist/index.js" + } + }, + "files": [ + "dist", + "README.md" + ], + "scripts": { + "build": "tsc -b", + "typecheck": "tsc -b", + "lint": "eslint src", + "test:unit": "vitest run src --passWithNoTests", + "prepack": "tsc -b && find dist -name \"*.test.*\" -delete" + }, + "dependencies": { + "@babel/core": "^7.26.0", + "@syrin/iris-babel-plugin": "workspace:*", + "@syrin/iris-protocol": "workspace:*" + }, + "devDependencies": { + "@types/babel__core": "^7.20.5", + "vite": "^5" + }, + "peerDependencies": { + "vite": ">=4" + }, + "peerDependenciesMeta": { + "vite": { + "optional": true + } + } +} diff --git a/packages/vite-plugin/src/build.integration.test.ts b/packages/vite-plugin/src/build.integration.test.ts new file mode 100644 index 0000000..60048bd --- /dev/null +++ b/packages/vite-plugin/src/build.integration.test.ts @@ -0,0 +1,49 @@ +import { describe, expect, it, beforeAll } from 'vitest'; +import { iris, IRIS_VITE_PLUGIN_NAME } from './index.js'; + +/** + * Proves the prod-safety claim against Vite's *actual* config resolution rather than a unit + * assertion on the `apply` field: `resolveConfig(..., 'build')` runs Vite's own apply-filter, so a + * plugin missing from the resolved build pipeline can never touch a production bundle. Skipped when + * `vite` is not installed (e.g. offline local runs); CI installs it as a devDependency. 
+ */ + +type ResolveConfig = ( + inline: { plugins: unknown[]; configFile: false; logLevel: 'silent' }, + command: 'build' | 'serve', +) => Promise<{ plugins: readonly { name: string }[] }>; + +let resolveConfig: ResolveConfig | undefined; + +beforeAll(async () => { + try { + const vite = (await import('vite')) as { resolveConfig: ResolveConfig }; + resolveConfig = vite.resolveConfig; + } catch { + resolveConfig = undefined; + } +}); + +function names(plugins: readonly { name: string }[]): string[] { + return plugins.map((p) => p.name); +} + +describe('iris() in the real Vite config resolution', () => { + it('is included in the serve pipeline', async () => { + if (resolveConfig === undefined) return; + const resolved = await resolveConfig( + { plugins: [iris()], configFile: false, logLevel: 'silent' }, + 'serve', + ); + expect(names(resolved.plugins)).toContain(IRIS_VITE_PLUGIN_NAME); + }); + + it('is filtered out of the build pipeline (never ships to production)', async () => { + if (resolveConfig === undefined) return; + const resolved = await resolveConfig( + { plugins: [iris()], configFile: false, logLevel: 'silent' }, + 'build', + ); + expect(names(resolved.plugins)).not.toContain(IRIS_VITE_PLUGIN_NAME); + }); +}); diff --git a/packages/vite-plugin/src/index.test.ts b/packages/vite-plugin/src/index.test.ts new file mode 100644 index 0000000..64da816 --- /dev/null +++ b/packages/vite-plugin/src/index.test.ts @@ -0,0 +1,67 @@ +import { describe, it, expect } from 'vitest'; +import { SOURCE_ATTR } from '@syrin/iris-babel-plugin'; +import { IRIS_DEFAULT_PORT } from '@syrin/iris-protocol'; +import { iris, IRIS_VITE_PLUGIN_NAME } from './index.js'; + +describe('iris vite plugin', () => { + it('only applies during serve (never ships to production builds)', () => { + const plugin = iris(); + expect(plugin.name).toBe(IRIS_VITE_PLUGIN_NAME); + expect(plugin.apply).toBe('serve'); + expect(plugin.enforce).toBe('pre'); + }); + + it('stamps data-iris-source on host elements 
in .tsx files', () => { + const plugin = iris(); + const result = plugin.transform?.('const x = ;', '/app/src/Foo.tsx'); + expect(result).not.toBeNull(); + expect(result?.code).toContain(SOURCE_ATTR); + }); + + it('skips non-jsx and node_modules and virtual ids', () => { + const plugin = iris(); + expect(plugin.transform?.('const x = 1;', '/app/src/util.ts')).toBeNull(); + expect(plugin.transform?.('const x = ;', '/app/node_modules/pkg/Foo.tsx')).toBeNull(); + expect(plugin.transform?.('const x = ;', '\0virtual:foo.tsx')).toBeNull(); + }); + + it('disables stamping when sourceMapping is false', () => { + const plugin = iris({ sourceMapping: false }); + expect(plugin.transform?.('const x = ;', '/app/src/Foo.tsx')).toBeNull(); + }); + + it('injects a dev-gated connect() module importing @syrin/iris', () => { + const plugin = iris(); + const tags = plugin.transformIndexHtml?.(''); + expect(tags).toHaveLength(1); + const tag = tags?.[0]; + expect(tag?.tag).toBe('script'); + expect(tag?.attrs?.['type']).toBe('module'); + expect(tag?.children).toContain("from '@syrin/iris'"); + expect(tag?.children).toContain('iris.connect('); + expect(tag?.children).toContain('install()'); + }); + + it('does not inject connect() when inject is false', () => { + const plugin = iris({ inject: false }); + expect(plugin.transformIndexHtml?.('')).toEqual([]); + }); + + it('bakes a non-default port into the injected connect() url', () => { + const customPort = IRIS_DEFAULT_PORT + 1; + const tags = iris({ port: customPort }).transformIndexHtml?.(''); + expect(tags?.[0]?.children).toContain(String(customPort)); + expect(tags?.[0]?.children).toContain('ws://localhost:'); + }); + + it('omits the url for the default port (SDK default applies)', () => { + const tags = iris().transformIndexHtml?.(''); + expect(tags?.[0]?.children).not.toContain('ws://localhost:'); + }); + + it('forwards session and token when provided', () => { + const tags = iris({ session: 'my-app', token: 'secret' 
export const IRIS_VITE_PLUGIN_NAME = 'iris';

/** The one-install package the host app imports the SDK from. */
const IRIS_PACKAGE = '@syrin/iris';
/** Files we stamp with source info — JSX/TSX only. */
const JSX_FILE = /\.[jt]sx$/;
/** Rollup virtual-module ids start with a NUL byte; never transform those. */
const VIRTUAL_PREFIX = '\0';
/**
 * `node_modules` as a whole path segment (either separator), so a source directory that merely
 * contains the text (e.g. `src/node_modules_viewer`) is still stamped.
 */
const NODE_MODULES_SEGMENT = /(?:^|[\\/])node_modules(?:[\\/]|$)/;

export interface IrisVitePluginOptions {
  /** Bridge WebSocket port. Defaults to the SDK default; only baked into connect() when non-default. */
  port?: number;
  /** Stable session label for the bridge. Defaults to the SDK's auto-generated id. */
  session?: string;
  /** Auth token forwarded to connect() when the bridge requires one. */
  token?: string;
  /** Stamp data-iris-source for React 19 source mapping. Default true (harmless on React <=18). */
  sourceMapping?: boolean;
  /** Auto-inject the dev-gated iris.connect() call. Default true. */
  inject?: boolean;
}

/** Structural Vite plugin shape — avoids a hard dependency on `vite` while staying assignable to its `Plugin`. */
export interface IrisVitePlugin {
  name: string;
  apply: 'serve';
  enforce: 'pre';
  transform: (code: string, id: string) => { code: string; map: string | null } | null;
  transformIndexHtml: (html: string) => HtmlTag[];
}

/** Tag descriptor returned from `transformIndexHtml`. */
interface HtmlTag {
  tag: string;
  attrs: Record<string, string>;
  children: string;
  injectTo: 'body';
}

/** True when `id` is a real (non-virtual) JSX/TSX module outside `node_modules`. */
function shouldStamp(id: string): boolean {
  if (id.startsWith(VIRTUAL_PREFIX)) return false;
  if (NODE_MODULES_SEGMENT.test(id)) return false;
  // Strip any query suffix (?worker, ?raw, ...) before matching the extension.
  const clean = id.split('?')[0] ?? id;
  return JSX_FILE.test(clean);
}

/**
 * Run the Iris babel plugin over one module.
 *
 * @returns The transformed code with its source map serialized to a JSON string (or null when
 *   babel produced no map); null when babel produced no code.
 */
function stamp(code: string, id: string): { code: string; map: string | null } | null {
  const out = transformSync(code, {
    filename: id,
    plugins: [irisSource],
    parserOpts: { plugins: ['jsx', 'typescript'] },
    sourceMaps: true,
    // Ignore the host project's babel config so only the Iris plugin runs here.
    configFile: false,
    babelrc: false,
  });
  if (out?.code === undefined || out.code === null) return null;
  return {
    code: out.code,
    map: out.map === undefined || out.map === null ? null : JSON.stringify(out.map),
  };
}

/**
 * Serialize a value as a JS literal that is safe inside an inline `<script>`: every `<` is
 * written as `\u003c`, so a value containing `</script>` cannot end the tag early. The escape
 * decodes back to `<` when the script is evaluated.
 */
function toInlineScriptLiteral(value: Record<string, string>): string {
  return JSON.stringify(value).replace(/</g, '\\u003c');
}

/** Build the `iris.connect()` argument literal — only includes keys the user set. */
function connectArgs(options: IrisVitePluginOptions): string {
  const args: Record<string, string> = {};
  const port = options.port ?? IRIS_DEFAULT_PORT;
  if (port !== IRIS_DEFAULT_PORT) args['url'] = `ws://localhost:${String(port)}${IRIS_WS_PATH}`;
  if (options.session !== undefined) args['session'] = options.session;
  if (options.token !== undefined) args['token'] = options.token;
  return Object.keys(args).length > 0 ? toInlineScriptLiteral(args) : '';
}

/** The module script that installs the adapter and connects; empty when `inject` is false. */
function injectScript(options: IrisVitePluginOptions): HtmlTag[] {
  if (options.inject === false) return [];
  const args = connectArgs(options);
  const children = `import { iris, install } from '${IRIS_PACKAGE}'; install(); iris.connect(${args});`;
  return [{ tag: 'script', attrs: { type: 'module' }, children, injectTo: 'body' }];
}

/**
 * Iris Vite plugin. Add to your `plugins` array and the entire integration is done:
 *
 *   import { iris } from '@syrin/iris/vite';
 *   export default defineConfig({ plugins: [react(), iris()] });
 *
 * `apply: 'serve'` means Vite drops the plugin entirely from `vite build` — production bundles
 * are never instrumented. Gating is the tool's job, not a user-managed env check.
 *
 * NOTE(review): `transformIndexHtml` is a plain function here; confirm against the Vite plugin
 * docs that a tag injected in that phase still has its bare `@syrin/iris` import resolved by the
 * dev server (the plugin-level `enforce` may not apply to this hook).
 *
 * @param options - See `IrisVitePluginOptions`; every field is optional.
 */
export function iris(options: IrisVitePluginOptions = {}): IrisVitePlugin {
  const sourceMapping = options.sourceMapping !== false;
  return {
    name: IRIS_VITE_PLUGIN_NAME,
    apply: 'serve',
    enforce: 'pre',
    transform(code, id) {
      if (!sourceMapping || !shouldStamp(id)) return null;
      return stamp(code, id);
    },
    transformIndexHtml() {
      return injectScript(options);
    },
  };
}
importers: specifier: ^3.2.6 version: 3.2.6(@types/node@22.19.20)(jsdom@29.1.1)(lightningcss@1.32.0) + packages/vite-plugin: + dependencies: + '@babel/core': + specifier: ^7.26.0 + version: 7.29.7 + '@syrin/iris-babel-plugin': + specifier: workspace:* + version: link:../babel-plugin + '@syrin/iris-protocol': + specifier: workspace:* + version: link:../protocol + devDependencies: + '@types/babel__core': + specifier: ^7.20.5 + version: 7.20.5 + vite: + specifier: ^5 + version: 5.4.21(@types/node@22.19.20)(lightningcss@1.32.0) + packages: '@ampproject/remapping@2.3.0': @@ -3080,6 +3102,37 @@ packages: engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} hasBin: true + vite@5.4.21: + resolution: {integrity: sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==} + engines: {node: ^18.0.0 || >=20.0.0} + hasBin: true + peerDependencies: + '@types/node': ^18.0.0 || >=20.0.0 + less: '*' + lightningcss: ^1.21.0 + sass: '*' + sass-embedded: '*' + stylus: '*' + sugarss: '*' + terser: ^5.4.0 + peerDependenciesMeta: + '@types/node': + optional: true + less: + optional: true + lightningcss: + optional: true + sass: + optional: true + sass-embedded: + optional: true + stylus: + optional: true + sugarss: + optional: true + terser: + optional: true + vite@7.3.5: resolution: {integrity: sha512-KuOaNhcnGFN2zIPGA7wRmzF+lJA1sea7rHq17aiJ++9lzY1WWG6Jpwqwe1KNbRVPIqHmr8GLYx7jbrQcN/7/ww==} engines: {node: ^20.19.0 || >=22.12.0} @@ -5957,10 +6010,9 @@ snapshots: debug: 4.4.3 es-module-lexer: 1.7.0 pathe: 2.0.3 - vite: 7.3.5(@types/node@22.19.20)(lightningcss@1.32.0) + vite: 5.4.21(@types/node@22.19.20)(lightningcss@1.32.0) transitivePeerDependencies: - '@types/node' - - jiti - less - lightningcss - sass @@ -5969,8 +6021,16 @@ snapshots: - sugarss - supports-color - terser - - tsx - - yaml + + vite@5.4.21(@types/node@22.19.20)(lightningcss@1.32.0): + dependencies: + esbuild: 0.28.1 + postcss: 8.5.15 + rollup: 4.61.1 + optionalDependencies: + '@types/node': 
22.19.20 + fsevents: 2.3.3 + lightningcss: 1.32.0 vite@7.3.5(@types/node@22.19.20)(lightningcss@1.32.0): dependencies: diff --git a/tsconfig.json b/tsconfig.json index 319a3c9..5a7f0d6 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -6,6 +6,7 @@ { "path": "packages/server" }, { "path": "packages/test" }, { "path": "packages/react" }, - { "path": "packages/babel-plugin" } + { "path": "packages/babel-plugin" }, + { "path": "packages/vite-plugin" } ] } From 1e7178215adefa6586843d7802f962d865d21de3 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 02:12:48 +0530 Subject: [PATCH 02/33] fix(vite-plugin): serve connect() as a virtual module, not an inline script An inline `; +} + +/** Printed when an existing .mcp.json can't be parsed (jsonc/comments) and we won't rewrite it. */ +export function mcpManual(entry: Record): string { + return `Couldn't parse your existing .mcp.json (comments or trailing commas?). Add this server +to its "mcpServers" object by hand: + + "iris": ${JSON.stringify(entry, null, 2)}`; +} export const NEXT_IRIS_DEV_PATH = 'app/iris-dev.tsx'; diff --git a/packages/server/src/init/vite-config.test.ts b/packages/server/src/init/vite-config.test.ts index 3e2b983..f7b6198 100644 --- a/packages/server/src/init/vite-config.test.ts +++ b/packages/server/src/init/vite-config.test.ts @@ -33,6 +33,19 @@ describe('patchViteConfig', () => { expect(patchViteConfig(r.code).kind).toBe(VitePatchKind.ALREADY); }); + it('bakes a non-default port into the iris() call', () => { + const r = patchViteConfig(BASIC, 5000); + if (r.kind !== VitePatchKind.APPLY) throw new Error('expected apply'); + expect(r.code).toContain('iris({ port: 5000 })'); + }); + + it('emits bare iris() when no port is given', () => { + const r = patchViteConfig(BASIC); + if (r.kind !== VitePatchKind.APPLY) throw new Error('expected apply'); + expect(r.code).toContain('iris(), '); + expect(r.code).not.toContain('port:'); + }); + it('bails to manual when there is no plugins 
array', () => { const r = patchViteConfig(`import { defineConfig } from 'vite'; export default defineConfig({ server: { port: 3000 } }); diff --git a/packages/server/src/init/vite-config.ts b/packages/server/src/init/vite-config.ts index 5fbeb1d..b115f8d 100644 --- a/packages/server/src/init/vite-config.ts +++ b/packages/server/src/init/vite-config.ts @@ -6,8 +6,12 @@ */ export const VITE_IMPORT = "import { iris } from '@syrin/iris/vite';"; -const IRIS_PLUGIN_CALL = 'iris()'; const IRIS_MARKER = '@syrin/iris/vite'; + +/** The `iris(...)` call — carries the bridge port so the injected connect() targets it. */ +export function irisPluginCall(port: number | undefined): string { + return port === undefined ? 'iris()' : `iris({ port: ${String(port)} })`; +} /** Matches the start of a `plugins: [` array literal. */ const PLUGINS_ARRAY = /plugins\s*:\s*\[/; /** Matches an ES import statement (used to place our import after the last one). */ @@ -37,17 +41,17 @@ function insertImport(source: string): string { return `${source.slice(0, end)}\n${VITE_IMPORT}${source.slice(end)}`; } -function insertPlugin(source: string): string { +function insertPlugin(source: string, port: number | undefined): string { // Insert right after the opening `[` of the plugins array. 
- return source.replace(PLUGINS_ARRAY, (match) => `${match}${IRIS_PLUGIN_CALL}, `); + return source.replace(PLUGINS_ARRAY, (match) => `${match}${irisPluginCall(port)}, `); } -export function patchViteConfig(source: string): VitePatch { +export function patchViteConfig(source: string, port?: number): VitePatch { if (source.includes(IRIS_MARKER)) { return { kind: VitePatchKind.ALREADY }; } if (!PLUGINS_ARRAY.test(source)) { return { kind: VitePatchKind.MANUAL, reason: NO_PLUGINS_REASON }; } - return { kind: VitePatchKind.APPLY, code: insertImport(insertPlugin(source)) }; + return { kind: VitePatchKind.APPLY, code: insertImport(insertPlugin(source, port)) }; } From 89840369e150d26010cf49fb8d4243b39587393b Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 02:42:09 +0530 Subject: [PATCH 04/33] feat(init): register the MCP server globally (user scope), not per project MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bridge + MCP server is a single process that serves every project, so it should be registered once — not re-added to a .mcp.json in each repo. `iris init` now runs `claude mcp add iris -s user` (idempotent: probes `claude mcp get iris` first, skips if present) instead of writing a project .mcp.json. Per project, only the SDK (the iris() Vite plugin / connect call) is added. 
- new init/mcp.ts: builds the `claude mcp add -s user` command, an existence probe, an availability probe, and a manual fallback (printed when the claude CLI is absent, with a Cursor-style global-config snippet too) - run.ts gains a quiet `probe` IO capability; gathers claudeCli + mcpExists - removed init/mcp-config.ts (the project .mcp.json JSON merge) — no longer used - --port flows into the global registration too, so bridge and SDK agree - docs (getting-started, integrate-with-claude-code): register once, globally Verified end-to-end in an isolated CLAUDE_CONFIG_DIR: init writes the iris server to the user config (no project .mcp.json), and a re-run reports ALREADY instead of erroring. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/getting-started.md | 34 +++++---- docs/integrate-with-claude-code.md | 8 ++- packages/server/src/init/mcp-config.test.ts | 52 -------------- packages/server/src/init/mcp-config.ts | 72 ------------------- packages/server/src/init/mcp.test.ts | 52 ++++++++++++++ packages/server/src/init/mcp.ts | 57 +++++++++++++++ packages/server/src/init/node-io.ts | 5 ++ packages/server/src/init/plan.test.ts | 77 +++++++++++++++------ packages/server/src/init/plan.ts | 41 ++++++----- packages/server/src/init/run.test.ts | 67 ++++++++++-------- packages/server/src/init/run.ts | 18 +++-- packages/server/src/init/snippets.ts | 8 --- 12 files changed, 269 insertions(+), 222 deletions(-) delete mode 100644 packages/server/src/init/mcp-config.test.ts delete mode 100644 packages/server/src/init/mcp-config.ts create mode 100644 packages/server/src/init/mcp.test.ts create mode 100644 packages/server/src/init/mcp.ts diff --git a/docs/getting-started.md b/docs/getting-started.md index 7c3a957..700f5f9 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -58,42 +58,46 @@ npx @syrin/iris init It detects your framework, package manager, and React version, then: -- merges an `iris` entry into `.mcp.json` (never clobbering an existing one), 
+- **registers the Iris MCP server once, globally** (`claude mcp add iris -s user`) — so every + project on this machine gets it; you never re-add it per project, - installs `@syrin/iris` as a dev dependency, - **Vite:** adds the `iris()` plugin to your config — which wires source mapping _and_ `iris.connect()` for you, so there is nothing else to edit, - **Next / other:** creates the dev component and prints the exact `withIris` / mount / connect snippets to paste (it never half-edits a build config). -Re-running is safe (already-done steps are skipped). Preview without writing via -`npx @syrin/iris init --dry-run`. Flags: `--port N`, `--no-mcp`, `--no-install`, `--yes`. +The bridge + MCP server is a single process that serves all your projects, so it's registered at +**user scope**, not in a per-project `.mcp.json`. Only the SDK (the `iris()` plugin / connect call) +is added per project. + +Re-running is safe (already-registered/already-patched steps are skipped). Preview without writing +via `npx @syrin/iris init --dry-run`. Flags: `--port N`, `--no-mcp`, `--no-install`, `--yes`. Then restart your dev server and skip to [Step 4](#step-4--run-it--verify-the-connection). The manual steps below explain what `init` sets up, if you prefer to wire it yourself. --- -## Step 1 — Connect your coding agent (MCP) +## Step 1 — Connect your coding agent (MCP), once -You don't start the server manually — your agent starts it via MCP. Add Iris to your agent's -MCP config. +You don't start the server manually — your agent starts it via MCP. Register Iris **once, at the +user (global) scope** so every project picks it up — there's nothing to add per project. 
-**Claude Code** — create/edit `.mcp.json` in your project root: +**Claude Code** — one command: -```jsonc -{ - "mcpServers": { - "iris": { "command": "npx", "args": ["@syrin/iris"] }, - }, -} +```bash +claude mcp add iris -s user -- npx @syrin/iris mcp ``` -**Cursor** — `~/.cursor/mcp.json` (or project `.cursor/mcp.json`): +(`iris init` runs exactly this for you. `-s user` is what makes it global; drop it for a +project-local registration instead.) + +**Cursor** — add to your global `~/.cursor/mcp.json` (not per-project): ```jsonc { "mcpServers": { - "iris": { "command": "npx", "args": ["@syrin/iris"] }, + "iris": { "command": "npx", "args": ["@syrin/iris", "mcp"] }, }, } ``` diff --git a/docs/integrate-with-claude-code.md b/docs/integrate-with-claude-code.md index 16a26f4..5bb0a3f 100644 --- a/docs/integrate-with-claude-code.md +++ b/docs/integrate-with-claude-code.md @@ -23,8 +23,10 @@ Read these docs first and follow them exactly: Then do all of this: -1. MCP: add an "iris" server to .mcp.json: - { "mcpServers": { "iris": { "command": "npx", "args": ["@syrin/iris"] } } } +1. MCP: register the "iris" server ONCE, globally (user scope) — not per project: + claude mcp add iris -s user -- npx @syrin/iris mcp + (`npx @syrin/iris init` runs this for you. Use a project-scoped `.mcp.json` only if a repo + needs its own pinned config.) 2. Install the dev deps from the local registry (skip the registry lines if using public npm): echo '@syrin:registry=http://localhost:4873/' >> .npmrc @@ -98,7 +100,7 @@ asserting on volatile output. Report evidence, not prose. ## What "good integration" looks like (checklist the agent should hit) -- [ ] `.mcp.json` has the `iris` server; `iris_sessions` shows the app connected. +- [ ] the `iris` MCP server is registered (globally via `claude mcp add -s user`); `iris_sessions` shows the app connected. - [ ] `iris.connect()` is dev-gated; nothing Iris ships to prod. 
- [ ] React adapter installed + source mapping returns `file:line` from `iris_inspect`. - [ ] Key elements have `data-testid`; components depend on an injected `createIrisEmitter()` diff --git a/packages/server/src/init/mcp-config.test.ts b/packages/server/src/init/mcp-config.test.ts deleted file mode 100644 index 6d53847..0000000 --- a/packages/server/src/init/mcp-config.test.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { describe, expect, it } from 'vitest'; -import { mergeMcpConfig, McpMergeStatus } from './mcp-config.js'; - -interface McpShape { - mcpServers: Record; -} - -function parse(content: string): McpShape { - return JSON.parse(content) as McpShape; -} - -describe('mergeMcpConfig', () => { - it('creates a fresh config when none exists', () => { - const r = mergeMcpConfig(null, undefined); - expect(r.status).toBe(McpMergeStatus.APPLY); - const parsed = parse(r.content); - expect(parsed.mcpServers['iris']).toEqual({ command: 'npx', args: ['@syrin/iris', 'mcp'] }); - expect(r.content.endsWith('\n')).toBe(true); - }); - - it('bakes a port into the args when provided', () => { - const r = mergeMcpConfig(null, 4500); - expect(parse(r.content).mcpServers['iris']?.args).toEqual([ - '@syrin/iris', - 'mcp', - '--port', - '4500', - ]); - }); - - it('preserves other servers when adding iris', () => { - const existing = JSON.stringify({ mcpServers: { other: { command: 'x' } } }); - const r = mergeMcpConfig(existing, undefined); - const parsed = parse(r.content); - expect(parsed.mcpServers['other']).toEqual({ command: 'x' }); - expect(parsed.mcpServers['iris']).toBeDefined(); - }); - - it('never clobbers an existing iris entry (idempotent)', () => { - const existing = JSON.stringify({ mcpServers: { iris: { command: 'custom' } } }); - const r = mergeMcpConfig(existing, undefined); - expect(r.status).toBe(McpMergeStatus.ALREADY); - expect(r.content).toBe(existing); - }); - - it('bails to manual on unparseable jsonc (comments) without crashing or rewriting', () => { - 
const jsonc = '{\n // my servers\n "mcpServers": { "other": { "command": "x" } }\n}\n'; - const r = mergeMcpConfig(jsonc, undefined); - expect(r.status).toBe(McpMergeStatus.MANUAL); - expect(r.content).toBe(jsonc); // unchanged — we never strip the user's comments - }); -}); diff --git a/packages/server/src/init/mcp-config.ts b/packages/server/src/init/mcp-config.ts deleted file mode 100644 index 566a053..0000000 --- a/packages/server/src/init/mcp-config.ts +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Pure merge of an Iris server entry into a project's `.mcp.json`. Never clobbers an existing - * `iris` entry — adoption must be idempotent and safe to re-run. - */ - -const SERVER_KEY = 'iris'; -const NPX_COMMAND = 'npx'; -const IRIS_PACKAGE = '@syrin/iris'; -const MCP_SUBCOMMAND = 'mcp'; -const PORT_FLAG = '--port'; - -export const McpMergeStatus = { - APPLY: 'apply', - ALREADY: 'already', - /** The existing file could not be parsed as JSON (e.g. jsonc with comments) — bail to manual. */ - MANUAL: 'manual', -} as const; -export type McpMergeStatus = (typeof McpMergeStatus)[keyof typeof McpMergeStatus]; - -export interface McpMergeResult { - status: McpMergeStatus; - /** Full file content to write (2-space JSON, trailing newline). Unchanged when not `apply`. */ - content: string; -} - -interface McpConfigShape { - mcpServers?: Record; - [key: string]: unknown; -} - -/** The iris server entry to add to `mcpServers`, rendered for a manual paste when we can't merge. */ -export function irisServerEntry(port: number | undefined): Record { - const args = - port === undefined - ? 
[IRIS_PACKAGE, MCP_SUBCOMMAND] - : [IRIS_PACKAGE, MCP_SUBCOMMAND, PORT_FLAG, String(port)]; - return { command: NPX_COMMAND, args }; -} - -type ParseResult = { ok: true; config: McpConfigShape } | { ok: false }; - -function parseConfig(existing: string | null): ParseResult { - if (existing === null || existing.trim().length === 0) return { ok: true, config: {} }; - try { - const parsed: unknown = JSON.parse(existing); - if (typeof parsed !== 'object' || parsed === null) return { ok: true, config: {} }; - return { ok: true, config: parsed as McpConfigShape }; - } catch { - // Comments / trailing commas (jsonc) or genuinely malformed — don't crash, don't rewrite - // (rewriting would strip the user's comments). The caller bails to a manual instruction. - return { ok: false }; - } -} - -export function mergeMcpConfig(existing: string | null, port: number | undefined): McpMergeResult { - const parsed = parseConfig(existing); - if (!parsed.ok) { - return { status: McpMergeStatus.MANUAL, content: existing ?? '' }; - } - const config = parsed.config; - const servers = config.mcpServers ?? {}; - - if (Object.prototype.hasOwnProperty.call(servers, SERVER_KEY)) { - return { status: McpMergeStatus.ALREADY, content: existing ?? 
'' }; - } - - const merged: McpConfigShape = { - ...config, - mcpServers: { ...servers, [SERVER_KEY]: irisServerEntry(port) }, - }; - return { status: McpMergeStatus.APPLY, content: `${JSON.stringify(merged, null, 2)}\n` }; -} diff --git a/packages/server/src/init/mcp.test.ts b/packages/server/src/init/mcp.test.ts new file mode 100644 index 0000000..898dd70 --- /dev/null +++ b/packages/server/src/init/mcp.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from 'vitest'; +import { claudeAddCommand, claudeExistsProbe, mcpManual, MCP_SERVER_NAME } from './mcp.js'; + +describe('claudeAddCommand', () => { + it('registers iris at user scope via npx (global, all projects)', () => { + const c = claudeAddCommand(undefined); + expect(c.command).toBe('claude'); + expect(c.args).toEqual([ + 'mcp', + 'add', + MCP_SERVER_NAME, + '-s', + 'user', + '--', + 'npx', + '@syrin/iris', + 'mcp', + ]); + }); + + it('bakes the port into the registered invocation', () => { + const c = claudeAddCommand(4500); + expect(c.args).toEqual([ + 'mcp', + 'add', + 'iris', + '-s', + 'user', + '--', + 'npx', + '@syrin/iris', + 'mcp', + '--port', + '4500', + ]); + expect(c.display).toContain('--port 4500'); + }); +}); + +describe('claudeExistsProbe', () => { + it('uses `claude mcp get iris`', () => { + expect(claudeExistsProbe()).toEqual({ command: 'claude', args: ['mcp', 'get', 'iris'] }); + }); +}); + +describe('mcpManual', () => { + it('explains the one-time global registration', () => { + const m = mcpManual(undefined); + expect(m).toContain('claude mcp add iris -s user'); + expect(m).toContain('globally'); + }); +}); diff --git a/packages/server/src/init/mcp.ts b/packages/server/src/init/mcp.ts new file mode 100644 index 0000000..567684c --- /dev/null +++ b/packages/server/src/init/mcp.ts @@ -0,0 +1,57 @@ +/** + * Global (user-scope) MCP registration for `iris init`. 
The bridge + MCP server is a single + * process that serves every project, so it is registered ONCE at user scope — not per-project via + * a checked-in `.mcp.json`. We shell out to the official `claude mcp add -s user` CLI rather than + * hand-editing `~/.claude.json` (a large stateful file). When the `claude` CLI is absent we print a + * manual instruction instead. + */ + +export const MCP_SERVER_NAME = 'iris'; +const NPX = 'npx'; +const IRIS_PACKAGE = '@syrin/iris'; +const MCP_SUBCOMMAND = 'mcp'; +const PORT_FLAG = '--port'; +export const CLAUDE_CLI = 'claude'; + +/** The subprocess that the SDK/agent runs to launch the bridge — the tail after `claude mcp add … --`. */ +function serverInvocation(port: number | undefined): string[] { + return port === undefined + ? [NPX, IRIS_PACKAGE, MCP_SUBCOMMAND] + : [NPX, IRIS_PACKAGE, MCP_SUBCOMMAND, PORT_FLAG, String(port)]; +} + +export interface ClaudeAddCommand { + command: string; + args: string[]; + /** Human-readable form of the same command, for reports and manual fallback. */ + display: string; +} + +/** `claude mcp add iris -s user -- npx @syrin/iris mcp [--port N]` — registers globally for all projects. */ +export function claudeAddCommand(port: number | undefined): ClaudeAddCommand { + const tail = serverInvocation(port); + const args = [MCP_SUBCOMMAND, 'add', MCP_SERVER_NAME, '-s', 'user', '--', ...tail]; + return { command: CLAUDE_CLI, args, display: `${CLAUDE_CLI} ${args.join(' ')}` }; +} + +/** Probe args that tell us whether an `iris` server already exists in any scope (exit 0 = exists). */ +export function claudeExistsProbe(): { command: string; args: string[] } { + return { command: CLAUDE_CLI, args: [MCP_SUBCOMMAND, 'get', MCP_SERVER_NAME] }; +} + +/** Probe args for whether the `claude` CLI is installed at all. 
*/ +export function claudeAvailableProbe(): { command: string; args: string[] } { + return { command: CLAUDE_CLI, args: ['--version'] }; +} + +/** Printed when the `claude` CLI isn't available — register Iris globally once, by hand. */ +export function mcpManual(port: number | undefined): string { + const tail = serverInvocation(port).join(' '); + return `Register the Iris MCP server ONCE, globally (so every project gets it): + + ${CLAUDE_CLI} ${MCP_SUBCOMMAND} add ${MCP_SERVER_NAME} -s user -- ${tail} + +Or, for another agent, add this to its global MCP config (e.g. Cursor's ~/.cursor/mcp.json): + + "${MCP_SERVER_NAME}": { "command": "${NPX}", "args": ${JSON.stringify(serverInvocation(port).slice(1))} }`; +} diff --git a/packages/server/src/init/node-io.ts b/packages/server/src/init/node-io.ts index 5a36068..92ab46a 100644 --- a/packages/server/src/init/node-io.ts +++ b/packages/server/src/init/node-io.ts @@ -40,6 +40,11 @@ export function buildNodeIo(cwd: string): InitIo { const result = spawnSync(command, [...args], { cwd, stdio: 'inherit', shell: true }); return result.status === 0; }, + probe(command, args) { + // Quiet yes/no check (CLI availability, existing registration). Never throws. 
+ const result = spawnSync(command, [...args], { cwd, stdio: 'ignore', shell: true }); + return result.status === 0; + }, print(line) { process.stdout.write(`${line}\n`); }, diff --git a/packages/server/src/init/plan.test.ts b/packages/server/src/init/plan.test.ts index 9204c61..70fe287 100644 --- a/packages/server/src/init/plan.test.ts +++ b/packages/server/src/init/plan.test.ts @@ -2,6 +2,8 @@ import { describe, expect, it } from 'vitest'; import { buildPlan, StepStatus, type PlanInput } from './plan.js'; import { Framework, PackageManager, type Detection } from './detect.js'; +const MCP_STEP = 'MCP server (global)'; + function detection(framework: Framework, reactMajor = 19): Detection { return { framework, @@ -14,7 +16,8 @@ function detection(framework: Framework, reactMajor = 19): Detection { function input(partial: Partial): PlanInput { return { detection: partial.detection ?? detection(Framework.VITE), - mcpJson: partial.mcpJson ?? null, + claudeCli: partial.claudeCli ?? true, + mcpExists: partial.mcpExists ?? false, viteConfig: partial.viteConfig ?? null, nextConfigFile: partial.nextConfigFile ?? null, nextIrisDevExists: partial.nextIrisDevExists ?? 
false, @@ -33,14 +36,58 @@ function step(plan: ReturnType, title: string) { return s; } +describe('buildPlan — MCP (global, claude user scope)', () => { + it('registers iris globally via an exec step when the claude CLI is present', () => { + const s = step(buildPlan(input({ claudeCli: true, mcpExists: false })), MCP_STEP); + expect(s.status).toBe(StepStatus.APPLY); + expect(s.exec?.command).toBe('claude'); + expect(s.exec?.args).toEqual([ + 'mcp', + 'add', + 'iris', + '-s', + 'user', + '--', + 'npx', + '@syrin/iris', + 'mcp', + ]); + }); + + it('is ALREADY (idempotent) when an iris server is already registered', () => { + const s = step(buildPlan(input({ claudeCli: true, mcpExists: true })), MCP_STEP); + expect(s.status).toBe(StepStatus.ALREADY); + }); + + it('bails to manual global instructions when the claude CLI is missing', () => { + const s = step(buildPlan(input({ claudeCli: false, mcpExists: false })), MCP_STEP); + expect(s.status).toBe(StepStatus.MANUAL); + expect(s.detail).toContain('-s user'); + }); + + it('skips under --no-mcp', () => { + const s = step( + buildPlan(input({ options: { port: undefined, mcp: false, install: false } })), + MCP_STEP, + ); + expect(s.status).toBe(StepStatus.SKIP); + }); + + it('bakes --port into the global registration', () => { + const s = step( + buildPlan(input({ options: { port: 5000, mcp: true, install: false } })), + MCP_STEP, + ); + expect(s.exec?.args).toContain('--port'); + expect(s.exec?.args).toContain('5000'); + }); +}); + describe('buildPlan — Vite', () => { - it('writes .mcp.json and patches the vite config (no entry edit needed)', () => { + it('patches the vite config; no separate entry-file step (plugin injects connect)', () => { const plan = buildPlan(input({ viteConfig: { path: 'vite.config.ts', source: VITE_SRC } })); - expect(step(plan, 'MCP config').status).toBe(StepStatus.APPLY); - expect(step(plan, 'MCP config').write).toBeDefined(); expect(step(plan, 'Vite plugin').status).toBe(StepStatus.APPLY); 
expect(step(plan, 'Vite plugin').write?.content).toContain('@syrin/iris/vite'); - // The plugin injects connect(), so there is no separate entry-file step. expect(plan.steps.some((s) => s.title.includes('entry'))).toBe(false); }); @@ -58,20 +105,10 @@ describe('buildPlan — Vite', () => { ); expect(step(plan, 'Vite plugin').write?.content).toContain('iris({ port: 5000 })'); }); +}); - it('bails MCP to manual when .mcp.json is unparseable jsonc (no crash)', () => { - const plan = buildPlan(input({ mcpJson: '{\n // c\n "mcpServers": {}\n}' })); - const s = step(plan, 'MCP config'); - expect(s.status).toBe(StepStatus.MANUAL); - expect(s.detail).toContain('"iris"'); - }); - - it('skips MCP under --no-mcp', () => { - const plan = buildPlan(input({ options: { port: undefined, mcp: false, install: false } })); - expect(step(plan, 'MCP config').status).toBe(StepStatus.SKIP); - }); - - it('makes install an exec step when install is enabled, manual otherwise', () => { +describe('buildPlan — install', () => { + it('makes install an exec step when enabled, manual otherwise', () => { const off = buildPlan(input({ options: { port: undefined, mcp: true, install: false } })); expect(step(off, 'Install dependency').status).toBe(StepStatus.MANUAL); expect(step(off, 'Install dependency').exec).toBeUndefined(); @@ -103,9 +140,9 @@ describe('buildPlan — Next', () => { }); describe('buildPlan — HTML', () => { - it('gives an MCP write plus a manual connect snippet', () => { + it('registers MCP globally plus a manual connect snippet', () => { const plan = buildPlan(input({ detection: detection(Framework.HTML, 0) })); - expect(step(plan, 'MCP config').status).toBe(StepStatus.APPLY); + expect(step(plan, MCP_STEP).status).toBe(StepStatus.APPLY); expect(step(plan, 'Connect snippet').status).toBe(StepStatus.MANUAL); }); }); diff --git a/packages/server/src/init/plan.ts b/packages/server/src/init/plan.ts index e7d7fe0..51b8d5a 100644 --- a/packages/server/src/init/plan.ts +++ 
b/packages/server/src/init/plan.ts @@ -5,7 +5,7 @@ */ import { Framework, installCommand, installCommandParts, type Detection } from './detect.js'; -import { mergeMcpConfig, McpMergeStatus, irisServerEntry } from './mcp-config.js'; +import { claudeAddCommand, mcpManual } from './mcp.js'; import { patchViteConfig, VitePatchKind } from './vite-config.js'; import { viteManual, @@ -14,11 +14,10 @@ import { nextIrisDevFile, NEXT_IRIS_DEV_PATH, nextConfigManual, - mcpManual, } from './snippets.js'; const IRIS_PACKAGE = '@syrin/iris'; -const MCP_FILE = '.mcp.json'; +const MCP_TARGET = 'global (claude user scope)'; export const StepStatus = { APPLY: 'apply', @@ -46,8 +45,10 @@ export interface Plan { export interface PlanInput { detection: Detection; - /** Current `.mcp.json` content, or null if absent. */ - mcpJson: string | null; + /** Whether the `claude` CLI is installed (so we can register the MCP server globally). */ + claudeCli: boolean; + /** Whether an `iris` MCP server is already registered (any scope) — skip to stay idempotent. */ + mcpExists: boolean; /** Discovered Vite config: its path + source, or null if none found. */ viteConfig: { path: string; source: string } | null; /** Discovered Next config filename (e.g. 'next.config.mjs'), or null. 
*/ @@ -57,33 +58,35 @@ export interface PlanInput { options: { port: number | undefined; mcp: boolean; install: boolean }; } +const MCP_TITLE = 'MCP server (global)'; + function mcpStep(input: PlanInput): Step { if (!input.options.mcp) { - return { title: 'MCP config', target: MCP_FILE, status: StepStatus.SKIP, detail: '--no-mcp' }; + return { title: MCP_TITLE, target: MCP_TARGET, status: StepStatus.SKIP, detail: '--no-mcp' }; } - const r = mergeMcpConfig(input.mcpJson, input.options.port); - if (r.status === McpMergeStatus.ALREADY) { + if (input.mcpExists) { return { - title: 'MCP config', - target: MCP_FILE, + title: MCP_TITLE, + target: MCP_TARGET, status: StepStatus.ALREADY, - detail: 'iris server already configured', + detail: 'iris already registered (install once, used by every project)', }; } - if (r.status === McpMergeStatus.MANUAL) { + if (!input.claudeCli) { return { - title: 'MCP config', - target: MCP_FILE, + title: MCP_TITLE, + target: MCP_TARGET, status: StepStatus.MANUAL, - detail: mcpManual(irisServerEntry(input.options.port)), + detail: mcpManual(input.options.port), }; } + const cmd = claudeAddCommand(input.options.port); return { - title: 'MCP config', - target: MCP_FILE, + title: MCP_TITLE, + target: MCP_TARGET, status: StepStatus.APPLY, - detail: 'add iris MCP server', - write: { path: MCP_FILE, content: r.content }, + detail: 'register iris globally for all projects', + exec: { command: cmd.command, args: cmd.args, fallback: cmd.display }, }; } diff --git a/packages/server/src/init/run.test.ts b/packages/server/src/init/run.test.ts index 6b134c1..5b1168f 100644 --- a/packages/server/src/init/run.test.ts +++ b/packages/server/src/init/run.test.ts @@ -7,7 +7,14 @@ interface MemoryIo extends InitIo { execCalls: { command: string; args: readonly string[] }[]; } -function memoryIo(files: Record, execOk = true): MemoryIo { +interface MemoryOpts { + execOk?: boolean; + claudeAvailable?: boolean; + mcpExists?: boolean; +} + +function memoryIo(files: 
Record, opts: MemoryOpts = {}): MemoryIo { + const { execOk = true, claudeAvailable = true, mcpExists = false } = opts; const written: Record = {}; const lines: string[] = []; const execCalls: { command: string; args: readonly string[] }[] = []; @@ -25,6 +32,7 @@ function memoryIo(files: Record, execOk = true): MemoryIo { execCalls.push({ command, args }); return execOk; }, + probe: (_command, args) => (args.includes('get') ? mcpExists : claudeAvailable), print: (l) => lines.push(l), }; } @@ -37,6 +45,11 @@ const OPTS: InitOptions = { install: false, }; +const VITE_FILES = { + 'package.json': JSON.stringify({ devDependencies: { vite: '^5', react: '^19' } }), + 'vite.config.ts': `export default { plugins: [] };\n`, +}; + describe('runInit', () => { it('errors cleanly without a package.json', () => { const io = memoryIo({}); @@ -45,51 +58,47 @@ describe('runInit', () => { expect(io.lines.join('\n')).toContain('No package.json'); }); - it('writes .mcp.json and patches the vite config for a Vite+React project', () => { - const io = memoryIo({ - 'package.json': JSON.stringify({ devDependencies: { vite: '^5', react: '^19' } }), - 'vite.config.ts': `import react from '@vitejs/plugin-react';\nexport default { plugins: [react()] };\n`, - }); + it('registers iris globally via the claude CLI (not a project .mcp.json) and patches vite', () => { + const io = memoryIo(VITE_FILES); const r = runInit(OPTS, io); expect(r.ok).toBe(true); - expect(io.written['.mcp.json']).toContain('@syrin/iris'); + expect(io.written['.mcp.json']).toBeUndefined(); + expect(io.execCalls.some((c) => c.command === 'claude' && c.args.includes('add'))).toBe(true); expect(io.written['vite.config.ts']).toContain('@syrin/iris/vite'); }); - it('dry run writes nothing but still reports', () => { - const io = memoryIo({ - 'package.json': JSON.stringify({ devDependencies: { vite: '^5' } }), - 'vite.config.ts': `export default { plugins: [] };\n`, - }); + it('does not re-register when an iris server already 
exists (idempotent, install-once)', () => { + const io = memoryIo(VITE_FILES, { mcpExists: true }); + runInit(OPTS, io); + expect(io.execCalls.some((c) => c.command === 'claude')).toBe(false); + }); + + it('prints manual global instructions when the claude CLI is missing', () => { + const io = memoryIo(VITE_FILES, { claudeAvailable: false }); + runInit(OPTS, io); + expect(io.execCalls.some((c) => c.command === 'claude' && c.args.includes('add'))).toBe(false); + expect(io.lines.join('\n')).toContain('-s user'); + }); + + it('dry run writes nothing and runs no subprocess', () => { + const io = memoryIo(VITE_FILES); const r = runInit({ ...OPTS, dryRun: true }, io); expect(Object.keys(io.written)).toHaveLength(0); + expect(io.execCalls).toHaveLength(0); expect(io.lines.join('\n')).toContain('dry run'); expect(r.applied).toBeGreaterThan(0); }); it('runs the install when enabled', () => { - const io = memoryIo({ - 'package.json': JSON.stringify({ devDependencies: { vite: '^5' } }), - 'pnpm-lock.yaml': '', - 'vite.config.ts': `export default { plugins: [] };\n`, - }); + const io = memoryIo({ ...VITE_FILES, 'pnpm-lock.yaml': '' }, { mcpExists: true }); runInit({ ...OPTS, install: true }, io); expect(io.execCalls).toEqual([{ command: 'pnpm', args: ['add', '-D', '@syrin/iris'] }]); }); - it('does not run the install in dry run', () => { - const io = memoryIo({ 'package.json': JSON.stringify({ devDependencies: { vite: '^5' } }) }); - runInit({ ...OPTS, install: true, dryRun: true }, io); - expect(io.execCalls).toHaveLength(0); - }); - - it('downgrades the install step to manual when it fails', () => { - const io = memoryIo( - { 'package.json': JSON.stringify({ devDependencies: { vite: '^5' } }) }, - false, - ); + it('downgrades a failed step to manual with its fallback command', () => { + const io = memoryIo(VITE_FILES, { execOk: false, mcpExists: true }); const r = runInit({ ...OPTS, install: true }, io); - expect(io.lines.join('\n')).toContain('install failed — run 
manually'); + expect(io.lines.join('\n')).toContain('step failed — run manually'); expect(r.manual).toBeGreaterThan(0); }); diff --git a/packages/server/src/init/run.ts b/packages/server/src/init/run.ts index 54cd5e9..a63db5c 100644 --- a/packages/server/src/init/run.ts +++ b/packages/server/src/init/run.ts @@ -6,9 +6,9 @@ import { detect, Framework, type DetectInput } from './detect.js'; import { buildPlan, StepStatus, type Plan, type PlanInput } from './plan.js'; +import { claudeAvailableProbe, claudeExistsProbe } from './mcp.js'; const PACKAGE_JSON = 'package.json'; -const MCP_FILE = '.mcp.json'; const NEXT_IRIS_DEV = 'app/iris-dev.tsx'; const VITE_CONFIG_CANDIDATES = [ 'vite.config.ts', @@ -39,8 +39,10 @@ export interface InitIo { exists(relPath: string): boolean; /** Basenames present in the project root. */ rootFiles(): readonly string[]; - /** Runs a subprocess to completion; returns true on exit code 0. */ + /** Runs a subprocess to completion (inherits stdio); returns true on exit code 0. */ exec(command: string, args: readonly string[]): boolean; + /** Runs a subprocess quietly (no stdio) for a yes/no check; returns true on exit code 0. */ + probe(command: string, args: readonly string[]): boolean; print(line: string): void; } @@ -77,9 +79,17 @@ function gatherPlanInput(options: InitOptions, io: InitIo, pkgRaw: string): Plan const viteConfig = vitePath !== null && viteSource !== null ? { path: vitePath, source: viteSource } : null; + // Global MCP registration goes through the `claude` CLI; probe for it (and for an existing + // registration) only when the MCP step is in play. + const availableProbe = claudeAvailableProbe(); + const claudeCli = options.mcp ? io.probe(availableProbe.command, availableProbe.args) : false; + const existsProbe = claudeExistsProbe(); + const mcpExists = claudeCli ? 
io.probe(existsProbe.command, existsProbe.args) : false; + return { detection, - mcpJson: io.readFile(MCP_FILE), + claudeCli, + mcpExists, viteConfig, nextConfigFile: firstPresent(rootFiles, NEXT_CONFIG_CANDIDATES), nextIrisDevExists: io.exists(NEXT_IRIS_DEV), @@ -106,7 +116,7 @@ function report(plan: Plan, dryRun: boolean, failed: ReadonlySet, io: In const status = downgraded ? StepStatus.MANUAL : s.status; const detail = downgraded && s.exec !== undefined - ? `install failed — run manually: ${s.exec.fallback}` + ? `step failed — run manually: ${s.exec.fallback}` : s.detail; io.print(` [${STATUS_SYMBOL[status]}] ${s.title} → ${s.target}`); if (status === StepStatus.APPLY) applied++; diff --git a/packages/server/src/init/snippets.ts b/packages/server/src/init/snippets.ts index fdad8b1..6caae87 100644 --- a/packages/server/src/init/snippets.ts +++ b/packages/server/src/init/snippets.ts @@ -73,12 +73,4 @@ export function htmlManual(port: number | undefined): string { `; } -/** Printed when an existing .mcp.json can't be parsed (jsonc/comments) and we won't rewrite it. */ -export function mcpManual(entry: Record): string { - return `Couldn't parse your existing .mcp.json (comments or trailing commas?). 
Add this server -to its "mcpServers" object by hand: - - "iris": ${JSON.stringify(entry, null, 2)}`; -} - export const NEXT_IRIS_DEV_PATH = 'app/iris-dev.tsx'; From 1f85bcea2638db0a42d78fead878f9eb37ae7115 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 02:49:28 +0530 Subject: [PATCH 05/33] feat(init): also register the MCP server in Cursor's global config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `iris init` now registers Iris once, globally, for every supported agent it detects — not just Claude: - Claude Code: `claude mcp add iris -s user` (as before) - Cursor: merges an `iris` entry into ~/.cursor/mcp.json (Cursor has no CLI, but its global MCP config is a small dedicated file — safe to merge, unlike ~/.claude.json). Idempotent; never clobbers an existing entry; bails to manual on unparseable jsonc. Detection is per-agent (claude CLI present / ~/.cursor exists), so each gets its own step and only installed agents are touched. If neither is found, a single manual instruction is printed. --port flows into both registrations. The init IO gained homeDir() and absolute-path support for writing the global Cursor file. Verified end-to-end in an isolated HOME: with Cursor present, init writes ~/.cursor/mcp.json (and registers Claude too); a re-run reports ALREADY for both. Confirmed the real ~/.claude.json and ~/.cursor configs were untouched. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/getting-started.md | 7 +- packages/server/src/init/cursor.test.ts | 54 ++++++++++++++++ packages/server/src/init/cursor.ts | 68 ++++++++++++++++++++ packages/server/src/init/mcp.ts | 15 +++-- packages/server/src/init/node-io.ts | 9 ++- packages/server/src/init/plan.test.ts | 64 +++++++++++++++---- packages/server/src/init/plan.ts | 85 ++++++++++++++++++++----- packages/server/src/init/run.test.ts | 30 +++++++-- packages/server/src/init/run.ts | 19 ++++-- 9 files changed, 301 insertions(+), 50 deletions(-) create mode 100644 packages/server/src/init/cursor.test.ts create mode 100644 packages/server/src/init/cursor.ts diff --git a/docs/getting-started.md b/docs/getting-started.md index 700f5f9..fdc79ff 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -58,8 +58,9 @@ npx @syrin/iris init It detects your framework, package manager, and React version, then: -- **registers the Iris MCP server once, globally** (`claude mcp add iris -s user`) — so every - project on this machine gets it; you never re-add it per project, +- **registers the Iris MCP server once, globally, for each agent you have installed** — Claude + Code (`claude mcp add iris -s user`) and/or Cursor (`~/.cursor/mcp.json`) — so every project on + this machine gets it; you never re-add it per project, - installs `@syrin/iris` as a dev dependency, - **Vite:** adds the `iris()` plugin to your config — which wires source mapping _and_ `iris.connect()` for you, so there is nothing else to edit, @@ -92,7 +93,7 @@ claude mcp add iris -s user -- npx @syrin/iris mcp (`iris init` runs exactly this for you. `-s user` is what makes it global; drop it for a project-local registration instead.) 
-**Cursor** — add to your global `~/.cursor/mcp.json` (not per-project): +**Cursor** — add to your global `~/.cursor/mcp.json` (not per-project; `iris init` writes this for you): ```jsonc { diff --git a/packages/server/src/init/cursor.test.ts b/packages/server/src/init/cursor.test.ts new file mode 100644 index 0000000..cea2c3f --- /dev/null +++ b/packages/server/src/init/cursor.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from 'vitest'; +import { mergeCursorConfig, CursorMergeStatus } from './cursor.js'; + +interface CursorShape { + mcpServers: Record; +} +function parse(content: string): CursorShape { + return JSON.parse(content) as CursorShape; +} + +describe('mergeCursorConfig', () => { + it('creates a fresh global config when none exists', () => { + const r = mergeCursorConfig(null, undefined); + expect(r.status).toBe(CursorMergeStatus.APPLY); + expect(parse(r.content).mcpServers['iris']).toEqual({ + command: 'npx', + args: ['@syrin/iris', 'mcp'], + }); + }); + + it('bakes the port into the args', () => { + const r = mergeCursorConfig(null, 4500); + expect(parse(r.content).mcpServers['iris']?.args).toEqual([ + '@syrin/iris', + 'mcp', + '--port', + '4500', + ]); + }); + + it('preserves other servers', () => { + const r = mergeCursorConfig( + JSON.stringify({ mcpServers: { other: { command: 'x' } } }), + undefined, + ); + const parsed = parse(r.content); + expect(parsed.mcpServers['other']).toEqual({ command: 'x' }); + expect(parsed.mcpServers['iris']).toBeDefined(); + }); + + it('never clobbers an existing iris entry (idempotent)', () => { + const existing = JSON.stringify({ mcpServers: { iris: { command: 'custom' } } }); + const r = mergeCursorConfig(existing, undefined); + expect(r.status).toBe(CursorMergeStatus.ALREADY); + expect(r.content).toBe(existing); + }); + + it('bails to manual on unparseable jsonc without rewriting', () => { + const jsonc = '{\n // servers\n "mcpServers": {}\n}\n'; + const r = mergeCursorConfig(jsonc, undefined); + 
expect(r.status).toBe(CursorMergeStatus.MANUAL); + expect(r.content).toBe(jsonc); + }); +}); diff --git a/packages/server/src/init/cursor.ts b/packages/server/src/init/cursor.ts new file mode 100644 index 0000000..7f77af3 --- /dev/null +++ b/packages/server/src/init/cursor.ts @@ -0,0 +1,68 @@ +/** + * Global Cursor registration for `iris init`. Cursor has no CLI, but its global MCP config is a + * small dedicated file (`~/.cursor/mcp.json`) — safe to merge directly (unlike `~/.claude.json`). + * We add the `iris` server once at this global path so every project picks it up. Unparseable + * (jsonc/comment) files bail to manual rather than being rewritten. + */ + +import { NPX, MCP_SERVER_NAME, npxServerArgs } from './mcp.js'; + +/** Path of Cursor's global MCP config, relative to the user's home directory. */ +export const CURSOR_MCP_RELPATH = '.cursor/mcp.json'; +/** The directory whose presence signals Cursor is installed for this user. */ +export const CURSOR_DIR_RELPATH = '.cursor'; + +export const CursorMergeStatus = { + APPLY: 'apply', + ALREADY: 'already', + MANUAL: 'manual', +} as const; +export type CursorMergeStatus = (typeof CursorMergeStatus)[keyof typeof CursorMergeStatus]; + +export interface CursorMergeResult { + status: CursorMergeStatus; + /** Full file content to write (2-space JSON, trailing newline). Unchanged when not `apply`. 
*/ + content: string; +} + +interface CursorConfigShape { + mcpServers?: Record; + [key: string]: unknown; +} + +export function cursorServerEntry(port: number | undefined): Record { + return { command: NPX, args: npxServerArgs(port) }; +} + +type ParseResult = { ok: true; config: CursorConfigShape } | { ok: false }; + +function parseConfig(existing: string | null): ParseResult { + if (existing === null || existing.trim().length === 0) return { ok: true, config: {} }; + try { + const parsed: unknown = JSON.parse(existing); + if (typeof parsed !== 'object' || parsed === null) return { ok: true, config: {} }; + return { ok: true, config: parsed as CursorConfigShape }; + } catch { + return { ok: false }; + } +} + +export function mergeCursorConfig( + existing: string | null, + port: number | undefined, +): CursorMergeResult { + const parsed = parseConfig(existing); + if (!parsed.ok) { + return { status: CursorMergeStatus.MANUAL, content: existing ?? '' }; + } + const config = parsed.config; + const servers = config.mcpServers ?? {}; + if (Object.prototype.hasOwnProperty.call(servers, MCP_SERVER_NAME)) { + return { status: CursorMergeStatus.ALREADY, content: existing ?? '' }; + } + const merged: CursorConfigShape = { + ...config, + mcpServers: { ...servers, [MCP_SERVER_NAME]: cursorServerEntry(port) }, + }; + return { status: CursorMergeStatus.APPLY, content: `${JSON.stringify(merged, null, 2)}\n` }; +} diff --git a/packages/server/src/init/mcp.ts b/packages/server/src/init/mcp.ts index 567684c..7d8bc27 100644 --- a/packages/server/src/init/mcp.ts +++ b/packages/server/src/init/mcp.ts @@ -7,17 +7,22 @@ */ export const MCP_SERVER_NAME = 'iris'; -const NPX = 'npx'; +export const NPX = 'npx'; const IRIS_PACKAGE = '@syrin/iris'; const MCP_SUBCOMMAND = 'mcp'; const PORT_FLAG = '--port'; export const CLAUDE_CLI = 'claude'; -/** The subprocess that the SDK/agent runs to launch the bridge — the tail after `claude mcp add … --`. 
*/ -function serverInvocation(port: number | undefined): string[] { +/** Args after `npx` that launch the bridge: `@syrin/iris mcp [--port N]`. Shared across agents. */ +export function npxServerArgs(port: number | undefined): string[] { return port === undefined - ? [NPX, IRIS_PACKAGE, MCP_SUBCOMMAND] - : [NPX, IRIS_PACKAGE, MCP_SUBCOMMAND, PORT_FLAG, String(port)]; + ? [IRIS_PACKAGE, MCP_SUBCOMMAND] + : [IRIS_PACKAGE, MCP_SUBCOMMAND, PORT_FLAG, String(port)]; +} + +/** The full `npx …` invocation — the tail after `claude mcp add … --`. */ +function serverInvocation(port: number | undefined): string[] { + return [NPX, ...npxServerArgs(port)]; } export interface ClaudeAddCommand { diff --git a/packages/server/src/init/node-io.ts b/packages/server/src/init/node-io.ts index 92ab46a..7b3c69b 100644 --- a/packages/server/src/init/node-io.ts +++ b/packages/server/src/init/node-io.ts @@ -5,12 +5,14 @@ */ import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'node:fs'; -import { join, dirname } from 'node:path'; +import { join, dirname, isAbsolute } from 'node:path'; +import { homedir } from 'node:os'; import { spawnSync } from 'node:child_process'; import type { InitIo } from './run.js'; export function buildNodeIo(cwd: string): InitIo { - const abs = (rel: string): string => join(cwd, rel); + // Project-relative by default; absolute paths (e.g. ~/.cursor/mcp.json) pass through unchanged. + const abs = (rel: string): string => (isAbsolute(rel) ? 
rel : join(cwd, rel)); return { readFile(rel) { const path = abs(rel); @@ -25,6 +27,9 @@ export function buildNodeIo(cwd: string): InitIo { exists(rel) { return existsSync(abs(rel)); }, + homeDir() { + return homedir(); + }, rootFiles() { return readdirSync(cwd).filter((name) => { try { diff --git a/packages/server/src/init/plan.test.ts b/packages/server/src/init/plan.test.ts index 70fe287..2cbeb81 100644 --- a/packages/server/src/init/plan.test.ts +++ b/packages/server/src/init/plan.test.ts @@ -2,6 +2,8 @@ import { describe, expect, it } from 'vitest'; import { buildPlan, StepStatus, type PlanInput } from './plan.js'; import { Framework, PackageManager, type Detection } from './detect.js'; +const CLAUDE_STEP = 'MCP server (Claude, global)'; +const CURSOR_STEP = 'MCP server (Cursor, global)'; const MCP_STEP = 'MCP server (global)'; function detection(framework: Framework, reactMajor = 19): Detection { @@ -18,6 +20,9 @@ function input(partial: Partial): PlanInput { detection: partial.detection ?? detection(Framework.VITE), claudeCli: partial.claudeCli ?? true, mcpExists: partial.mcpExists ?? false, + cursorPresent: partial.cursorPresent ?? false, + cursorConfig: partial.cursorConfig ?? null, + cursorConfigPath: partial.cursorConfigPath ?? '/home/u/.cursor/mcp.json', viteConfig: partial.viteConfig ?? null, nextConfigFile: partial.nextConfigFile ?? null, nextIrisDevExists: partial.nextIrisDevExists ?? 
false, @@ -25,6 +30,10 @@ function input(partial: Partial): PlanInput { }; } +function maybeStep(plan: ReturnType, title: string) { + return plan.steps.find((x) => x.title === title); +} + const VITE_SRC = `import { defineConfig } from 'vite'; import react from '@vitejs/plugin-react'; export default defineConfig({ plugins: [react()] }); @@ -36,9 +45,9 @@ function step(plan: ReturnType, title: string) { return s; } -describe('buildPlan — MCP (global, claude user scope)', () => { - it('registers iris globally via an exec step when the claude CLI is present', () => { - const s = step(buildPlan(input({ claudeCli: true, mcpExists: false })), MCP_STEP); +describe('buildPlan — MCP (global, per detected agent)', () => { + it('registers with Claude via an exec step when the claude CLI is present', () => { + const s = step(buildPlan(input({ claudeCli: true, mcpExists: false })), CLAUDE_STEP); expect(s.status).toBe(StepStatus.APPLY); expect(s.exec?.command).toBe('claude'); expect(s.exec?.args).toEqual([ @@ -54,13 +63,36 @@ describe('buildPlan — MCP (global, claude user scope)', () => { ]); }); - it('is ALREADY (idempotent) when an iris server is already registered', () => { - const s = step(buildPlan(input({ claudeCli: true, mcpExists: true })), MCP_STEP); + it('Claude step is ALREADY (idempotent) when iris is already registered', () => { + const s = step(buildPlan(input({ claudeCli: true, mcpExists: true })), CLAUDE_STEP); expect(s.status).toBe(StepStatus.ALREADY); }); - it('bails to manual global instructions when the claude CLI is missing', () => { - const s = step(buildPlan(input({ claudeCli: false, mcpExists: false })), MCP_STEP); + it('registers with Cursor by writing its global config when Cursor is present', () => { + const plan = buildPlan(input({ claudeCli: false, cursorPresent: true, cursorConfig: null })); + const s = step(plan, CURSOR_STEP); + expect(s.status).toBe(StepStatus.APPLY); + expect(s.write?.path).toBe('/home/u/.cursor/mcp.json'); + 
expect(s.write?.content).toContain('@syrin/iris'); + }); + + it('registers with BOTH agents when both are present', () => { + const plan = buildPlan(input({ claudeCli: true, cursorPresent: true, cursorConfig: null })); + expect(maybeStep(plan, CLAUDE_STEP)).toBeDefined(); + expect(maybeStep(plan, CURSOR_STEP)).toBeDefined(); + }); + + it('Cursor step is ALREADY when iris is already in its config', () => { + const existing = JSON.stringify({ mcpServers: { iris: { command: 'x' } } }); + const plan = buildPlan( + input({ claudeCli: false, cursorPresent: true, cursorConfig: existing }), + ); + expect(step(plan, CURSOR_STEP).status).toBe(StepStatus.ALREADY); + }); + + it('falls back to a single manual step when no agent is detected', () => { + const plan = buildPlan(input({ claudeCli: false, cursorPresent: false })); + const s = step(plan, MCP_STEP); expect(s.status).toBe(StepStatus.MANUAL); expect(s.detail).toContain('-s user'); }); @@ -73,13 +105,17 @@ describe('buildPlan — MCP (global, claude user scope)', () => { expect(s.status).toBe(StepStatus.SKIP); }); - it('bakes --port into the global registration', () => { - const s = step( - buildPlan(input({ options: { port: 5000, mcp: true, install: false } })), - MCP_STEP, + it('bakes --port into both agents’ registration', () => { + const plan = buildPlan( + input({ + claudeCli: true, + cursorPresent: true, + cursorConfig: null, + options: { port: 5000, mcp: true, install: false }, + }), ); - expect(s.exec?.args).toContain('--port'); - expect(s.exec?.args).toContain('5000'); + expect(step(plan, CLAUDE_STEP).exec?.args).toContain('5000'); + expect(step(plan, CURSOR_STEP).write?.content).toContain('5000'); }); }); @@ -142,7 +178,7 @@ describe('buildPlan — Next', () => { describe('buildPlan — HTML', () => { it('registers MCP globally plus a manual connect snippet', () => { const plan = buildPlan(input({ detection: detection(Framework.HTML, 0) })); - expect(step(plan, MCP_STEP).status).toBe(StepStatus.APPLY); + 
expect(step(plan, CLAUDE_STEP).status).toBe(StepStatus.APPLY); expect(step(plan, 'Connect snippet').status).toBe(StepStatus.MANUAL); }); }); diff --git a/packages/server/src/init/plan.ts b/packages/server/src/init/plan.ts index 51b8d5a..7f3f39b 100644 --- a/packages/server/src/init/plan.ts +++ b/packages/server/src/init/plan.ts @@ -6,6 +6,7 @@ import { Framework, installCommand, installCommandParts, type Detection } from './detect.js'; import { claudeAddCommand, mcpManual } from './mcp.js'; +import { mergeCursorConfig, CursorMergeStatus, cursorServerEntry } from './cursor.js'; import { patchViteConfig, VitePatchKind } from './vite-config.js'; import { viteManual, @@ -47,8 +48,14 @@ export interface PlanInput { detection: Detection; /** Whether the `claude` CLI is installed (so we can register the MCP server globally). */ claudeCli: boolean; - /** Whether an `iris` MCP server is already registered (any scope) — skip to stay idempotent. */ + /** Whether an `iris` MCP server is already registered with Claude (any scope) — idempotency. */ mcpExists: boolean; + /** Whether Cursor is installed for this user (its global config dir exists). */ + cursorPresent: boolean; + /** Current ~/.cursor/mcp.json content, or null if absent. */ + cursorConfig: string | null; + /** Absolute path of ~/.cursor/mcp.json (the write target). */ + cursorConfigPath: string; /** Discovered Vite config: its path + source, or null if none found. */ viteConfig: { path: string; source: string } | null; /** Discovered Next config filename (e.g. 'next.config.mjs'), or null. 
*/ @@ -58,31 +65,22 @@ export interface PlanInput { options: { port: number | undefined; mcp: boolean; install: boolean }; } -const MCP_TITLE = 'MCP server (global)'; +const CLAUDE_MCP_TITLE = 'MCP server (Claude, global)'; +const CURSOR_MCP_TITLE = 'MCP server (Cursor, global)'; -function mcpStep(input: PlanInput): Step { - if (!input.options.mcp) { - return { title: MCP_TITLE, target: MCP_TARGET, status: StepStatus.SKIP, detail: '--no-mcp' }; - } +function claudeMcpStep(input: PlanInput): Step | null { + if (!input.claudeCli) return null; if (input.mcpExists) { return { - title: MCP_TITLE, + title: CLAUDE_MCP_TITLE, target: MCP_TARGET, status: StepStatus.ALREADY, detail: 'iris already registered (install once, used by every project)', }; } - if (!input.claudeCli) { - return { - title: MCP_TITLE, - target: MCP_TARGET, - status: StepStatus.MANUAL, - detail: mcpManual(input.options.port), - }; - } const cmd = claudeAddCommand(input.options.port); return { - title: MCP_TITLE, + title: CLAUDE_MCP_TITLE, target: MCP_TARGET, status: StepStatus.APPLY, detail: 'register iris globally for all projects', @@ -90,6 +88,59 @@ function mcpStep(input: PlanInput): Step { }; } +function cursorMcpStep(input: PlanInput): Step | null { + if (!input.cursorPresent) return null; + const r = mergeCursorConfig(input.cursorConfig, input.options.port); + if (r.status === CursorMergeStatus.ALREADY) { + return { + title: CURSOR_MCP_TITLE, + target: input.cursorConfigPath, + status: StepStatus.ALREADY, + detail: 'iris already in Cursor global config', + }; + } + if (r.status === CursorMergeStatus.MANUAL) { + return { + title: CURSOR_MCP_TITLE, + target: input.cursorConfigPath, + status: StepStatus.MANUAL, + detail: `couldn't parse ${input.cursorConfigPath} — add this server by hand:\n "iris": ${JSON.stringify(cursorServerEntry(input.options.port))}`, + }; + } + return { + title: CURSOR_MCP_TITLE, + target: input.cursorConfigPath, + status: StepStatus.APPLY, + detail: 'register iris in Cursor 
global config', + write: { path: input.cursorConfigPath, content: r.content }, + }; +} + +/** One global registration per detected agent (Claude + Cursor). Falls back to a manual note. */ +function mcpSteps(input: PlanInput): Step[] { + if (!input.options.mcp) { + return [ + { + title: 'MCP server (global)', + target: MCP_TARGET, + status: StepStatus.SKIP, + detail: '--no-mcp', + }, + ]; + } + const steps = [claudeMcpStep(input), cursorMcpStep(input)].filter((s): s is Step => s !== null); + if (steps.length > 0) return steps; + // No supported agent detected — print the one-time global instructions. + return [ + { + title: 'MCP server (global)', + target: MCP_TARGET, + status: StepStatus.MANUAL, + detail: mcpManual(input.options.port), + }, + ]; +} + function installStep(input: PlanInput): Step { const pm = input.detection.packageManager; const command = installCommand(pm, IRIS_PACKAGE); @@ -190,7 +241,7 @@ function nextSteps(input: PlanInput): Step[] { } export function buildPlan(input: PlanInput): Plan { - const steps: Step[] = [mcpStep(input), installStep(input)]; + const steps: Step[] = [...mcpSteps(input), installStep(input)]; if (input.detection.framework === Framework.VITE) { steps.push(...viteSteps(input)); } else if (input.detection.framework === Framework.NEXT) { diff --git a/packages/server/src/init/run.test.ts b/packages/server/src/init/run.test.ts index 5b1168f..20a35cb 100644 --- a/packages/server/src/init/run.test.ts +++ b/packages/server/src/init/run.test.ts @@ -7,26 +7,33 @@ interface MemoryIo extends InitIo { execCalls: { command: string; args: readonly string[] }[]; } +const HOME = '/home/u'; + interface MemoryOpts { execOk?: boolean; claudeAvailable?: boolean; mcpExists?: boolean; + cursor?: boolean; } function memoryIo(files: Record, opts: MemoryOpts = {}): MemoryIo { - const { execOk = true, claudeAvailable = true, mcpExists = false } = opts; + const { execOk = true, claudeAvailable = true, mcpExists = false, cursor = false } = opts; const 
written: Record = {}; const lines: string[] = []; const execCalls: { command: string; args: readonly string[] }[] = []; + // Simulate the Cursor config dir existing when requested. + const present = { ...files }; + if (cursor) present[`${HOME}/.cursor`] = ''; return { written, lines, execCalls, - readFile: (p) => files[p] ?? written[p] ?? null, + readFile: (p) => present[p] ?? written[p] ?? null, writeFile: (p, c) => { written[p] = c; }, - exists: (p) => p in files || p in written, + exists: (p) => p in present || p in written, + homeDir: () => HOME, rootFiles: () => Object.keys(files).filter((p) => !p.includes('/')), exec: (command, args) => { execCalls.push({ command, args }); @@ -73,13 +80,26 @@ describe('runInit', () => { expect(io.execCalls.some((c) => c.command === 'claude')).toBe(false); }); - it('prints manual global instructions when the claude CLI is missing', () => { - const io = memoryIo(VITE_FILES, { claudeAvailable: false }); + it('prints manual global instructions when no agent is detected', () => { + const io = memoryIo(VITE_FILES, { claudeAvailable: false, cursor: false }); runInit(OPTS, io); expect(io.execCalls.some((c) => c.command === 'claude' && c.args.includes('add'))).toBe(false); expect(io.lines.join('\n')).toContain('-s user'); }); + it('registers in Cursor global config when Cursor is present', () => { + const io = memoryIo(VITE_FILES, { claudeAvailable: false, cursor: true }); + runInit(OPTS, io); + expect(io.written['/home/u/.cursor/mcp.json']).toContain('@syrin/iris'); + }); + + it('registers with BOTH Claude and Cursor when both are present', () => { + const io = memoryIo(VITE_FILES, { claudeAvailable: true, cursor: true }); + runInit(OPTS, io); + expect(io.execCalls.some((c) => c.command === 'claude' && c.args.includes('add'))).toBe(true); + expect(io.written['/home/u/.cursor/mcp.json']).toContain('@syrin/iris'); + }); + it('dry run writes nothing and runs no subprocess', () => { const io = memoryIo(VITE_FILES); const r = runInit({ 
...OPTS, dryRun: true }, io); diff --git a/packages/server/src/init/run.ts b/packages/server/src/init/run.ts index a63db5c..173699b 100644 --- a/packages/server/src/init/run.ts +++ b/packages/server/src/init/run.ts @@ -7,6 +7,7 @@ import { detect, Framework, type DetectInput } from './detect.js'; import { buildPlan, StepStatus, type Plan, type PlanInput } from './plan.js'; import { claudeAvailableProbe, claudeExistsProbe } from './mcp.js'; +import { CURSOR_DIR_RELPATH, CURSOR_MCP_RELPATH } from './cursor.js'; const PACKAGE_JSON = 'package.json'; const NEXT_IRIS_DEV = 'app/iris-dev.tsx'; @@ -32,11 +33,13 @@ export interface InitOptions { } export interface InitIo { - /** Returns file content or null if it does not exist. Path is project-relative. */ + /** Returns file content or null if it does not exist. Path is project-relative or absolute. */ readFile(relPath: string): string | null; - /** Writes content, creating parent directories. Path is project-relative. */ + /** Writes content, creating parent directories. Path is project-relative or absolute. */ writeFile(relPath: string, content: string): void; exists(relPath: string): boolean; + /** The user's home directory (for global agent config like ~/.cursor/mcp.json). */ + homeDir(): string; /** Basenames present in the project root. */ rootFiles(): readonly string[]; /** Runs a subprocess to completion (inherits stdio); returns true on exit code 0. */ @@ -79,17 +82,25 @@ function gatherPlanInput(options: InitOptions, io: InitIo, pkgRaw: string): Plan const viteConfig = vitePath !== null && viteSource !== null ? { path: vitePath, source: viteSource } : null; - // Global MCP registration goes through the `claude` CLI; probe for it (and for an existing - // registration) only when the MCP step is in play. + // Global MCP registration targets each agent that's present: Claude via its CLI, Cursor via its + // global config file. Only probe when the MCP step is in play. 
const availableProbe = claudeAvailableProbe(); const claudeCli = options.mcp ? io.probe(availableProbe.command, availableProbe.args) : false; const existsProbe = claudeExistsProbe(); const mcpExists = claudeCli ? io.probe(existsProbe.command, existsProbe.args) : false; + const cursorDir = `${io.homeDir()}/${CURSOR_DIR_RELPATH}`; + const cursorConfigPath = `${io.homeDir()}/${CURSOR_MCP_RELPATH}`; + const cursorPresent = options.mcp && io.exists(cursorDir); + const cursorConfig = cursorPresent ? io.readFile(cursorConfigPath) : null; + return { detection, claudeCli, mcpExists, + cursorPresent, + cursorConfig, + cursorConfigPath, viteConfig, nextConfigFile: firstPresent(rootFiles, NEXT_CONFIG_CANDIDATES), nextIrisDevExists: io.exists(NEXT_IRIS_DEV), From a538846e2b2df0f7a0250592f8a513f7220efce7 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 03:29:14 +0530 Subject: [PATCH 06/33] =?UTF-8?q?feat(flows):=20grade=20flow=20assertions?= =?UTF-8?q?=20=E2=80=94=20flag=20assertion-free=20/=20presence-only=20flow?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Iris now tells an agent, the moment it saves a flow, whether that flow actually asserts an observable consequence — or whether it will pass even when the feature is broken. iris_flow_save returns assertions.grade: - asserted — a step or the success end-condition asserts a signal/network consequence (a wrong element can't fake it) - presence-only — only element-presence checks; a locator healed to the WRONG element can still pass (the failure self-healing vendors admit) - assertion-free — acts but asserts nothing observable For non-"asserted" grades it returns a warning telling the agent to add a consequence assertion (iris_annotate assert-signal / assert-net / success-state). Grounded in: Fowler "Assertion-Free Testing" (100% coverage, 0 assertions), Kent C. 
Dodds "make your test fail", mabl/qate.ai (healed-wrong-locator ships bugs green), and AI agents agreeing with human pass/fail only ~68% of the time — so the flow must carry a real oracle, not rely on the agent eyeballing success. First slice of V1-ROADMAP M1 (intent-anchored assertions). Pure classifier + 7 unit tests; wired into iris_flow_save. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/flows/flow-classify.test.ts | 74 ++++++++++++ packages/server/src/flows/flow-classify.ts | 107 ++++++++++++++++++ packages/server/src/flows/flow-tools.ts | 25 +++- 3 files changed, 202 insertions(+), 4 deletions(-) create mode 100644 packages/server/src/flows/flow-classify.test.ts create mode 100644 packages/server/src/flows/flow-classify.ts diff --git a/packages/server/src/flows/flow-classify.test.ts b/packages/server/src/flows/flow-classify.test.ts new file mode 100644 index 0000000..e8bd3f1 --- /dev/null +++ b/packages/server/src/flows/flow-classify.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, it } from 'vitest'; +import { FLOW_FILE_VERSION, AnchorKind } from '@syrin/iris-protocol'; +import type { FlowFile, FlowStep, FlowExpect } from '@syrin/iris-protocol'; +import { IrisTool } from '../tools/tool-names.js'; +import { classifyFlowAssertions, FlowAssertionGrade } from './flow-classify.js'; + +function step(expect?: FlowExpect): FlowStep { + const s: FlowStep = { tool: IrisTool.ACT, anchor: { kind: AnchorKind.TESTID, value: 'x' } }; + if (expect !== undefined) s.expect = expect; + return s; +} + +function flow(steps: FlowStep[], success?: FlowExpect): FlowFile { + const f: FlowFile = { version: FLOW_FILE_VERSION, name: 'f', createdAt: 0, steps }; + if (success !== undefined) f.success = success; + return f; +} + +describe('classifyFlowAssertions', () => { + it('flags a flow that acts but asserts nothing as assertion-free', () => { + const c = classifyFlowAssertions(flow([step(), step()])); + expect(c.grade).toBe(FlowAssertionGrade.ASSERTION_FREE); + 
expect(c.hasConsequenceAssertion).toBe(false); + expect(c.totalSteps).toBe(2); + expect(c.warning).toContain('asserts no observable consequence'); + }); + + it('flags element-only checks as presence-only (a healed wrong locator could pass)', () => { + const c = classifyFlowAssertions(flow([step(), step({ element: { testid: 'panel' } })])); + expect(c.grade).toBe(FlowAssertionGrade.PRESENCE_ONLY); + expect(c.hasConsequenceAssertion).toBe(false); + expect(c.weakSteps).toBe(1); + expect(c.warning).toContain('element presence'); + }); + + it('treats a signal assertion as a real consequence', () => { + const c = classifyFlowAssertions(flow([step({ signal: 'order:placed' })])); + expect(c.grade).toBe(FlowAssertionGrade.ASSERTED); + expect(c.hasConsequenceAssertion).toBe(true); + expect(c.consequenceSteps).toBe(1); + expect(c.warning).toBeUndefined(); + }); + + it('treats a network assertion as a real consequence', () => { + const c = classifyFlowAssertions( + flow([step({ net: { urlContains: '/api/order', status: 200 } })]), + ); + expect(c.grade).toBe(FlowAssertionGrade.ASSERTED); + expect(c.consequenceSteps).toBe(1); + }); + + it('counts a consequence success end-condition even with no step expects', () => { + const c = classifyFlowAssertions(flow([step(), step()], { signal: 'checkout:done' })); + expect(c.grade).toBe(FlowAssertionGrade.ASSERTED); + expect(c.successIsConsequence).toBe(true); + }); + + it('an element-only success is still presence-only', () => { + const c = classifyFlowAssertions(flow([step()], { element: { testid: 'thanks' } })); + expect(c.grade).toBe(FlowAssertionGrade.PRESENCE_ONLY); + expect(c.successIsConsequence).toBe(false); + }); + + it('counts expects on act_sequence sub-steps', () => { + const seq: FlowStep = { + tool: IrisTool.ACT_SEQUENCE, + anchor: { kind: AnchorKind.TESTID, value: 'x' }, + steps: [step(), step({ signal: 'saved' })], + }; + const c = classifyFlowAssertions(flow([seq])); + expect(c.hasConsequenceAssertion).toBe(true); + 
expect(c.consequenceSteps).toBe(1); + }); +}); diff --git a/packages/server/src/flows/flow-classify.ts b/packages/server/src/flows/flow-classify.ts new file mode 100644 index 0000000..3b69a3f --- /dev/null +++ b/packages/server/src/flows/flow-classify.ts @@ -0,0 +1,107 @@ +/** + * Classify whether a flow asserts an observable CONSEQUENCE or is "assertion-free" / presence-only. + * + * Why this exists (grounded in real testing behavior): + * - Martin Fowler, *Assertion-Free Testing*: teams hit 100% coverage with tests that "weren't any + * assertions" — green, but verifying nothing. + * - Kent C. Dodds, *Make Your Test Fail*: a test that doesn't fail when you break the code gives + * false security. + * - Self-healing vendors (mabl, qate.ai) admit a locator healed to the WRONG element makes a test + * pass green while a real regression ships — but only if the test merely checks presence. + * - AI agents agree with human pass/fail only ~68% of the time (arXiv 2510.02418), so the flow + * itself must carry a real oracle, not rely on the agent eyeballing success. + * + * For a flow, a FlowExpect can assert a `signal` (app emitted an event), a `net` call, or just an + * `element` presence. signal/net are OBSERVABLE CONSEQUENCES — they can't be satisfied by a wrong + * element. element-only is WEAK — a healed-but-wrong locator can still satisfy it. A flow with no + * expect on any step and no success end-condition asserts nothing at all. + * + * Pure: no IO, no clock. + */ + +import type { FlowExpect, FlowFile, FlowStep } from '@syrin/iris-protocol'; + +export const FlowAssertionGrade = { + /** At least one step (or the success end-condition) asserts a signal/network consequence. */ + ASSERTED: 'asserted', + /** Only element-presence checks — a healed-but-wrong locator could still pass. */ + PRESENCE_ONLY: 'presence-only', + /** Performs actions but asserts nothing observable — passes even if the feature is broken.
*/ + ASSERTION_FREE: 'assertion-free', +} as const; +export type FlowAssertionGrade = (typeof FlowAssertionGrade)[keyof typeof FlowAssertionGrade]; + +export interface FlowAssertionClassification { + grade: FlowAssertionGrade; + /** True when at least one signal/net assertion exists (step-level or success). */ + hasConsequenceAssertion: boolean; + totalSteps: number; + consequenceSteps: number; + weakSteps: number; + successIsConsequence: boolean; + /** Present for presence-only / assertion-free flows: how to make the flow a real test. */ + warning?: string; +} + +const ASSERTION_FREE_WARNING = + 'This flow performs actions but asserts no observable consequence — it will pass even if the feature is broken. Add a consequence assertion with iris_annotate (assert-signal / assert-net) or a success-state.'; +const PRESENCE_ONLY_WARNING = + 'This flow only checks element presence, not an observable consequence (signal/network). A locator healed to the wrong element can still pass it. Add a consequence assertion (assert-signal / assert-net / success-state).'; + +/** signal or net present → the expect verifies a consequence a wrong element cannot fake. */ +function expectIsConsequence(e: FlowExpect | undefined): boolean { + return e !== undefined && (e.signal !== undefined || e.net !== undefined); +} + +/** element-only (no signal/net) → presence check, weak. */ +function expectIsWeak(e: FlowExpect | undefined): boolean { + return ( + e !== undefined && e.element !== undefined && e.signal === undefined && e.net === undefined + ); +} + +/** Walk steps + act_sequence sub-steps so an expect on either level is counted. 
*/ +function flattenSteps(steps: readonly FlowStep[]): FlowStep[] { + const out: FlowStep[] = []; + for (const s of steps) { + out.push(s); + if (s.steps !== undefined) out.push(...flattenSteps(s.steps)); + } + return out; +} + +export function classifyFlowAssertions(flow: FlowFile): FlowAssertionClassification { + const all = flattenSteps(flow.steps); + let consequenceSteps = 0; + let weakSteps = 0; + for (const s of all) { + if (expectIsConsequence(s.expect)) consequenceSteps++; + else if (expectIsWeak(s.expect)) weakSteps++; + } + const successIsConsequence = expectIsConsequence(flow.success); + const successIsWeak = expectIsWeak(flow.success); + const hasConsequenceAssertion = consequenceSteps > 0 || successIsConsequence; + const hasAnyAssertion = hasConsequenceAssertion || weakSteps > 0 || successIsWeak; + + let grade: FlowAssertionGrade; + let warning: string | undefined; + if (hasConsequenceAssertion) { + grade = FlowAssertionGrade.ASSERTED; + } else if (hasAnyAssertion) { + grade = FlowAssertionGrade.PRESENCE_ONLY; + warning = PRESENCE_ONLY_WARNING; + } else { + grade = FlowAssertionGrade.ASSERTION_FREE; + warning = ASSERTION_FREE_WARNING; + } + + return { + grade, + hasConsequenceAssertion, + totalSteps: all.length, + consequenceSteps, + weakSteps, + successIsConsequence, + ...(warning !== undefined ? 
{ warning } : {}), + }; +} diff --git a/packages/server/src/flows/flow-tools.ts b/packages/server/src/flows/flow-tools.ts index dbac2c7..818e1a2 100644 --- a/packages/server/src/flows/flow-tools.ts +++ b/packages/server/src/flows/flow-tools.ts @@ -19,6 +19,7 @@ import { import { IrisTool } from '../tools/tool-names.js'; import { asString } from '../tools/tools-helpers.js'; import { replayFlow } from './flow-replay.js'; +import { classifyFlowAssertions } from './flow-classify.js'; import { collectProposals } from './heal.js'; import { waitForPredicate } from '../events/predicate.js'; import type { FlowAnnotations } from './flows.js'; @@ -89,7 +90,7 @@ export const FLOW_TOOLS: ToolDef[] = [ { name: IrisTool.FLOW_SAVE, description: - 'Persist the last/active recording (by name) as a git-checked, anchor-resolved flow at .iris/flows/.json. Each step is bound to a SEMANTIC anchor (testid/role/signal), never a volatile ref; steps without a resolvable testid are kept with degraded:true (a "add a data-testid here" marker) rather than dropped. Returns { name, stepCount, degraded, empty } or { error, code }.', + 'Persist the last/active recording (by name) as a git-checked, anchor-resolved flow at .iris/flows/.json. Each step is bound to a SEMANTIC anchor (testid/role/signal), never a volatile ref; steps without a resolvable testid are kept with degraded:true (a "add a data-testid here" marker) rather than dropped. 
Returns { name, stepCount, degraded, empty, assertions } — `assertions.grade` is asserted | presence-only | assertion-free: a flow that only acts (or only checks element presence) will pass even if the feature breaks, so when grade is not "asserted" follow assertions.warning and add a consequence assertion via iris_annotate (assert-signal / assert-net / success-state).', inputSchema: { flowName: z .string() @@ -102,6 +103,16 @@ export const FLOW_TOOLS: ToolDef[] = [ path: z.string(), stepCount: z.number().optional(), degraded: z.number().optional(), + assertions: z + .object({ + grade: z.string().describe('asserted | presence-only | assertion-free'), + hasConsequenceAssertion: z.boolean(), + totalSteps: z.number(), + consequenceSteps: z.number(), + weakSteps: z.number(), + warning: z.string().optional(), + }) + .optional(), }, handler: (deps: ToolDeps, args) => { const name = asString(args['flowName']) ?? ''; @@ -119,9 +130,15 @@ export const FLOW_TOOLS: ToolDef[] = [ dynamic: deps.annotations.dynamic(name), ...(success !== undefined ? { success } : {}), }; - return deps.flows.save(program, annotations).then((res) => { - if (res.ok) deps.annotations.clear(name); - return res.ok ? res.value : { error: flowErrorMessage(res.code), code: res.code }; + return deps.flows.save(program, annotations).then(async (res) => { + if (!res.ok) return { error: flowErrorMessage(res.code), code: res.code }; + deps.annotations.clear(name); + // Grade the saved flow's assertions so the agent learns immediately if it just saved a flow + // that asserts nothing observable (passes even when the feature is broken). + const loaded = await deps.flows.load(res.value.name); + return loaded.ok + ? 
{ ...res.value, assertions: classifyFlowAssertions(loaded.value) } + : res.value; }); }, }, From 840c65a411f43261953c7edb2f65ba5a98156b38 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 04:02:06 +0530 Subject: [PATCH 07/33] feat(flows): evaluate flow.success in the live iris_flow_replay (shared oracle) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flow.success was checked only by the @syrin/iris-test CI spec runner — the live iris_flow_replay MCP tool an agent calls never evaluated it, so a replay reported ok even when the end-condition (the actual intent) was never met. Move the success oracle (successToPredicate + assertSuccess) down into iris-server as flow-success.ts so ONE implementation is shared by both the MCP tool and the spec runner — no divergent oracle. @syrin/iris-test/success-assert now re-exports it (its 12 tests still pass against the moved code). iris_flow_replay now, after every step runs clean, asserts the success end-condition as a real consequence: a signal/net success that never fires makes the replay status:error "intent not satisfied" even though all locators resolved — catching the regression a healed-but-wrong locator ships green (mabl/qate.ai). Dynamic (LLM-output) success fields are skipped, symmetric with the step layer. A `success` result row is appended so the agent sees the verdict. V1-ROADMAP M1 slice 2 (corrected: shared oracle, not a duplicate in replayFlow). Discovered + fixed via the full suite catching a turbo-cached masking failure. 8 new server tests; iris-server 488, iris-test 97; all 16 tasks green uncached. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/flows/flow-success.test.ts | 102 ++++++++++++++++++ packages/server/src/flows/flow-success.ts | 95 ++++++++++++++++ packages/server/src/flows/flow-tools.ts | 23 ++++ packages/server/src/index.ts | 8 ++ packages/test/src/success-assert.ts | 74 +------------ 5 files changed, 232 insertions(+), 70 deletions(-) create mode 100644 packages/server/src/flows/flow-success.test.ts create mode 100644 packages/server/src/flows/flow-success.ts diff --git a/packages/server/src/flows/flow-success.test.ts b/packages/server/src/flows/flow-success.test.ts new file mode 100644 index 0000000..5915a49 --- /dev/null +++ b/packages/server/src/flows/flow-success.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from 'vitest'; +import { EventType, type CommandResult, type IrisEvent } from '@syrin/iris-protocol'; +import { assertSuccess, successToPredicate, dynamicTestids } from './flow-success.js'; +import { waitForPredicate } from '../events/predicate.js'; +import type { FlowReplaySession } from './flow-replay.js'; + +/** Minimal session: scripted events drive signal/net predicates; QUERY answers element presence. */ +function session(events: IrisEvent[], elementPresent = true): FlowReplaySession { + return { + command: (name): Promise => + Promise.resolve({ + kind: 'command_result', + id: 'q', + ok: true, + result: name === 'query' ? { elements: elementPresent ? 
[{ ref: 'e1' }] : [] } : {}, + } as CommandResult), + eventsSince: () => events, + onEvent: () => () => undefined, + }; +} + +const FAST = 40; +const NONE = new Set(); +const sig = (name: string): IrisEvent => ({ + t: 1, + type: EventType.SIGNAL, + sessionId: 's', + data: { name }, +}); + +describe('successToPredicate', () => { + it('compiles a signal success', () => { + expect(successToPredicate({ signal: 'order:placed' }, NONE)).toEqual({ + kind: 'signal', + name: 'order:placed', + }); + }); + + it('skips a dynamic-marked element testid (presence-only → vacuously met)', () => { + expect( + successToPredicate({ element: { testid: 'caption' } }, new Set(['caption'])), + ).toBeUndefined(); + }); + + it('combines multiple fields with allOf', () => { + const p = successToPredicate({ signal: 's', net: { urlContains: '/api' } }, NONE); + expect(p?.kind).toBe('allOf'); + }); +}); + +describe('assertSuccess — green only when the consequence holds', () => { + it('passes when the success signal fires', async () => { + const r = await assertSuccess( + session([sig('checkout-done')]), + { signal: 'checkout-done' }, + NONE, + waitForPredicate, + FAST, + ); + expect(r.pass).toBe(true); + }); + + it('FAILS when the success signal never fires (broken Pay-now: steps green, consequence absent)', async () => { + const r = await assertSuccess( + session([]), + { signal: 'checkout-done' }, + NONE, + waitForPredicate, + FAST, + ); + expect(r.pass).toBe(false); + }); + + it('is vacuously met when no success is declared', async () => { + const r = await assertSuccess(session([]), undefined, NONE, waitForPredicate, FAST); + expect(r.pass).toBe(true); + }); + + it('is vacuously met when the only success field is dynamic-skipped', async () => { + const r = await assertSuccess( + session([]), + { element: { testid: 'cap' } }, + new Set(['cap']), + waitForPredicate, + FAST, + ); + expect(r.pass).toBe(true); + }); +}); + +describe('dynamicTestids', () => { + it('collects testid anchors from 
flow.dynamic', () => { + const set = dynamicTestids({ + version: 1, + name: 'f', + createdAt: 0, + steps: [], + dynamic: [{ kind: 'testid', value: 'cap' }], + }); + expect(set.has('cap')).toBe(true); + }); +}); diff --git a/packages/server/src/flows/flow-success.ts b/packages/server/src/flows/flow-success.ts new file mode 100644 index 0000000..f3d87ce --- /dev/null +++ b/packages/server/src/flows/flow-success.ts @@ -0,0 +1,95 @@ +/** + * Evaluate a flow's `success` end-condition — "green means intent satisfied". + * + * This is the one piece replay does NOT do per-step: turn a flow's success FlowExpect into a + * Predicate and assert it with the SAME waitForPredicate engine the tools/replay use. A signal/net + * success is a real CONSEQUENCE — a locator healed to the wrong element cannot fake it, so it + * catches the regression self-healing tools ship green (mabl/qate.ai). The dynamic skip-set is + * honored exactly as in replay: a success bound to a dynamic (LLM-output) testid is presence-only, + * never asserted, so the skip is symmetric across the step layer and the success layer. + * + * Lives in iris-server (alongside the predicate engine) so BOTH the live MCP `iris_flow_replay` + * tool and the `@syrin/iris-test` spec runner share one implementation — no divergent oracle. + * Pure: no IO, no clock. + */ + +import { AnchorKind, type FlowExpect, type FlowFile } from '@syrin/iris-protocol'; +import type { EvalResult, Predicate } from '../events/predicate.js'; +import type { FlowReplaySession, WaitForSignal } from './flow-replay.js'; + +/** The dynamic (LLM-output) testids whose presence is never asserted — same rule replay uses. */ +export function dynamicTestids(flow: FlowFile): Set { + return new Set( + (flow.dynamic ?? []) + .filter((a) => a.kind === AnchorKind.TESTID) + .map((a) => (a.kind === AnchorKind.TESTID ? a.value : '')), + ); +} + +/** A short human label for the success end-condition, for result rows. 
*/ +export function successLabel(success: FlowExpect): string { + if (success.signal !== undefined) return success.signal; + if (success.net !== undefined) return success.net.urlContains ?? success.net.method ?? 'net'; + return success.element?.testid ?? success.element?.name ?? success.element?.role ?? 'success'; +} + +/** Compile a success FlowExpect into a predicate. undefined → nothing assertable (vacuously met). */ +export function successToPredicate( + success: FlowExpect, + dynamic: ReadonlySet, +): Predicate | undefined { + const parts: Predicate[] = []; + + if (success.signal !== undefined) { + parts.push( + success.signalData !== undefined + ? { kind: 'signal', name: success.signal, dataMatches: success.signalData } + : { kind: 'signal', name: success.signal }, + ); + } + + if (success.net !== undefined) { + const net: Extract = { kind: 'net' }; + if (success.net.method !== undefined) net.method = success.net.method; + if (success.net.urlContains !== undefined) net.urlContains = success.net.urlContains; + if (success.net.status !== undefined) net.status = success.net.status; + parts.push(net); + } + + const element = success.element; + if (element !== undefined) { + const testid = element.testid; + // A dynamic-marked testid is NOT asserted as a success condition (presence-only). + if (testid === undefined || !dynamic.has(testid)) { + const query: Record = {}; + if (testid !== undefined) query['testid'] = testid; + if (element.role !== undefined) query['role'] = element.role; + if (element.name !== undefined) query['name'] = element.name; + if (Object.keys(query).length > 0) parts.push({ kind: 'element', query }); + } + } + + const [first] = parts; + if (parts.length === 0) return undefined; + if (parts.length === 1 && first !== undefined) return first; + return { kind: 'allOf', predicates: parts }; +} + +/** + * Assert a flow's success end-condition after replay. 
Delegates to the injected waitForSignal (the + * real waitForPredicate in production, a fake in unit tests). Passes when: no success was declared, + * OR every success field was dynamic-skipped (vacuously met), OR the compiled predicate held within + * the injected timeout. Never reads the wall clock. + */ +export async function assertSuccess( + session: FlowReplaySession, + success: FlowExpect | undefined, + dynamic: ReadonlySet, + waitForSignal: WaitForSignal, + timeoutMs: number, +): Promise { + if (success === undefined) return { pass: true }; + const predicate = successToPredicate(success, dynamic); + if (predicate === undefined) return { pass: true }; + return waitForSignal(session, predicate, timeoutMs); +} diff --git a/packages/server/src/flows/flow-tools.ts b/packages/server/src/flows/flow-tools.ts index 818e1a2..319830e 100644 --- a/packages/server/src/flows/flow-tools.ts +++ b/packages/server/src/flows/flow-tools.ts @@ -20,7 +20,9 @@ import { IrisTool } from '../tools/tool-names.js'; import { asString } from '../tools/tools-helpers.js'; import { replayFlow } from './flow-replay.js'; import { classifyFlowAssertions } from './flow-classify.js'; +import { assertSuccess, dynamicTestids, successLabel } from './flow-success.js'; import { collectProposals } from './heal.js'; +import type { FlowStepResult } from '@syrin/iris-protocol'; import { waitForPredicate } from '../events/predicate.js'; import type { FlowAnnotations } from './flows.js'; import type { ToolDef, ToolDeps } from '../tools/tools.js'; @@ -228,6 +230,27 @@ export const FLOW_TOOLS: ToolDef[] = [ FLOW_SIGNAL_TIMEOUT_MS, args['confirmDangerous'] === true, ); + // "green means intent satisfied": when every step ran clean, assert the flow's success + // end-condition as a real consequence. A signal/net success that never fires FAILS the replay + // even though all locators resolved — the regression a healed-but-wrong locator ships green. 
+ const stepsClean = steps.length > 0 && steps.every((s) => s.ok && s.drift === undefined); + if (stepsClean && loaded.value.success !== undefined) { + const verdict = await assertSuccess( + session, + loaded.value.success, + dynamicTestids(loaded.value), + waitForPredicate, + FLOW_SIGNAL_TIMEOUT_MS, + ); + const row: FlowStepResult = { + step: steps.length, + tool: 'success', + anchor: successLabel(loaded.value.success), + ok: verdict.pass, + ...(verdict.pass ? {} : { error: verdict.failureReason ?? 'flow.success not satisfied' }), + }; + steps.push(row); + } const driftSteps = steps.filter((s) => s.drift !== undefined).length; const allOk = steps.every((s) => s.ok); const status = diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 4f4ce8f..a11d31d 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -33,6 +33,14 @@ export { RecordingStore } from './flows/recordings.js'; export type { RecordedStep, CompiledProgram } from './flows/recordings.js'; export { FlowStore, recordedStepToFlowStep } from './flows/flows.js'; export type { FlowResult, Clock } from './flows/flows.js'; +export { + assertSuccess, + successToPredicate, + dynamicTestids, + successLabel, +} from './flows/flow-success.js'; +export { classifyFlowAssertions, FlowAssertionGrade } from './flows/flow-classify.js'; +export type { FlowAssertionClassification } from './flows/flow-classify.js'; export { ProjectStore } from './project/project-store.js'; export type { ReadProjectResult } from './project/project-store.js'; export { VisualStore } from './visual/visual-store.js'; diff --git a/packages/test/src/success-assert.ts b/packages/test/src/success-assert.ts index c0d1de4..9e900bc 100644 --- a/packages/test/src/success-assert.ts +++ b/packages/test/src/success-assert.ts @@ -1,72 +1,6 @@ -import type { FlowExpect } from '@syrin/iris-protocol'; -import type { EvalResult, FlowReplaySession, Predicate, WaitForSignal } from '@syrin/iris-server'; -import { 
PredicateKind } from './constants.js'; - /** - * FLOW2SPEC — the one piece replay does NOT do: turn a flow's `success` FlowExpect into a server - * Predicate so it can be asserted with the SAME waitForPredicate engine the tools/replay use. The - * dynamic skip-set is honored here exactly as in replay: a success field bound to a dynamic - * (LLM-output) testid is presence-only, never asserted — so the skip is symmetric across the step - * layer and the success layer. Returns undefined when nothing assertable remains (success then - * holds vacuously). + * FLOW2SPEC — the flow success-assertion oracle now lives in @syrin/iris-server so the live MCP + * `iris_flow_replay` tool and this spec runner share ONE implementation (no divergent oracle). + * Re-exported here to keep the spec-runner's import surface stable. */ -export function successToPredicate( - success: FlowExpect, - dynamic: ReadonlySet, -): Predicate | undefined { - const parts: Predicate[] = []; - - if (success.signal !== undefined) { - parts.push( - success.signalData !== undefined - ? { kind: PredicateKind.SIGNAL, name: success.signal, dataMatches: success.signalData } - : { kind: PredicateKind.SIGNAL, name: success.signal }, - ); - } - - if (success.net !== undefined) { - const net: Extract = { kind: PredicateKind.NET }; - if (success.net.method !== undefined) net.method = success.net.method; - if (success.net.urlContains !== undefined) net.urlContains = success.net.urlContains; - if (success.net.status !== undefined) net.status = success.net.status; - parts.push(net); - } - - const element = success.element; - if (element !== undefined) { - const testid = element.testid; - // A dynamic-marked testid is NOT asserted as a success condition (presence-only). 
- if (testid === undefined || !dynamic.has(testid)) { - const query: Record = {}; - if (testid !== undefined) query['testid'] = testid; - if (element.role !== undefined) query['role'] = element.role; - if (element.name !== undefined) query['name'] = element.name; - if (Object.keys(query).length > 0) { - parts.push({ kind: PredicateKind.ELEMENT, query }); - } - } - } - - if (parts.length === 0) return undefined; - if (parts.length === 1) return parts[0]; - return { kind: 'allOf', predicates: parts }; -} - -/** - * FLOW2SPEC — assert a flow's success end-condition after replay. Delegates evaluation to the - * injected waitForSignal (the real waitForPredicate in CI, a fake in unit tests). Passes when: - * no success was declared, OR every success field was dynamic-skipped (vacuously met), OR the - * compiled predicate held within the injected timeout. Never reads the wall clock. - */ -export async function assertSuccess( - session: FlowReplaySession, - success: FlowExpect | undefined, - dynamic: ReadonlySet, - waitForSignal: WaitForSignal, - timeoutMs: number, -): Promise { - if (success === undefined) return { pass: true }; - const predicate = successToPredicate(success, dynamic); - if (predicate === undefined) return { pass: true }; - return waitForSignal(session, predicate, timeoutMs); -} +export { successToPredicate, assertSuccess } from '@syrin/iris-server'; From 00dcbfeb162555b509a0e8ceff20b9a7d6624085 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 04:29:10 +0530 Subject: [PATCH 08/33] feat(tools): cost preview on iris_snapshot/iris_query so agents bail before huge reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit High-volume read results now carry cost:{ bytes, tokens } (tokens estimated at ~chars/4) so an agent can re-scope BEFORE spending context on a large body — re-scope a snapshot (mode:interactive/status or a tighter `scope`) or narrow a query (`name`/`scope`). 
Pure helpers in output-budget.ts (estimateTokens, sizeCost, withSizeCost) reuse the existing JSON-byte idiom; the wrapper is pure and unit-tested, the handlers just .then(withSizeCost). Grounded in: by ~step 15 a screenshot/Playwright-MCP agent accrues 60–80K tokens of stale accessibility-tree data and starts hallucinating selectors that don't exist; a single screenshot is up to ~4,784 tokens. Giving the agent a cheap size signal up front is the first lever of V1-ROADMAP M2 (token efficiency). 11 output-budget tests (+8); 493 server tests; all 16 tasks green uncached. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/session/output-budget.test.ts | 51 ++++++++++++++++++- packages/server/src/session/output-budget.ts | 38 ++++++++++++++ packages/server/src/tools/tools.ts | 16 ++++-- 3 files changed, 100 insertions(+), 5 deletions(-) diff --git a/packages/server/src/session/output-budget.test.ts b/packages/server/src/session/output-budget.test.ts index deac67b..36ac239 100644 --- a/packages/server/src/session/output-budget.test.ts +++ b/packages/server/src/session/output-budget.test.ts @@ -1,6 +1,12 @@ import { describe, it, expect } from 'vitest'; import { EventType, SessionState, type IrisEvent } from '@syrin/iris-protocol'; -import { applyEventBudget, costHint } from './output-budget.js'; +import { + applyEventBudget, + costHint, + estimateTokens, + sizeCost, + withSizeCost, +} from './output-budget.js'; import { TOOLS } from '../tools/tools.js'; import { IrisTool } from '../tools/tool-names.js'; import type { Session, SessionManager } from './session.js'; @@ -39,6 +45,49 @@ describe('costHint', () => { }); }); +describe('estimateTokens', () => { + it('is ~chars/4 and grows with length', () => { + expect(estimateTokens('')).toBe(0); + expect(estimateTokens('abcd')).toBe(1); + expect(estimateTokens('a'.repeat(400))).toBe(100); + expect(estimateTokens('a'.repeat(1000))).toBeGreaterThan(estimateTokens('a'.repeat(100))); + }); +}); + +describe('sizeCost / 
withSizeCost', () => { + it('reports bytes + an estimated token count for a payload', () => { + const c = sizeCost({ tree: 'a'.repeat(400) }); + expect(c.bytes).toBeGreaterThan(400); + expect(c.tokens).toBeGreaterThan(0); + }); + + it('attaches cost to an object result without dropping fields', () => { + const r = withSizeCost({ tree: 'x', status: { route: '/' } }) as { + tree: string; + status: { route: string }; + cost: { bytes: number; tokens: number }; + }; + expect(r.tree).toBe('x'); + expect(r.status.route).toBe('/'); + expect(r.cost.bytes).toBeGreaterThan(0); + expect(r.cost.tokens).toBeGreaterThanOrEqual(1); + }); + + it('measures the body, not including the cost field it adds', () => { + const big = withSizeCost({ tree: 'a'.repeat(4000) }) as unknown as { + cost: { tokens: number }; + }; + // ~4000 chars of tree + JSON overhead → ~1000 tokens, not inflated by the cost object itself. + expect(big.cost.tokens).toBeGreaterThan(900); + expect(big.cost.tokens).toBeLessThan(1100); + }); + + it('passes non-object results through unchanged', () => { + expect(withSizeCost(null)).toBeNull(); + expect(withSizeCost('err')).toBe('err'); + }); +}); + // ── observe wiring ──────────────────────────────────────────────────────────── function fakeDeps(events: IrisEvent[]): ToolDeps { const stub: Partial = { diff --git a/packages/server/src/session/output-budget.ts b/packages/server/src/session/output-budget.ts index d7e5065..df5ef51 100644 --- a/packages/server/src/session/output-budget.ts +++ b/packages/server/src/session/output-budget.ts @@ -31,3 +31,41 @@ export function costHint(payload: unknown, events: number, droppedOldest = 0): C const bytes = JSON.stringify(payload)?.length ?? 0; return droppedOldest > 0 ? { events, bytes, droppedOldest } : { events, bytes }; } + +/** + * Rough token estimate for a string. 
The exact count is model/tokenizer-specific, but ~4 characters + * per token is the well-known heuristic for English-ish text (and JSON) across GPT/Claude + * tokenizers — accurate enough for the only decision it drives: "is this response big enough that I + * should re-scope before reading it?" Deliberately a cheap, dependency-free approximation, NOT a + * billing-grade count. + */ +const CHARS_PER_TOKEN = 4; +export function estimateTokens(text: string): number { + return Math.ceil(text.length / CHARS_PER_TOKEN); +} + +/** + * A size preview for non-event read results (snapshot, query). Same intent as CostHint but for + * payloads measured by size rather than event count: the agent can bail and re-scope (mode:status, + * a tighter scope, a more specific query) before spending context on a large body. The token + * figure is an estimate (see estimateTokens). + */ +export interface SizeCost { + bytes: number; + tokens: number; +} + +export function sizeCost(payload: unknown): SizeCost { + const json = JSON.stringify(payload) ?? ''; + return { bytes: json.length, tokens: estimateTokens(json) }; +} + +/** + * Attach a `cost` size preview to a read result. Pure: the cost is computed over the result BEFORE + * the cost field is added (so it measures the body the agent will actually read), then merged in. + * Non-object results (e.g. a thrown-error envelope) pass through unchanged. 
+ */ +export function withSizeCost<T>(result: T): T { + if (typeof result !== 'object' || result === null) return result; + return { ...(result as Record<string, unknown>), cost: sizeCost(result) } as T; +} diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index bd8cfc7..8558f15 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -28,7 +28,7 @@ import { consoleEmptyHint, } from '../events/event-filters.js'; import { healthEnvelope, refuseIfThrottled } from '../session/session-health.js'; -import { applyEventBudget, costHint } from '../session/output-budget.js'; +import { applyEventBudget, costHint, withSizeCost } from '../session/output-budget.js'; import { selectPath, capDepth } from '../session/state-select.js'; import { asString, asNumber, asRecord, parseInteractive } from './tools-helpers.js'; import type { FileSystemPort } from '../project/fs-port.js'; @@ -283,7 +283,7 @@ export const TOOLS: ToolDef[] = [ { name: IrisTool.SNAPSHOT, description: - 'Semantic accessibility snapshot of the page or a subtree. mode: full|interactive|status. Use to see what is on screen right now.', + 'Semantic accessibility snapshot of the page or a subtree. mode: full|interactive|status. Use to see what is on screen right now. 
The result carries cost:{ bytes, tokens } (estimated) — if it is large, re-scope (pass `scope`) or use mode:interactive/status instead of reading the whole tree.', inputSchema: { scope: z .string() @@ -305,12 +305,16 @@ export const TOOLS: ToolDef[] = [ .optional() .describe('Indented ARIA tree of every element on the page (or the scoped subtree).'), status: z.object({ route: z.string(), title: z.string().optional() }).optional(), + cost: z + .object({ bytes: z.number(), tokens: z.number() }) + .optional() + .describe('Estimated size of this result — re-scope if large.'), }, handler: (deps, args) => commandOrThrow(deps, asString(args['sessionId']), IrisCommand.SNAPSHOT, { scope: args['scope'], mode: args['mode'] ?? SnapshotMode.FULL, - }), + }).then(withSizeCost), }, { name: IrisTool.QUERY, @@ -356,6 +360,10 @@ export const TOOLS: ToolDef[] = [ .describe( 'Present only on zero matches — tells you what IS on the page so you can diagnose the miss.', ), + cost: z + .object({ bytes: z.number(), tokens: z.number() }) + .optional() + .describe('Estimated size of this result — narrow with `name`/`scope` if large.'), }, handler: (deps, args) => commandOrThrow(deps, asString(args['sessionId']), IrisCommand.QUERY, { @@ -363,7 +371,7 @@ export const TOOLS: ToolDef[] = [ value: args['value'], name: args['name'], scope: args['scope'], - }), + }).then(withSizeCost), }, { name: IrisTool.INSPECT, From 2db5eb9b4d51ca0ceedd433d1141035220906e74 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 04:55:05 +0530 Subject: [PATCH 09/33] =?UTF-8?q?feat(tools):=20diffed=20snapshots=20?= =?UTF-8?q?=E2=80=94=20iris=5Fsnapshot=20diff:true=20returns=20only=20what?= =?UTF-8?q?=20changed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the first snapshot, an agent can pass diff:true to get back only the delta since its last look of the same scope/mode — mode:delta with added/removed lines, or mode:unchanged — instead of the 
full accessibility tree again. A route change automatically resets to a full snapshot (a cross-page delta would be meaningless), so the agent never sees a misleading diff. Why: screenshot/Playwright-MCP agents accrue 60–80K tokens of stale a11y data over a session and start hallucinating selectors that no longer exist. Returning only the change set attacks both the token cost AND that stale-context failure in one move. The agent-facing cost is the MCP result, so the delta is computed server-side, reusing the same normalize+diff the baseline layer uses. Pure decision (snapshotDelta) + a route-invalidated, bounded SnapshotCache + applySnapshotDelta shaper — all unit-tested without a browser. V1-ROADMAP M2. 10 new tests; 503 server tests; all 16 tasks green uncached. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/tools/snapshot-delta.test.ts | 102 ++++++++++++++++++ packages/server/src/tools/snapshot-delta.ts | Bin 0 -> 4751 bytes packages/server/src/tools/tools.ts | 49 ++++++++- 3 files changed, 146 insertions(+), 5 deletions(-) create mode 100644 packages/server/src/tools/snapshot-delta.test.ts create mode 100644 packages/server/src/tools/snapshot-delta.ts diff --git a/packages/server/src/tools/snapshot-delta.test.ts b/packages/server/src/tools/snapshot-delta.test.ts new file mode 100644 index 0000000..e484d5b --- /dev/null +++ b/packages/server/src/tools/snapshot-delta.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from 'vitest'; +import { + snapshotDelta, + SnapshotCache, + applySnapshotDelta, + snapshotCacheKey, + SnapshotDeltaMode, +} from './snapshot-delta.js'; + +const TREE_A = '- button "Save" (ref=e1)\n- button "Cancel" (ref=e2)'; +const TREE_B = '- button "Save" (ref=e9)\n- button "Cancel" (ref=e2)\n- alert "Saved!" 
(ref=e3)'; + +describe('snapshotDelta (pure)', () => { + it('returns full when there is no previous snapshot', () => { + expect(snapshotDelta(undefined, TREE_A).mode).toBe(SnapshotDeltaMode.FULL); + }); + + it('returns unchanged when only ref ids differ (refs are normalized out)', () => { + const refsOnlyChanged = '- button "Save" (ref=e7)\n- button "Cancel" (ref=e8)'; + expect(snapshotDelta(TREE_A, refsOnlyChanged).mode).toBe(SnapshotDeltaMode.UNCHANGED); + }); + + it('returns only the added/removed lines on a real change', () => { + const d = snapshotDelta(TREE_A, TREE_B); + if (d.mode !== SnapshotDeltaMode.DELTA) throw new Error('expected delta'); + expect(d.delta.added).toEqual(['- alert "Saved!"']); // normalize strips refs, keeps the line + expect(d.delta.removed).toEqual([]); + expect(d.delta.addedCount).toBe(1); + }); +}); + +describe('SnapshotCache (route-invalidated)', () => { + it('recalls the last tree only when the route matches', () => { + const c = new SnapshotCache(); + c.remember('k', '/a', TREE_A); + expect(c.recall('k', '/a')).toBe(TREE_A); + expect(c.recall('k', '/b')).toBeUndefined(); // route changed → invalidated + }); + + it('evicts the oldest entry past the cap', () => { + const c = new SnapshotCache(1); + c.remember('k1', '/a', 'x'); + c.remember('k2', '/a', 'y'); + expect(c.recall('k1', '/a')).toBeUndefined(); + expect(c.recall('k2', '/a')).toBe('y'); + }); +}); + +describe('applySnapshotDelta', () => { + const raw = (tree: string, route = '/'): unknown => ({ + tree, + status: { route, title: 'T' }, + nodes: 2, + }); + const opts = (diff: boolean) => ({ sessionId: 's', scope: '', mode: 'full', diff }); + + it('passes the full snapshot through when diff is off (but caches it)', () => { + const c = new SnapshotCache(); + const out = applySnapshotDelta(raw(TREE_A), opts(false), c) as { tree?: string }; + expect(out.tree).toBe(TREE_A); + expect(c.recall(snapshotCacheKey('s', '', 'full'), '/')).toBe(TREE_A); + }); + + it('first diff call 
returns full (no prior), second returns only the delta', () => { + const c = new SnapshotCache(); + const first = applySnapshotDelta(raw(TREE_A), opts(true), c) as { tree?: string }; + expect(first.tree).toBe(TREE_A); // first look → full + const second = applySnapshotDelta(raw(TREE_B), opts(true), c) as { + mode?: string; + delta?: { added: string[] }; + tree?: string; + }; + expect(second.mode).toBe(SnapshotDeltaMode.DELTA); + expect(second.tree).toBeUndefined(); // no full tree on a delta → tokens saved + expect(second.delta?.added).toEqual(['- alert "Saved!"']); + }); + + it('returns unchanged (no tree, no delta) when nothing changed', () => { + const c = new SnapshotCache(); + applySnapshotDelta(raw(TREE_A), opts(true), c); + const again = applySnapshotDelta(raw(TREE_A), opts(true), c) as { + mode?: string; + tree?: string; + }; + expect(again.mode).toBe(SnapshotDeltaMode.UNCHANGED); + expect(again.tree).toBeUndefined(); + }); + + it('a route change yields full again (never a cross-page delta)', () => { + const c = new SnapshotCache(); + applySnapshotDelta(raw(TREE_A, '/a'), opts(true), c); + const onB = applySnapshotDelta(raw(TREE_B, '/b'), opts(true), c) as { tree?: string }; + expect(onB.tree).toBe(TREE_B); // different route → full, not a delta + }); + + it('passes an error envelope through untouched', () => { + const c = new SnapshotCache(); + expect(applySnapshotDelta({ error: 'no session' }, opts(true), c)).toEqual({ + error: 'no session', + }); + }); +}); diff --git a/packages/server/src/tools/snapshot-delta.ts b/packages/server/src/tools/snapshot-delta.ts new file mode 100644 index 0000000000000000000000000000000000000000..faa7ddc507ae38a991a7fda73e959b08891ea7e7 GIT binary patch literal 4751 zcma)9VQ(8Z61~s<6_fT5Nv^cYmn#l(ox+Hno>AKgVh26IC@OlFlD1hb_sCt#I!8eN zL;DN!OZsL=?n-vtrXNykNe*Y;yf-r(@9*!?K3!^itL@0;nI>1N$}LPynJ#KYpMLv; ztgahdk|~Qdtrn`LSN}P>_~GQ3TwbP{>V+mX*Ja&vR47+dVa$gFcG>h#i#7G<)-+|N 
zv%!R1YPBx;g7MoztyeamFY3|xtGC{SBbBDM(e&+$Prv>C?-xH(Z9ZT>nHjlS6}Tx) zwR8DRUgY(9RKprylPUOW5Ckc5ylzTTWyWW0O$${NO`4ag&dWL03!T=+I(Uv?N&|<> zIgaYP+|>zP#dtRoKI+CXRUTkQBz*yFO@m!mq3gc^Rm{qR0aKXNtId` zMrDH`IRhihs*(G7n+JEHtrmpgDqrA%<|_qjl>xJ{)k_4HQIVJ0$?hz-2mr}Qo7%e+ zKw`^F1KZ>(UBa!Si(?q&^Rkcc@F>ojq7d*r2AR!y%N9=#uG(p_a6j z94*{m=B@x~gjsBU#O6IC7=`iPUcRghXnWrwe#Xfkic|6@ZD?lAl6pxpuB`bLd>F@E zxa3#YJJ{RPcVZ)V<&sa937vWT^-?r5LCXrzX)>2 z*QN~l#L9-01FK>DAYN6kEB)D<`n&=OKk{O5{Xws>@YMRxT^yn;DDuG#5``!Q#ih1D z^%{pL&_3J!DEnTa7}XOho8^tRTU#G|-?RuySOVN}B<3M*_=p6t+9pqWpD?<#`Lew& z&R=>!Qgz^oGueffHo$GXzfUZuiRcL#7bf01e}eEwn@*ZT|D~w2videR4P=4RN_`Ql zNNpemwWT^QC(PDBvl)!lTy}R9+h=oE*c$Ty)VLMQz~^izqS;6gc-dRN(__m?9@020Qe@2cmB~+~N_|Leq(E zZ`dq21xZ`&qH|S! z=WE0elo3`(ULFYF<1gC0>`P*|t=x4r184{iw&6jdzf|oSZu$&1196EUu-qkZ=nmWP zFQ*oe60uazcywY_(G0DKcM@tNoxT=XinAziIhx5*)o5()pc4YdGRRlmIKV?V?RSS- zEogB;Owl&`uoqlczH&pxYV^j{B?&_;5XjQ6+O-vqiy*uMnF0ZO^uq%2M#D z^RBUs{a@dQ?-Kc>3#r8hRMcFl|HAX(PVx}Pt;w_CIZLzi*I|00mt6#R?AWtzeW9Gh z6B8E8hAA?Q^mGTS1Yx`WAmKuQ|AD2AW}o>#zB8QIL1D0k92%^yb=Vg0<4)0$q)MDe zw=qP3XoLj7_Cn2fgr!b-7R& zI8ZUvs%<;)T1Im(qVO`Ac!5mW(v;_TtN+&^p%@zZm4J~F^XL5>x0qof@09r9E8plJ}0s^oHN9PN;%cYQwLU2m>B& z3w6sB3YCxZ#1Dp&BnPNXx4Hn`B?0mlk`y44+B7J_86Xt7hXK}J_t-@5XCIKa+WUL} zQt=Q&Jko{6nZyLfLng{W0O`8tKf^Wy(}KlA#?~4G$)L$uqVK7{z4|m*gK%I%_h|Gj z$lM^f?0F}=2iHAON^~RyiK`wi8Wfqo|IEaX_Dg>K1qn+T1~m&@JW6euKJ*Yk)cRQMIG#3mVR$mzJ&HKNBicw_C$ zwF^|FCX?irKg|BPH{k#!r#qrM$5$C&AF}65&$csU1S-_8_-c84TYCacLMS-dp_cH1 Wd(__=@}p=RL)$>5U_NPkv-c - commandOrThrow(deps, asString(args['sessionId']), IrisCommand.SNAPSHOT, { + handler: (deps, args) => { + const sessionId = asString(args['sessionId']); + const mode = asString(args['mode']) ?? SnapshotMode.FULL; + return commandOrThrow(deps, sessionId, IrisCommand.SNAPSHOT, { scope: args['scope'], - mode: args['mode'] ?? SnapshotMode.FULL, - }).then(withSizeCost), + mode, + }).then((raw) => + withSizeCost( + applySnapshotDelta( + raw, + { + sessionId: sessionId ?? 'default', + scope: asString(args['scope']) ?? 
'', + mode, + diff: args['diff'] === true, + }, + SNAPSHOT_CACHE, + ), + ), + ); + }, }, { name: IrisTool.QUERY, From 346f5c29f838d048d0680ddb021f5e8ebc38a836 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 05:18:06 +0530 Subject: [PATCH 10/33] =?UTF-8?q?test(bench):=20M0=20=E2=80=94=20measure?= =?UTF-8?q?=20the=20M2=20token=20wins=20on=20real=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A reproducible, regression-guarding benchmark (runs the shipped sizeCost / estimateTokens / applySnapshotDelta, not mocks) over a representative 150-row operational dashboard — the shape that makes full snapshots expensive. Measured: full re-snapshot = 4,246 tok (16,983 bytes) diff (1 row changed) = 60 tok → 99% saved unchanged re-snapshot = 17 tok Asserts the diff is <10% of a full re-read and ≥90% saved, so the M2 win can't silently regress. This is the data behind the "token efficient" claim — the absolute figure uses the ~chars/4 heuristic, the relative saving is robust. V1-ROADMAP M0 (first numbers; real-app capture is the follow-up). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/tools/snapshot-cost.test.ts | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 packages/server/src/tools/snapshot-cost.test.ts diff --git a/packages/server/src/tools/snapshot-cost.test.ts b/packages/server/src/tools/snapshot-cost.test.ts new file mode 100644 index 0000000..f92139e --- /dev/null +++ b/packages/server/src/tools/snapshot-cost.test.ts @@ -0,0 +1,92 @@ +import { describe, expect, it } from 'vitest'; +import { sizeCost, estimateTokens } from '../session/output-budget.js'; +import { applySnapshotDelta, SnapshotCache } from './snapshot-delta.js'; + +/** + * M0 — a reproducible measurement of the M2 token wins, run over the REAL shipped functions + * (sizeCost / estimateTokens / applySnapshotDelta), not a mock. 
It quantifies the claim "diffed + * snapshots and re-scoping save the tokens that drive agent context blow-up + selector + * hallucination". The absolute token figure uses the ~chars/4 heuristic (estimateTokens); the + * RELATIVE savings (diff vs full) is what matters and is robust to the heuristic. Numbers are + * asserted as regression guards AND printed (see the test output) so the roadmap can quote them. + * + * Representative page: a 150-row operational dashboard (orders table) — the exact shape that makes + * full snapshots expensive and where an agent typically changes one row at a time. + */ + +const ROWS = 150; + +function dashboardTree(rows: number, mutatedRow = -1): string { + const lines = ['- main "Orders" (ref=e1)', ' - table "Orders" (ref=e2)']; + for (let i = 0; i < rows; i += 1) { + const status = i === mutatedRow ? 'Shipped' : 'Pending'; + lines.push( + ` - row "Order #${String(1000 + i)} — Acme Corp — $${String(i * 7 + 19)}.00 — ${status}" (ref=e${String(i + 10)})`, + ); + lines.push(` - button "View #${String(1000 + i)}" (ref=e${String(i + 2000)})`); + } + return lines.join('\n'); +} + +function snap(tree: string, route = '/orders'): unknown { + return { tree, status: { route, title: 'Orders' }, nodes: ROWS * 2 + 2 }; +} + +function tokensOf(result: unknown): number { + return sizeCost(result).tokens; +} + +describe('M0 — snapshot token cost (M2 wins, measured on real functions)', () => { + it('a diff after a one-row change costs a tiny fraction of a full re-snapshot', () => { + const cache = new SnapshotCache(); + const full = snap(dashboardTree(ROWS)); + // First look → full (this is what the agent pays once). + applySnapshotDelta(full, { sessionId: 's', scope: '', mode: 'full', diff: true }, cache); + const fullTokens = tokensOf(full); + + // One row flips Pending → Shipped; agent asks for the diff. 
+ const changed = snap(dashboardTree(ROWS, 42)); + const delta = applySnapshotDelta( + changed, + { sessionId: 's', scope: '', mode: 'full', diff: true }, + cache, + ); + const deltaTokens = tokensOf(delta); + + const savedPct = Math.round((1 - deltaTokens / fullTokens) * 100); + // eslint-disable-next-line no-console + console.log( + `[M0] full re-snapshot=${String(fullTokens)} tok diff=${String(deltaTokens)} tok saved=${String(savedPct)}%`, + ); + + expect(fullTokens).toBeGreaterThan(2000); // a 150-row dashboard is genuinely expensive + expect(deltaTokens).toBeLessThan(fullTokens * 0.1); // diff is <10% of a full re-read + expect(savedPct).toBeGreaterThanOrEqual(90); + }); + + it('an unchanged re-snapshot collapses to near-zero tokens', () => { + const cache = new SnapshotCache(); + const full = snap(dashboardTree(ROWS)); + applySnapshotDelta(full, { sessionId: 's', scope: '', mode: 'full', diff: true }, cache); + const unchanged = applySnapshotDelta( + snap(dashboardTree(ROWS)), + { sessionId: 's', scope: '', mode: 'full', diff: true }, + cache, + ); + const unchangedTokens = tokensOf(unchanged); + // eslint-disable-next-line no-console + console.log(`[M0] unchanged re-snapshot=${String(unchangedTokens)} tok`); + expect(unchangedTokens).toBeLessThan(50); + }); + + it('cost preview reports the full size up front so the agent can bail before reading', () => { + const full = snap(dashboardTree(ROWS)); + const preview = sizeCost(full); + // eslint-disable-next-line no-console + console.log( + `[M0] cost preview: ${String(preview.tokens)} tok / ${String(preview.bytes)} bytes`, + ); + expect(preview.tokens).toBe(estimateTokens(JSON.stringify(full))); + expect(preview.tokens).toBeGreaterThan(2000); + }); +}); From 414ca24702715ff833a71723eeabb1cbc908cc1f Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 05:44:40 +0530 Subject: [PATCH 11/33] =?UTF-8?q?feat(domain):=20iris=5Fdomain=20=E2=80=94?= 
=?UTF-8?q?=20learn=20the=20app's=20flows=20+=20untested=20intent=20before?= =?UTF-8?q?=20testing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New tool + pure builder that synthesizes every saved flow and the registered capabilities into a compact domain model an agent reads BEFORE driving the app: each flow with its assertion grade and the signals/testids it exercises, coverage counts, and — the differentiator — GAPS: declared signals/testids that NO flow asserts (untested intent), plus flows that assert no consequence. A one-line summary headlines it. Why: automation "checks the DOM, not the intent" — tests pass while integrated bugs ship (Bolton; Fowler). Pointing the agent straight at "you declared signal order:placed but no flow asserts it" is domain understanding + rigor in one read, instead of crawling the whole app or reading all the source. Reuses the flow assertion classifier so the two reinforce each other. Pure buildDomainModel (no IO) + a thin iris_domain tool reading .iris/flows/ + .iris/contract.json (no browser). V1-ROADMAP M4 first slice. 5 domain-model tests; 511 server tests; all 16 tasks green uncached. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/domain/domain-model.test.ts | 75 +++++++++ packages/server/src/domain/domain-model.ts | 151 ++++++++++++++++++ packages/server/src/domain/domain-tools.ts | 61 +++++++ packages/server/src/index.ts | 2 + packages/server/src/tools/tool-names.ts | 1 + packages/server/src/tools/tools.ts | 2 + 6 files changed, 292 insertions(+) create mode 100644 packages/server/src/domain/domain-model.test.ts create mode 100644 packages/server/src/domain/domain-model.ts create mode 100644 packages/server/src/domain/domain-tools.ts diff --git a/packages/server/src/domain/domain-model.test.ts b/packages/server/src/domain/domain-model.test.ts new file mode 100644 index 0000000..fd5a505 --- /dev/null +++ b/packages/server/src/domain/domain-model.test.ts @@ -0,0 +1,75 @@ +import { describe, expect, it } from 'vitest'; +import { + FLOW_FILE_VERSION, + AnchorKind, + type CapabilitiesContract, + type FlowExpect, + type FlowFile, + type FlowStep, +} from '@syrin/iris-protocol'; +import { IrisTool } from '../tools/tool-names.js'; +import { buildDomainModel } from './domain-model.js'; + +function testidStep(value: string): FlowStep { + return { tool: IrisTool.ACT, anchor: { kind: AnchorKind.TESTID, value } }; +} + +function flow(name: string, steps: FlowStep[], success?: FlowExpect): FlowFile { + const f: FlowFile = { version: FLOW_FILE_VERSION, name, createdAt: 0, steps }; + if (success !== undefined) f.success = success; + return f; +} + +const contract = (over: Partial = {}): CapabilitiesContract => ({ + testids: over.testids ?? [], + signals: over.signals ?? [], + stores: over.stores ?? [], + flows: over.flows ?? 
[], +}); + +describe('buildDomainModel', () => { + it('summarizes each flow with its assertion grade + anchors used', () => { + const m = buildDomainModel( + [flow('checkout', [testidStep('pay')], { signal: 'order:placed' })], + contract(), + ); + expect(m.flowCount).toBe(1); + expect(m.flows[0]?.name).toBe('checkout'); + expect(m.flows[0]?.grade).toBe('asserted'); + expect(m.flows[0]?.signals).toEqual(['order:placed']); + expect(m.flows[0]?.testids).toEqual(['pay']); + expect(m.coverage.asserted).toBe(1); + }); + + it('flags declared signals that NO flow asserts (untested intent — the differentiator)', () => { + const m = buildDomainModel( + [flow('checkout', [testidStep('pay')], { signal: 'order:placed' })], + contract({ signals: ['order:placed', 'refund:issued'], testids: ['pay', 'refund-btn'] }), + ); + expect(m.gaps.declaredUntestedSignals).toEqual(['refund:issued']); + expect(m.gaps.declaredUntestedTestids).toEqual(['refund-btn']); + expect(m.summary).toContain('refund:issued'); + }); + + it('lists unasserted flows as a gap', () => { + const m = buildDomainModel( + [flow('browse', [testidStep('nav')]), flow('buy', [testidStep('pay')], { signal: 'done' })], + contract(), + ); + expect(m.gaps.unassertedFlows).toEqual(['browse']); + expect(m.coverage.assertionFree).toBe(1); + expect(m.coverage.asserted).toBe(1); + }); + + it('handles no contract (null) without crashing', () => { + const m = buildDomainModel([flow('f', [testidStep('a')], { signal: 's' })], null); + expect(m.declared.signals).toEqual([]); + expect(m.gaps.declaredUntestedSignals).toEqual([]); + }); + + it('gives an actionable summary when there are no flows', () => { + const m = buildDomainModel([], contract({ signals: ['x'] })); + expect(m.flowCount).toBe(0); + expect(m.summary).toContain('No saved flows'); + }); +}); diff --git a/packages/server/src/domain/domain-model.ts b/packages/server/src/domain/domain-model.ts new file mode 100644 index 0000000..9fcccc3 --- /dev/null +++ 
b/packages/server/src/domain/domain-model.ts @@ -0,0 +1,151 @@ +/** + * Build a compact "domain model" of an app's testable surface for an agent to read BEFORE testing: + * the demonstrated flows, what each one actually asserts, and — the differentiator — the GAPS + * between declared intent (capabilities the app registered: signals/testids) and what any flow + * actually verifies. + * + * Why (grounded): automation "checks the DOM, not the intent" — tests pass while real, + * integrated-system bugs ship (Bolton; Fowler Assertion-Free Testing). An agent that can see "you + * declared the signal order:placed but no flow asserts it" knows where the real risk is, instead of + * re-deriving the app's flows by reading all the source. Pairs with the flow assertion grades + * (flow-classify) so "rigorous" and "domain-aware" reinforce each other. + * + * Pure: no IO, no clock. The tool layer loads flows + contract and calls this. + */ + +import { + AnchorKind, + type CapabilitiesContract, + type FlowFile, + type FlowStep, +} from '@syrin/iris-protocol'; +import { classifyFlowAssertions, FlowAssertionGrade } from '../flows/flow-classify.js'; + +export interface DomainFlowSummary { + name: string; + steps: number; + grade: string; + /** True when the flow asserts a real consequence (signal/net), not just presence. */ + asserts: boolean; + warning?: string; + signals: string[]; + testids: string[]; +} + +export interface DomainGaps { + /** Flows that act but don't assert a consequence (grade !== asserted). */ + unassertedFlows: string[]; + /** Signals the app declared in capabilities that NO flow asserts — untested intent. */ + declaredUntestedSignals: string[]; + /** Testids the app declared that no saved flow exercises. 
*/ + declaredUntestedTestids: string[]; +} + +export interface DomainModel { + flowCount: number; + flows: DomainFlowSummary[]; + declared: { testids: number; signals: string[]; stores: string[] }; + coverage: { asserted: number; presenceOnly: number; assertionFree: number }; + gaps: DomainGaps; + /** One-line headline an agent (or human) can read at a glance. */ + summary: string; +} + +function flatten(steps: readonly FlowStep[]): FlowStep[] { + const out: FlowStep[] = []; + for (const s of steps) { + out.push(s); + if (s.steps !== undefined) out.push(...flatten(s.steps)); + } + return out; +} + +function flowSignals(flow: FlowFile): string[] { + const set = new Set(); + for (const step of flatten(flow.steps)) { + if (step.anchor.kind === AnchorKind.SIGNAL) set.add(step.anchor.name); + if (step.expect?.signal !== undefined) set.add(step.expect.signal); + } + if (flow.success?.signal !== undefined) set.add(flow.success.signal); + return [...set]; +} + +function flowTestids(flow: FlowFile): string[] { + const set = new Set(); + for (const step of flatten(flow.steps)) { + if (step.anchor.kind === AnchorKind.TESTID) set.add(step.anchor.value); + if (step.expect?.element?.testid !== undefined) set.add(step.expect.element.testid); + } + if (flow.success?.element?.testid !== undefined) set.add(flow.success.element.testid); + return [...set]; +} + +const EMPTY_CONTRACT: CapabilitiesContract = { testids: [], signals: [], stores: [], flows: [] }; + +export function buildDomainModel( + flows: readonly FlowFile[], + contract: CapabilitiesContract | null, +): DomainModel { + const caps = contract ?? 
EMPTY_CONTRACT; + + const flowSummaries: DomainFlowSummary[] = flows.map((flow) => { + const c = classifyFlowAssertions(flow); + const summary: DomainFlowSummary = { + name: flow.name, + steps: c.totalSteps, + grade: c.grade, + asserts: c.hasConsequenceAssertion, + signals: flowSignals(flow), + testids: flowTestids(flow), + }; + if (c.warning !== undefined) summary.warning = c.warning; + return summary; + }); + + const testedSignals = new Set(flowSummaries.flatMap((f) => f.signals)); + const testedTestids = new Set(flowSummaries.flatMap((f) => f.testids)); + + const coverage = { + asserted: flowSummaries.filter((f) => f.grade === FlowAssertionGrade.ASSERTED).length, + presenceOnly: flowSummaries.filter((f) => f.grade === FlowAssertionGrade.PRESENCE_ONLY).length, + assertionFree: flowSummaries.filter((f) => f.grade === FlowAssertionGrade.ASSERTION_FREE) + .length, + }; + + const gaps: DomainGaps = { + unassertedFlows: flowSummaries.filter((f) => !f.asserts).map((f) => f.name), + declaredUntestedSignals: caps.signals.filter((s) => !testedSignals.has(s)), + declaredUntestedTestids: caps.testids.filter((t) => !testedTestids.has(t)), + }; + + return { + flowCount: flows.length, + flows: flowSummaries, + declared: { testids: caps.testids.length, signals: caps.signals, stores: caps.stores }, + coverage, + gaps, + summary: buildSummary(flows.length, coverage, gaps), + }; +} + +function buildSummary( + flowCount: number, + coverage: DomainModel['coverage'], + gaps: DomainGaps, +): string { + if (flowCount === 0) { + return 'No saved flows yet — record the critical journeys (iris_record_start) so the agent learns the app.'; + } + const parts = [ + `${String(flowCount)} flow${flowCount === 1 ? 
'' : 's'}: ${String(coverage.asserted)} asserted, ${String(coverage.presenceOnly)} presence-only, ${String(coverage.assertionFree)} assertion-free`, + ]; + if (gaps.declaredUntestedSignals.length > 0) { + parts.push( + `${String(gaps.declaredUntestedSignals.length)} declared signal(s) no flow asserts (${gaps.declaredUntestedSignals.join(', ')})`, + ); + } + if (gaps.unassertedFlows.length > 0) { + parts.push(`${String(gaps.unassertedFlows.length)} flow(s) assert no consequence`); + } + return parts.join('. ') + '.'; +} diff --git a/packages/server/src/domain/domain-tools.ts b/packages/server/src/domain/domain-tools.ts new file mode 100644 index 0000000..18eeaaf --- /dev/null +++ b/packages/server/src/domain/domain-tools.ts @@ -0,0 +1,61 @@ +import { z } from 'zod'; +import type { FlowFile } from '@syrin/iris-protocol'; +import { IrisTool } from '../tools/tool-names.js'; +import { readContract } from '../project/iris-dir.js'; +import { buildDomainModel } from './domain-model.js'; +import type { ToolDef, ToolDeps } from '../tools/tools.js'; + +/** + * iris_domain — the "learn the app before testing it" tool. Synthesizes every saved flow + the + * registered capabilities into a compact domain model: the journeys, what each asserts, and the + * GAPS (declared signals/testids no flow verifies). An agent reads this once instead of crawling + * the whole app or reading all the source — and it points straight at untested intent. + */ +export const DOMAIN_TOOLS: ToolDef[] = [ + { + name: IrisTool.DOMAIN, + description: + 'Read the app domain model BEFORE testing: every saved flow with its assertion grade + the anchors/signals it exercises, plus GAPS — declared signals/testids that NO flow asserts (untested intent), and flows that assert no observable consequence. Use this to decide what to test and where the real risk is, instead of crawling the whole app. 
Reads .iris/flows/ + .iris/contract.json (no browser needed).', + inputSchema: {}, + outputSchema: { + flowCount: z.number(), + flows: z.array( + z.object({ + name: z.string(), + steps: z.number(), + grade: z.string(), + asserts: z.boolean(), + warning: z.string().optional(), + signals: z.array(z.string()), + testids: z.array(z.string()), + }), + ), + declared: z.object({ + testids: z.number(), + signals: z.array(z.string()), + stores: z.array(z.string()), + }), + coverage: z.object({ + asserted: z.number(), + presenceOnly: z.number(), + assertionFree: z.number(), + }), + gaps: z.object({ + unassertedFlows: z.array(z.string()), + declaredUntestedSignals: z.array(z.string()), + declaredUntestedTestids: z.array(z.string()), + }), + summary: z.string(), + }, + handler: async (deps: ToolDeps) => { + const names = await deps.flows.list(); + const flows: FlowFile[] = []; + for (const name of names) { + const loaded = await deps.flows.load(name); + if (loaded.ok) flows.push(loaded.value); + } + const contract = await readContract(deps.fs, deps.irisRoot); + return buildDomainModel(flows, contract.ok ? 
contract.capabilities : null); + }, + }, +]; diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index a11d31d..1f57c77 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -41,6 +41,8 @@ export { } from './flows/flow-success.js'; export { classifyFlowAssertions, FlowAssertionGrade } from './flows/flow-classify.js'; export type { FlowAssertionClassification } from './flows/flow-classify.js'; +export { buildDomainModel } from './domain/domain-model.js'; +export type { DomainModel, DomainFlowSummary, DomainGaps } from './domain/domain-model.js'; export { ProjectStore } from './project/project-store.js'; export type { ReadProjectResult } from './project/project-store.js'; export { VisualStore } from './visual/visual-store.js'; diff --git a/packages/server/src/tools/tool-names.ts b/packages/server/src/tools/tool-names.ts index 2ceadc7..aca58af 100644 --- a/packages/server/src/tools/tool-names.ts +++ b/packages/server/src/tools/tool-names.ts @@ -25,6 +25,7 @@ export const IrisTool = { STATE: 'iris_state', CAPABILITIES: 'iris_capabilities', CONTRACT_SAVE: 'iris_contract_save', + DOMAIN: 'iris_domain', FLOW_SAVE: 'iris_flow_save', FLOW_LIST: 'iris_flow_list', FLOW_LOAD: 'iris_flow_load', diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 8940938..675387b 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -36,6 +36,7 @@ import type { FileSystemPort } from '../project/fs-port.js'; import type { FlowStore } from '../flows/flows.js'; import type { ProjectStore } from '../project/project-store.js'; import { CONTRACT_TOOLS } from './contract-tools.js'; +import { DOMAIN_TOOLS } from '../domain/domain-tools.js'; import { BROWSER_TOOLS } from './browser-tools.js'; import { FLOW_TOOLS } from '../flows/flow-tools.js'; import { PROJECT_TOOLS } from '../project/project-tools.js'; @@ -1216,6 +1217,7 @@ export const TOOLS: ToolDef[] = [ }, // iris_capabilities 
(live | fromDisk) + iris_contract_save. See contract-tools.ts. ...CONTRACT_TOOLS, + ...DOMAIN_TOOLS, // iris_flow_save / iris_flow_list / iris_flow_load. See flow-tools.ts. ...FLOW_TOOLS, // iris_project (read history + diff-vs-last) / iris_run_record. See project-tools.ts. From 1d91fea8462868ce3794a6ce5cbb34701c073d7e Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 06:09:03 +0530 Subject: [PATCH 12/33] feat(heal): never auto-heal an ambiguous drift (heals the locator, never the intent) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two present testids tie at the minimum edit distance, the prior nearest-pick was an arbitrary lexical tiebreak — exactly how a self-healing rebind lands on the WRONG element and ships a bug green (the failure mabl/qate.ai admit). Drift now carries `ambiguous:true` in that case, and the heal proposal layer refuses to auto-rebind an ambiguous drift: it still surfaces the drift + a nearest candidate for a human/agent to choose, but never applies a coin-flip. - protocol Drift gains optional `ambiguous` - flow-replay: pure `nearestIsAmbiguous` (≥2 candidates at min distance) sets it - heal.proposeRebindWith refuses when drift.ambiguous (regardless of confidence) Pairs with the M1 replay success oracle (which catches a wrong rebind at run time): this stops the wrong rebind from ever being proposed. V1-ROADMAP M5. 6 new tests; 517 server tests; all 16 tasks green uncached. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/protocol/src/types.ts | 6 ++++ packages/server/src/flows/flow-replay.test.ts | 23 ++++++++++++++- packages/server/src/flows/flow-replay.ts | 28 +++++++++++++++++-- .../server/src/flows/heal-proposal.test.ts | 24 ++++++++++++++++ packages/server/src/flows/heal.ts | 4 +++ 5 files changed, 82 insertions(+), 3 deletions(-) diff --git a/packages/protocol/src/types.ts b/packages/protocol/src/types.ts index cfc3598..0d11122 100644 --- a/packages/protocol/src/types.ts +++ b/packages/protocol/src/types.ts @@ -224,6 +224,12 @@ export interface Drift { anchor: string; /** Closest present testid via the live near-miss; null only when the page has no testids (or signal drift). */ nearest: string | null; + /** + * True when two or more present testids tie at the minimum edit distance, so `nearest` is an + * arbitrary pick. An ambiguous drift is NEVER auto-healed (a wrong rebind ships a bug green) — + * it is surfaced for a human/agent to choose. Absent ⇒ unambiguous. + */ + ambiguous?: boolean; } /** The per-step result of re-resolving + running one anchored step. 
*/ diff --git a/packages/server/src/flows/flow-replay.test.ts b/packages/server/src/flows/flow-replay.test.ts index 105c585..e937680 100644 --- a/packages/server/src/flows/flow-replay.test.ts +++ b/packages/server/src/flows/flow-replay.test.ts @@ -14,7 +14,12 @@ import { type IrisEvent, type QueryEmptyHint, } from '@syrin/iris-protocol'; -import { nearestTestid, replayFlow, type FlowReplaySession } from './flow-replay.js'; +import { + nearestTestid, + nearestIsAmbiguous, + replayFlow, + type FlowReplaySession, +} from './flow-replay.js'; import { waitForPredicate, type Predicate } from '../events/predicate.js'; import { asRecord, asString } from '../tools/tools-helpers.js'; import { IrisTool } from '../tools/tool-names.js'; @@ -227,3 +232,19 @@ describe('nearestTestid — closest surviving anchor', () => { expect(nearestTestid('Send', ['send-x', 'send'])).toBe('send'); }); }); + +describe('nearestIsAmbiguous — refuse to auto-heal a coin-flip', () => { + it('is true when two candidates tie at the minimum distance', () => { + // both differ from "submit-bt" by one edit → tie → arbitrary pick → ambiguous + expect(nearestIsAmbiguous('submit-bt', ['submit-btn', 'submit-bts'])).toBe(true); + }); + + it('is false when one candidate is strictly closest', () => { + expect(nearestIsAmbiguous('chat-send', ['chat-send-x', 'sidebar'])).toBe(false); + }); + + it('is false with fewer than two candidates', () => { + expect(nearestIsAmbiguous('x', [])).toBe(false); + expect(nearestIsAmbiguous('x', ['y'])).toBe(false); + }); +}); diff --git a/packages/server/src/flows/flow-replay.ts b/packages/server/src/flows/flow-replay.ts index 20b2daa..8c2b21f 100644 --- a/packages/server/src/flows/flow-replay.ts +++ b/packages/server/src/flows/flow-replay.ts @@ -108,14 +108,38 @@ function readQuery(result: CommandResult): { refs: string[]; hint?: QueryEmptyHi return { refs }; } +/** + * True when ≥2 present testids tie at the minimum edit distance to the missing one — `nearest` is + * then an 
arbitrary lexical-tiebreak pick, so auto-healing would be a coin-flip between candidates. + * Such a drift is surfaced (with a nearest) but never auto-healed. + */ +export function nearestIsAmbiguous(missing: string, present: string[]): boolean { + if (present.length < 2) return false; + let min = Number.POSITIVE_INFINITY; + let count = 0; + for (const candidate of present) { + const distance = editDistance(missing, candidate); + if (distance < min) { + min = distance; + count = 1; + } else if (distance === min) { + count += 1; + } + } + return count >= 2; +} + /** Build the legible-drift record for a testid anchor that resolved to zero live elements. */ function testidDrift(value: string, hint: QueryEmptyHint | undefined): Drift { - return { + const present = hint?.presentTestids ?? []; + const drift: Drift = { reasonKind: DriftReason.TESTID_NOT_FOUND, reason: `testid "${value}" not found`, anchor: value, - nearest: nearestTestid(value, hint?.presentTestids ?? []), + nearest: nearestTestid(value, present), }; + if (nearestIsAmbiguous(value, present)) drift.ambiguous = true; + return drift; } /** The testid value of a step's primary anchor, for labelling the result row. */ diff --git a/packages/server/src/flows/heal-proposal.test.ts b/packages/server/src/flows/heal-proposal.test.ts index 7e7d23f..dd596a0 100644 --- a/packages/server/src/flows/heal-proposal.test.ts +++ b/packages/server/src/flows/heal-proposal.test.ts @@ -32,6 +32,30 @@ function signalDrift(name: string): Drift { }; } +function ambiguousDrift(from: string, nearest: string): Drift { + return { ...testidDrift(from, nearest), ambiguous: true }; +} + +describe('ambiguous drift is never auto-healed (heals the locator, never the intent)', () => { + it('refuses a proposal when the drift is ambiguous, even with a high-confidence nearest', () => { + // nearest is one edit away (would normally clear the floor), but the tie makes it unsafe. 
+ expect(proposeRebind(ambiguousDrift('submit-bt', 'submit-btn'), 0)).toBeUndefined(); + }); + + it('still proposes for an unambiguous high-confidence drift', () => { + expect(proposeRebind(testidDrift('submit-bt', 'submit-btn'), 0)).toBeDefined(); + }); + + it('collectProposals skips ambiguous drifts', () => { + const steps: FlowStepResult[] = [ + { step: 0, tool: IrisTool.ACT, anchor: 'a', ok: false, drift: ambiguousDrift('a', 'ab') }, + { step: 1, tool: IrisTool.ACT, anchor: 'c', ok: false, drift: testidDrift('c', 'cd') }, + ]; + const proposals = collectProposals(steps); + expect(proposals.map((p) => p.step)).toEqual([1]); // only the unambiguous one + }); +}); + describe('confidenceFor — normalized edit-distance confidence', () => { it('confidence is higher for a smaller edit distance', () => { expect(confidenceFor('chat-send', 'chat-sent')).toBeGreaterThan( diff --git a/packages/server/src/flows/heal.ts b/packages/server/src/flows/heal.ts index d741c91..e805120 100644 --- a/packages/server/src/flows/heal.ts +++ b/packages/server/src/flows/heal.ts @@ -44,6 +44,10 @@ function proposeRebindWith( minConfidence: number, ): HealProposal | undefined { if (drift.reasonKind !== DriftReason.TESTID_NOT_FOUND) return undefined; + // Never auto-heal an ambiguous drift: when two present testids tie at the minimum distance, the + // `nearest` pick is arbitrary, so a rebind would be a coin-flip that can land on the wrong element + // and ship a bug green. Surface it (the drift still carries `nearest`) and defer to a human. 
+ if (drift.ambiguous === true) return undefined; const to = drift.nearest; if (to === null) return undefined; const confidence = confidenceFor(drift.anchor, to); From 3c30edb9d5d4f23d64623e5e72b107f82d8ab6ee Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 06:32:25 +0530 Subject: [PATCH 13/33] docs: surface the new capabilities (iris_domain, snapshot diff/cost, grades, heal) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eight iterations shipped capabilities the human/agent docs didn't mention — features are invisible without docs. Surfaced across the three read surfaces: - token-efficiency.md: diffed snapshots section with the measured numbers (full 4,246 tok vs diff 60 tok, ~99% saved) + the cost-preview note. - agent-cheatsheet.md: "read iris_domain first" in Start-here, iris_domain in the core tool set, and a token note covering diff:true, cost, assertion grades, and ambiguous-heal refusal. - usage.md: iris_snapshot diff:true + cost documented; new iris_domain reference entry (flows + coverage + untested-intent gaps). Docs only; no code change. V1-ROADMAP M8 (discoverability slice). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/agent-cheatsheet.md | 20 +++++++++++++++----- docs/token-efficiency.md | 24 ++++++++++++++++++++++++ docs/usage.md | 23 +++++++++++++++++++++-- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/docs/agent-cheatsheet.md b/docs/agent-cheatsheet.md index 8fdac0b..6d78d3f 100644 --- a/docs/agent-cheatsheet.md +++ b/docs/agent-cheatsheet.md @@ -54,9 +54,9 @@ tree; a wrong `path` returns `{ found:false, availableKeys }` so it's self-corre Sessions/perception/verify — what you'll use 90% of the time: -`iris_sessions` · `iris_snapshot` · `iris_query` · `iris_act` · `iris_act_and_wait` · -`iris_observe` · `iris_wait_for` · `iris_assert` · `iris_state` · `iris_diff` · -`iris_capabilities` · `iris_narrate` (show intent on-page) · `iris_project` (run-history, see below). +`iris_sessions` · `iris_domain` (learn the app + gaps, read first) · `iris_snapshot` · `iris_query` · +`iris_act` · `iris_act_and_wait` · `iris_observe` · `iris_wait_for` · `iris_assert` · `iris_state` · +`iris_diff` · `iris_capabilities` · `iris_narrate` (show intent on-page) · `iris_project` (run-history). **Reach past core when…** you need to record/replay a journey (`iris_record_start/stop`, `iris_replay`), persist a self-healing golden flow (`iris_flow_save*` / `iris_flow_replay` / @@ -89,8 +89,10 @@ Both need a **driven browser** (`iris drive ` / `IRIS_CDP_URL`); without on ## Start here 1. `iris_sessions` — find the connected tab (omit `sessionId` if there's only one). -2. `iris_capabilities` — learn the app's testable surface (`testids`, `signals`, `stores`, `flows`) - so you assert on facts without reading source. (`iris_sessions` flags `hasCapabilities`.) +2. `iris_domain` — learn the app BEFORE testing: the saved flows, what each asserts, and the **gaps** + (declared signals/testids that no flow verifies — untested intent). Tells you what to test and + where the real risk is without crawling the whole app. 
Falls back to `iris_capabilities` for the + raw testable surface (`testids`, `signals`, `stores`, `flows`). 3. Run the loop: **look → act → observe → assert**, cross-checking the 4 layers on anything that matters. ## Token note @@ -98,6 +100,14 @@ Both need a **driven browser** (`iris drive ` / `IRIS_CDP_URL`); without on - **Keep the eyes cheap.** Prefer `iris_query` / scoped or `interactive` `iris_snapshot` / `iris_assert` over dumping the full tree. A full verify loop is ~100 tokens; see [token-efficiency.md](token-efficiency.md) (~73× leaner than full-tree snapshots). +- **Re-look with `iris_snapshot({ diff:true })`** after an action — it returns only what changed + (`mode:delta`/`unchanged`), ~99% fewer tokens than a full re-snapshot and no stale tree to + mis-read. Every snapshot/query result carries `cost:{ bytes, tokens }` — re-scope before reading + if it's large. +- **A saved flow tells you if it's a real test.** `iris_flow_save` returns `assertions.grade` + (`asserted` / `presence-only` / `assertion-free`); if it's not `asserted`, add a consequence + (`iris_annotate` assert-signal/assert-net or a success-state) so it can't pass while broken. On + replay, an ambiguous heal (two testids tie) is surfaced, never auto-applied. - **Predicate schema is not bloated.** The recursive predicate DSL used by `iris_assert` / `iris_wait_for` / `iris_act_and_wait` is **factored, not inlined**: when converted to the JSON Schema MCP sends, the predicate body is emitted **once** (~2.7k chars ≈ **~685 tokens** diff --git a/docs/token-efficiency.md b/docs/token-efficiency.md index 1c29e94..91d0a90 100644 --- a/docs/token-efficiency.md +++ b/docs/token-efficiency.md @@ -27,6 +27,30 @@ per-step snapshot (100 vs ~7,300 tokens). The bare a11y tree we measured directl Playwright MCP's actual payload adds a `[ref=…]` to every node, pushing it to ~7,300. On the complex pages Playwright's ecosystem cites (50k+), the gap widens to **~100–500×**. 
+## Diffed snapshots: pay once, then only for changes + +After the first snapshot, pass `iris_snapshot({ diff: true })` to get back **only what changed** +since your last look of the same scope/mode (`mode:delta` with added/removed lines, or +`mode:unchanged`). A route change auto-resets to a full snapshot, so you never read a misleading +cross-page diff. + +Measured on a representative 150-row dashboard (the shipped regression benchmark +`packages/server/src/tools/snapshot-cost.test.ts`, char/4 proxy): + +| Payload | Tokens | +| ---------------------------------- | --------: | +| Full re-snapshot (150-row table) | **4,246** | +| `diff:true` after a one-row change | **60** | +| `diff:true` when nothing changed | **17** | + +**~99% fewer tokens** to re-look after an action — and because a `delta` carries no stale full +tree, it also removes the 60–80K-token stale-context buildup that makes long-running agents start +hallucinating selectors that no longer exist. + +Every `iris_snapshot`/`iris_query` result also carries `cost:{ bytes, tokens }` (estimated) so you +can **re-scope before reading** a large body (`mode:interactive`/`status`, a tighter `scope`, or a +narrower `query`) instead of paying for it first. + ## The honest version - **Full-tree vs full-tree, the gap is modest (~1.8×):** Iris `full` (4,144) vs Playwright's diff --git a/docs/usage.md b/docs/usage.md index 18af75a..e5c7584 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -79,14 +79,22 @@ List connected tabs. → `{ sessions: [{ sessionId, url, title, lastSeenMs, hidd A semantic, accessibility-tree view of the page. - **args:** `mode?: 'full' | 'interactive' | 'status'` (default `full`), `scope?` (CSS - selector or ref), `sessionId?`. -- **returns:** `{ tree, status: { route, title, visibleDialogs }, nodes, truncated }`. + selector or ref), `diff?: boolean`, `sessionId?`. +- **returns:** `{ tree, status: { route, title, visibleDialogs }, nodes, truncated, cost: { bytes, tokens } }`. 
+- **`diff: true`** returns only what changed since your last snapshot of the same scope/mode — + `{ mode: 'delta', delta: { added, removed, addedCount, removedCount } }` or `{ mode: 'unchanged' }` + (no full tree). The first call (and any call after a route change) still returns the full tree. + ~99% fewer tokens to re-look after an action; see [token-efficiency.md](token-efficiency.md). +- **`cost`** is an estimated size of the result — re-scope (`mode`/`scope`) before reading if large. ```jsonc iris_snapshot({ mode: "interactive" }) // - tab "Overview" (ref=e2) // - button "Add item" (ref=e5) // status: { route: "/dashboard", visibleDialogs: [] } + +iris_snapshot({ diff: true }) // after an action — only the change set +// { mode: "delta", delta: { added: ['- alert "Saved!"'], removed: [], addedCount: 1, removedCount: 0 } } ``` ### `iris_query` @@ -250,6 +258,17 @@ what to assert on without reading source. `iris_sessions` also surfaces a `hasCapabilities` flag per session so you know when it's worth calling. Returns empty arrays (never errors) if the app advertised nothing. +### `iris_domain` + +Read the app's domain model **before testing**: a synthesis of every saved flow + the registered +capabilities. Tells you what to test and where the real risk is without crawling the app. Reads +`.iris/flows/` + `.iris/contract.json` — no browser needed. + +- `iris_domain({})` → `{ flowCount, flows: [{ name, steps, grade, asserts, signals, testids, warning? }], declared: { testids, signals, stores }, coverage: { asserted, presenceOnly, assertionFree }, gaps: { unassertedFlows, declaredUntestedSignals, declaredUntestedTestids }, summary }` +- **`gaps`** is the point: `declaredUntestedSignals` are intents the app emits that **no flow + asserts** (untested behavior); `unassertedFlows` act but verify no consequence. Close them with a + flow + a consequence assertion (`iris_annotate`). 
+ ### `iris_state` Read live framework/store state directly instead of inferring it from the DOM — [§17](#17-evidence-of-effect-actawait-state-capabilities-replay-m56). From eeb8ead3474ad94938c37d8f58dab33745e0b472 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 06:58:47 +0530 Subject: [PATCH 14/33] =?UTF-8?q?feat(domain):=20risk-rank=20flows=20in=20?= =?UTF-8?q?iris=5Fdomain=20(run=20history=20=C3=97=20assertion=20quality)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit iris_domain now folds .iris/project.json run history into the model and returns `riskRanked` (flow names worst-first) plus a per-flow `risk:{ level, reason, lastStatus }`. Risk is the WORSE of two signals: - run history: last run errored/drifted = high; passed-but-with-console/network errors = medium; passed clean = low; never run = unknown. - assertion quality: an assertion-free flow is medium EVEN when it passed — a green run of a flow that asserts nothing is false confidence. Test-priority order is high > medium > unknown (unvalidated) > low, so the agent tests the genuinely riskiest journeys first instead of re-running known-good ones. Synthesizes "domain-aware" + "rigorous": a flow that looks green but can't catch a regression surfaces as risk. Pure flow-risk.ts; iris_domain loads the history. V1-ROADMAP M4 (deepening). 13 new tests (flow-risk + domain); 528 server tests; all 16 tasks green uncached. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/usage.md | 5 +- .../server/src/domain/domain-model.test.ts | 31 +++++++ packages/server/src/domain/domain-model.ts | 18 ++++ packages/server/src/domain/domain-tools.ts | 9 +- packages/server/src/domain/flow-risk.test.ts | 63 +++++++++++++ packages/server/src/domain/flow-risk.ts | 92 +++++++++++++++++++ 6 files changed, 216 insertions(+), 2 deletions(-) create mode 100644 packages/server/src/domain/flow-risk.test.ts create mode 100644 packages/server/src/domain/flow-risk.ts diff --git a/docs/usage.md b/docs/usage.md index e5c7584..919fb54 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -264,10 +264,13 @@ Read the app's domain model **before testing**: a synthesis of every saved flow capabilities. Tells you what to test and where the real risk is without crawling the app. Reads `.iris/flows/` + `.iris/contract.json` — no browser needed. -- `iris_domain({})` → `{ flowCount, flows: [{ name, steps, grade, asserts, signals, testids, warning? }], declared: { testids, signals, stores }, coverage: { asserted, presenceOnly, assertionFree }, gaps: { unassertedFlows, declaredUntestedSignals, declaredUntestedTestids }, summary }` +- `iris_domain({})` → `{ flowCount, flows: [{ name, steps, grade, asserts, signals, testids, warning?, risk? }], declared: { testids, signals, stores }, coverage: { asserted, presenceOnly, assertionFree }, gaps: { unassertedFlows, declaredUntestedSignals, declaredUntestedTestids }, riskRanked, summary }` - **`gaps`** is the point: `declaredUntestedSignals` are intents the app emits that **no flow asserts** (untested behavior); `unassertedFlows` act but verify no consequence. Close them with a flow + a consequence assertion (`iris_annotate`). +- **`riskRanked`** orders flow names worst-first by combining run history (`.iris/project.json`: + recently failed/drifted, or passed-with-errors) with assertion quality (a green assertion-free + flow is still risky). 
**Test these first.** Each flow's `risk` carries `{ level, reason, lastStatus? }`. ### `iris_state` diff --git a/packages/server/src/domain/domain-model.test.ts b/packages/server/src/domain/domain-model.test.ts index fd5a505..f6412cd 100644 --- a/packages/server/src/domain/domain-model.test.ts +++ b/packages/server/src/domain/domain-model.test.ts @@ -72,4 +72,35 @@ describe('buildDomainModel', () => { expect(m.flowCount).toBe(0); expect(m.summary).toContain('No saved flows'); }); + + it('risk-ranks flows worst-first when run history is supplied', () => { + const flows = [ + flow('clean', [testidStep('a')], { signal: 's' }), // asserted + (will pass clean) + flow('broken', [testidStep('b')], { signal: 't' }), // asserted but last run errored + ]; + const runs = [ + { kind: 'flow_replay', name: 'clean', status: 'pass', at: 1 }, + { kind: 'flow_replay', name: 'broken', status: 'error', at: 2 }, + ] as Parameters<typeof buildDomainModel>[2]; + const m = buildDomainModel(flows, null, runs); + expect(m.riskRanked[0]).toBe('broken'); // failed run surfaces first + expect(m.flows.find((f) => f.name === 'broken')?.risk?.level).toBe('high'); + expect(m.flows.find((f) => f.name === 'clean')?.risk?.level).toBe('low'); + }); + + it('treats a green assertion-free flow as still risky (false confidence)', () => { + const flows = [flow('noassert', [testidStep('a')])]; // assertion-free + const runs = [{ kind: 'flow_replay', name: 'noassert', status: 'pass', at: 1 }] as Parameters< + typeof buildDomainModel + >[2]; + const m = buildDomainModel(flows, null, runs); + // passed clean, but asserts nothing → medium, not low. 
+ expect(m.flows[0]?.risk?.level).toBe('medium'); + }); + + it('omits risk entirely when no run history is supplied', () => { + const m = buildDomainModel([flow('f', [testidStep('a')], { signal: 's' })], null); + expect(m.flows[0]?.risk).toBeUndefined(); + expect(m.riskRanked).toEqual([]); + }); }); diff --git a/packages/server/src/domain/domain-model.ts b/packages/server/src/domain/domain-model.ts index 9fcccc3..4b3c2f4 100644 --- a/packages/server/src/domain/domain-model.ts +++ b/packages/server/src/domain/domain-model.ts @@ -18,8 +18,10 @@ import { type CapabilitiesContract, type FlowFile, type FlowStep, + type RunRecord, } from '@syrin/iris-protocol'; import { classifyFlowAssertions, FlowAssertionGrade } from '../flows/flow-classify.js'; +import { flowRisk, latestRun, rankByRisk, type FlowRisk } from './flow-risk.js'; export interface DomainFlowSummary { name: string; @@ -30,6 +32,8 @@ export interface DomainFlowSummary { warning?: string; signals: string[]; testids: string[]; + /** Combined run-history + assertion-quality risk (present when run history is supplied). */ + risk?: FlowRisk; } export interface DomainGaps { @@ -47,6 +51,8 @@ export interface DomainModel { declared: { testids: number; signals: string[]; stores: string[] }; coverage: { asserted: number; presenceOnly: number; assertionFree: number }; gaps: DomainGaps; + /** Flow names worst-risk first (run-history + assertion quality). Empty without run history. */ + riskRanked: string[]; /** One-line headline an agent (or human) can read at a glance. */ summary: string; } @@ -85,8 +91,10 @@ const EMPTY_CONTRACT: CapabilitiesContract = { testids: [], signals: [], stores: export function buildDomainModel( flows: readonly FlowFile[], contract: CapabilitiesContract | null, + runs: readonly RunRecord[] = [], ): DomainModel { const caps = contract ?? 
EMPTY_CONTRACT; + const hasHistory = runs.length > 0; const flowSummaries: DomainFlowSummary[] = flows.map((flow) => { const c = classifyFlowAssertions(flow); @@ -99,6 +107,7 @@ export function buildDomainModel( testids: flowTestids(flow), }; if (c.warning !== undefined) summary.warning = c.warning; + if (hasHistory) summary.risk = flowRisk(c.grade, latestRun(flow.name, runs)); return summary; }); @@ -118,12 +127,21 @@ export function buildDomainModel( declaredUntestedTestids: caps.testids.filter((t) => !testedTestids.has(t)), }; + const riskRanked = hasHistory + ? rankByRisk( + flowSummaries + .filter((f) => f.risk !== undefined) + .map((f) => ({ name: f.name, risk: f.risk as FlowRisk })), + ) + : []; + return { flowCount: flows.length, flows: flowSummaries, declared: { testids: caps.testids.length, signals: caps.signals, stores: caps.stores }, coverage, gaps, + riskRanked, summary: buildSummary(flows.length, coverage, gaps), }; } diff --git a/packages/server/src/domain/domain-tools.ts b/packages/server/src/domain/domain-tools.ts index 18eeaaf..aed2a46 100644 --- a/packages/server/src/domain/domain-tools.ts +++ b/packages/server/src/domain/domain-tools.ts @@ -45,6 +45,11 @@ export const DOMAIN_TOOLS: ToolDef[] = [ declaredUntestedSignals: z.array(z.string()), declaredUntestedTestids: z.array(z.string()), }), + riskRanked: z + .array(z.string()) + .describe( + 'Flow names worst-risk first (run history + assertion quality). Test these first.', + ), summary: z.string(), }, handler: async (deps: ToolDeps) => { @@ -55,7 +60,9 @@ export const DOMAIN_TOOLS: ToolDef[] = [ if (loaded.ok) flows.push(loaded.value); } const contract = await readContract(deps.fs, deps.irisRoot); - return buildDomainModel(flows, contract.ok ? contract.capabilities : null); + const project = await deps.project.read(); + const runs = project.ok ? project.file.runs : []; + return buildDomainModel(flows, contract.ok ? 
contract.capabilities : null, runs); }, }, ]; diff --git a/packages/server/src/domain/flow-risk.test.ts b/packages/server/src/domain/flow-risk.test.ts new file mode 100644 index 0000000..bbfa587 --- /dev/null +++ b/packages/server/src/domain/flow-risk.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, it } from 'vitest'; +import { RunKind, RunStatus, type RunRecord } from '@syrin/iris-protocol'; +import { FlowAssertionGrade } from '../flows/flow-classify.js'; +import { flowRisk, latestRun, rankByRisk, RiskLevel } from './flow-risk.js'; + +function run( + name: string, + status: RunStatus, + at: number, + extra: Partial<RunRecord> = {}, +): RunRecord { + return { kind: RunKind.FLOW_REPLAY, name, status, at, ...extra }; +} + +describe('latestRun', () => { + it('picks the most recent run for the name', () => { + const runs = [ + run('a', RunStatus.PASS, 1), + run('a', RunStatus.ERROR, 3), + run('a', RunStatus.PASS, 2), + ]; + expect(latestRun('a', runs)?.at).toBe(3); + }); + it('is undefined when the flow was never run', () => { + expect(latestRun('x', [run('a', RunStatus.PASS, 1)])).toBeUndefined(); + }); +}); + +describe('flowRisk — worse of run-history and assertion quality', () => { + it('a failed last run is high regardless of grade', () => { + expect(flowRisk(FlowAssertionGrade.ASSERTED, run('a', RunStatus.ERROR, 1)).level).toBe( + RiskLevel.HIGH, + ); + }); + it('a clean asserted run is low', () => { + expect(flowRisk(FlowAssertionGrade.ASSERTED, run('a', RunStatus.PASS, 1)).level).toBe( + RiskLevel.LOW, + ); + }); + it('a clean run with logged errors is medium', () => { + const r = run('a', RunStatus.PASS, 1, { evidence: { consoleErrors: 2 } }); + expect(flowRisk(FlowAssertionGrade.ASSERTED, r).level).toBe(RiskLevel.MEDIUM); + }); + it('a clean run of an assertion-free flow is still medium (false confidence)', () => { + expect(flowRisk(FlowAssertionGrade.ASSERTION_FREE, run('a', RunStatus.PASS, 1)).level).toBe( + RiskLevel.MEDIUM, + ); + }); + it('a never-run flow is 
unknown', () => { + expect(flowRisk(FlowAssertionGrade.ASSERTED, undefined).level).toBe(RiskLevel.UNKNOWN); + }); +}); + +describe('rankByRisk', () => { + it('orders worst-first, ties broken by name', () => { + const ranked = rankByRisk([ + { name: 'b', risk: { level: RiskLevel.LOW, reason: '' } }, + { name: 'a', risk: { level: RiskLevel.HIGH, reason: '' } }, + { name: 'c', risk: { level: RiskLevel.LOW, reason: '' } }, + ]); + expect(ranked).toEqual(['a', 'b', 'c']); + }); +}); diff --git a/packages/server/src/domain/flow-risk.ts b/packages/server/src/domain/flow-risk.ts new file mode 100644 index 0000000..aafe635 --- /dev/null +++ b/packages/server/src/domain/flow-risk.ts @@ -0,0 +1,92 @@ +/** + * Risk-rank flows so an agent tests the riskiest first. Risk combines two signals: + * - run history (.iris/project.json): a flow whose last run errored/drifted, or passed but with + * console/network errors, is riskier than one that passed clean; a never-run flow is unknown. + * - assertion quality (flow-classify): a flow that asserts no consequence is risky EVEN when it + * "passes" — a green assertion-free flow is false confidence (Fowler/Dodds), so it can't be + * trusted to catch a regression. + * + * Taking the worse of the two means "passed clean last time" never hides "but it asserts nothing". + * Pure: no IO, no clock. + */ + +import { RunStatus, type RunRecord } from '@syrin/iris-protocol'; +import { FlowAssertionGrade } from '../flows/flow-classify.js'; + +export const RiskLevel = { + HIGH: 'high', + MEDIUM: 'medium', + LOW: 'low', + UNKNOWN: 'unknown', +} as const; +export type RiskLevel = (typeof RiskLevel)[keyof typeof RiskLevel]; + +// Test-priority order: a known failure first, then a known weakness, then a NEVER-RUN flow +// (unvalidated — worth running before a known-clean one), then a clean low-risk flow last. 
+const RANK: Record<RiskLevel, number> = { + [RiskLevel.HIGH]: 3, + [RiskLevel.MEDIUM]: 2, + [RiskLevel.UNKNOWN]: 1, + [RiskLevel.LOW]: 0, +}; + +export interface FlowRisk { + level: RiskLevel; + reason: string; + lastStatus?: RunStatus; +} + +/** Most recent run for a flow name (by `at`), or undefined if it has never been recorded. */ +export function latestRun(name: string, runs: readonly RunRecord[]): RunRecord | undefined { + let best: RunRecord | undefined; + for (const run of runs) { + if (run.name === name && (best === undefined || run.at > best.at)) best = run; + } + return best; +} + +function runRisk(run: RunRecord | undefined): { level: RiskLevel; reason: string } { + if (run === undefined) return { level: RiskLevel.UNKNOWN, reason: 'never run' }; + if (run.status === RunStatus.ERROR || run.status === RunStatus.FAIL) { + return { level: RiskLevel.HIGH, reason: 'last run failed' }; + } + if (run.status === RunStatus.DRIFT) return { level: RiskLevel.HIGH, reason: 'last run drifted' }; + const errors = (run.evidence?.consoleErrors ?? 0) + (run.evidence?.networkErrors ?? 0); + if (errors > 0) { + return { + level: RiskLevel.MEDIUM, + reason: `last run passed but logged ${String(errors)} error(s)`, + }; + } + return { level: RiskLevel.LOW, reason: 'last run passed clean' }; +} + +function gradeRisk(grade: string): { level: RiskLevel; reason: string } { + if (grade === FlowAssertionGrade.ASSERTION_FREE) { + return { + level: RiskLevel.MEDIUM, + reason: 'asserts no consequence — a green run proves little', + }; + } + if (grade === FlowAssertionGrade.PRESENCE_ONLY) { + return { level: RiskLevel.LOW, reason: 'presence-only assertion' }; + } + return { level: RiskLevel.LOW, reason: 'asserts a consequence' }; +} + +/** The worse of run-history risk and assertion-quality risk. */ +export function flowRisk(grade: string, run: RunRecord | undefined): FlowRisk { + const r = runRisk(run); + const g = gradeRisk(grade); + const top = RANK[r.level] >= RANK[g.level] ? 
r : g; + return run === undefined + ? { level: top.level, reason: top.reason } + : { level: top.level, reason: top.reason, lastStatus: run.status }; +} + +/** Order flow names worst-risk first (HIGH→MEDIUM→UNKNOWN→LOW), ties broken by name for stable output. */ +export function rankByRisk(entries: { name: string; risk: FlowRisk }[]): string[] { + return [...entries] + .sort((a, b) => RANK[b.risk.level] - RANK[a.risk.level] || a.name.localeCompare(b.name)) + .map((e) => e.name); +} From 70b9979ef0c5cca1f7ae7baa58d1bf7410b10b6e Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 07:54:30 +0530 Subject: [PATCH 15/33] feat(domain): headline the riskiest flow in the iris_domain summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the risk-ranking work: the one-line `summary` now names the flow to test first ("test first: <name> (<reason>)") when run history flags a high/medium risk flow — so the single most actionable fact (where the real regression risk is) is in the headline, not buried in the arrays. Omitted when the top flow is only low risk (no false alarm). Pure; 2 new tests; 529 server tests; all 16 tasks green uncached. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/server/src/domain/domain-model.test.ts | 10 ++++++++++ packages/server/src/domain/domain-model.ts | 17 +++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/packages/server/src/domain/domain-model.test.ts b/packages/server/src/domain/domain-model.test.ts index f6412cd..a086d7c 100644 --- a/packages/server/src/domain/domain-model.test.ts +++ b/packages/server/src/domain/domain-model.test.ts @@ -86,6 +86,16 @@ describe('buildDomainModel', () => { expect(m.riskRanked[0]).toBe('broken'); // failed run surfaces first expect(m.flows.find((f) => f.name === 'broken')?.risk?.level).toBe('high'); expect(m.flows.find((f) => f.name === 'clean')?.risk?.level).toBe('low'); + // the summary headlines the riskiest flow to test first. + expect(m.summary).toContain('test first: broken'); + }); + + it('omits the "test first" headline when the top flow is only low risk', () => { + const flows = [flow('clean', [testidStep('a')], { signal: 's' })]; + const runs = [{ kind: 'flow_replay', name: 'clean', status: 'pass', at: 1 }] as Parameters< + typeof buildDomainModel + >[2]; + expect(buildDomainModel(flows, null, runs).summary).not.toContain('test first'); }); it('treats a green assertion-free flow as still risky (false confidence)', () => { diff --git a/packages/server/src/domain/domain-model.ts b/packages/server/src/domain/domain-model.ts index 4b3c2f4..4ab750d 100644 --- a/packages/server/src/domain/domain-model.ts +++ b/packages/server/src/domain/domain-model.ts @@ -21,7 +21,7 @@ import { type RunRecord, } from '@syrin/iris-protocol'; import { classifyFlowAssertions, FlowAssertionGrade } from '../flows/flow-classify.js'; -import { flowRisk, latestRun, rankByRisk, type FlowRisk } from './flow-risk.js'; +import { flowRisk, latestRun, rankByRisk, RiskLevel, type FlowRisk } from './flow-risk.js'; export interface DomainFlowSummary { name: string; @@ -135,6 +135,15 @@ export function buildDomainModel( ) : 
[]; + // The most actionable fact — the riskiest flow to test first — when run history flagged one. + const top = riskRanked[0]; + const topFlow = top === undefined ? undefined : flowSummaries.find((f) => f.name === top); + const topRisk = + topFlow?.risk !== undefined && + (topFlow.risk.level === RiskLevel.HIGH || topFlow.risk.level === RiskLevel.MEDIUM) + ? { name: topFlow.name, reason: topFlow.risk.reason } + : undefined; + return { flowCount: flows.length, flows: flowSummaries, @@ -142,7 +151,7 @@ export function buildDomainModel( coverage, gaps, riskRanked, - summary: buildSummary(flows.length, coverage, gaps), + summary: buildSummary(flows.length, coverage, gaps, topRisk), }; } @@ -150,6 +159,7 @@ function buildSummary( flowCount: number, coverage: DomainModel['coverage'], gaps: DomainGaps, + topRisk: { name: string; reason: string } | undefined, ): string { if (flowCount === 0) { return 'No saved flows yet — record the critical journeys (iris_record_start) so the agent learns the app.'; @@ -157,6 +167,9 @@ function buildSummary( const parts = [ `${String(flowCount)} flow${flowCount === 1 ? 
'' : 's'}: ${String(coverage.asserted)} asserted, ${String(coverage.presenceOnly)} presence-only, ${String(coverage.assertionFree)} assertion-free`, ]; + if (topRisk !== undefined) { + parts.push(`test first: ${topRisk.name} (${topRisk.reason})`); + } if (gaps.declaredUntestedSignals.length > 0) { parts.push( `${String(gaps.declaredUntestedSignals.length)} declared signal(s) no flow asserts (${gaps.declaredUntestedSignals.join(', ')})`, From 25ed01fe0b1b44ccb26de92b28e5e72710e8b683 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 12:35:52 +0530 Subject: [PATCH 16/33] fix(flows): iris_flow_list returns {name,path} objects to match its outputSchema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The handler returned bare flow-name strings while its declared outputSchema promises {name, path, createdAt?} objects — so a schema-validating MCP client rejected the result ("Expected object, received string"). Caught for real while driving the live demo via MCP. The unit test had codified the wrong (string[]) shape, so it passed while the contract was violated — corrected too. Now maps each name → {name, path: flowPath(irisRoot, name)}. 529 server tests; all gates green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/server/src/flows/flow-tools.ts | 9 ++++++++- packages/server/src/flows/tools.flows.test.ts | 9 +++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/packages/server/src/flows/flow-tools.ts b/packages/server/src/flows/flow-tools.ts index 319830e..34c7eed 100644 --- a/packages/server/src/flows/flow-tools.ts +++ b/packages/server/src/flows/flow-tools.ts @@ -21,6 +21,7 @@ import { asString } from '../tools/tools-helpers.js'; import { replayFlow } from './flow-replay.js'; import { classifyFlowAssertions } from './flow-classify.js'; import { assertSuccess, dynamicTestids, successLabel } from './flow-success.js'; +import { flowPath } from '../project/iris-dir.js'; import { collectProposals } from './heal.js'; import type { FlowStepResult } from '@syrin/iris-protocol'; import { waitForPredicate } from '../events/predicate.js'; @@ -154,7 +155,13 @@ export const FLOW_TOOLS: ToolDef[] = [ z.object({ name: z.string(), path: z.string(), createdAt: z.number().optional() }), ), }, - handler: (deps: ToolDeps) => deps.flows.list().then((flows) => ({ flows })), + // Return {name, path} objects to MATCH the declared outputSchema. Returning bare name strings + // (the prior bug) made schema-validating MCP clients reject the result ("expected object, + // received string") — caught driving the live demo. 
+ handler: (deps: ToolDeps) => + deps.flows.list().then((names) => ({ + flows: names.map((name) => ({ name, path: flowPath(deps.irisRoot, name) })), + })), }, { name: IrisTool.FLOW_LOAD, diff --git a/packages/server/src/flows/tools.flows.test.ts b/packages/server/src/flows/tools.flows.test.ts index f1b8abc..8917a92 100644 --- a/packages/server/src/flows/tools.flows.test.ts +++ b/packages/server/src/flows/tools.flows.test.ts @@ -130,8 +130,13 @@ describe('iris_flow_save / iris_flow_load handlers', () => { expect(loaded.flowName).toBe('checkout'); expect(loaded.steps[0]?.anchor).toEqual({ kind: 'testid', value: 'pay' }); - const list = (await tool(IrisTool.FLOW_LIST).handler(deps, {})) as { flows: string[] }; - expect(list.flows).toEqual(['checkout']); + // FLOW_LIST returns {name, path} objects (matches its outputSchema — schema-validating MCP + // clients reject bare strings). + const list = (await tool(IrisTool.FLOW_LIST).handler(deps, {})) as { + flows: { name: string; path: string }[]; + }; + expect(list.flows.map((f) => f.name)).toEqual(['checkout']); + expect(list.flows[0]?.path).toContain('checkout'); }); it('3: a recorded expect.signal survives the round-trip', async () => { From 52cf56fdd2e6cba83972f04dad461811ee861960 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 13:10:25 +0530 Subject: [PATCH 17/33] feat(observe): nudge the agent to scope a large timeline (cost.recommendation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Observed live driving the demo: a single login flooded iris_observe with 319 events / ~37KB because the dashboard's count-up animations emit a dom.text per frame — the agent only avoided the token hit because it knew to pass filters. Now costHint adds a `recommendation` when a timeline is large (>=80 events or >=8KB) telling the agent to pass filters:[...] or max_events next time. Surfaced in iris_observe's cost output schema. 
Backed by real session data; V1-ROADMAP M2. 3 new tests; 532 server tests; all 16 tasks green uncached. (No daemon restart — the live demo session is untouched.) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/session/output-budget.test.ts | 17 ++++++++++++++++ packages/server/src/session/output-budget.ts | 20 +++++++++++++++++-- packages/server/src/tools/tools.ts | 6 ++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/packages/server/src/session/output-budget.test.ts b/packages/server/src/session/output-budget.test.ts index 36ac239..51d2ee9 100644 --- a/packages/server/src/session/output-budget.test.ts +++ b/packages/server/src/session/output-budget.test.ts @@ -43,6 +43,23 @@ describe('costHint', () => { it('includes droppedOldest only when something was dropped', () => { expect(costHint({}, 1, 4).droppedOldest).toBe(4); }); + + it('adds no recommendation for a small timeline', () => { + expect(costHint({ a: 1 }, 5).recommendation).toBeUndefined(); + }); + + it('recommends scoping when the event count is large (observed: login flooded ~319)', () => { + const c = costHint({ a: 1 }, 319); + expect(c.recommendation).toBeDefined(); + expect(c.recommendation).toContain('filters'); + expect(c.recommendation).toContain('319'); + }); + + it('recommends scoping when the byte size is large even with few events', () => { + const big = { blob: 'x'.repeat(9000) }; + const c = costHint(big, 3); + expect(c.recommendation).toBeDefined(); + }); }); describe('estimateTokens', () => { diff --git a/packages/server/src/session/output-budget.ts b/packages/server/src/session/output-budget.ts index df5ef51..c5db49c 100644 --- a/packages/server/src/session/output-budget.ts +++ b/packages/server/src/session/output-budget.ts @@ -10,8 +10,19 @@ export interface CostHint { events: number; bytes: number; droppedOldest?: number; + /** Present when the timeline is large — tells the agent to scope its NEXT call (cut tokens). 
*/ + recommendation?: string; } +/** + * Above this, a timeline is big enough that re-reading it unscoped is a real token tax. Observed + * live: a single login flooded 319 events / ~37KB because the dashboard's count-up animations emit + * a dom.text per frame — the agent only dodged the cost by knowing to pass filters. The hint now + * tells it. + */ +const LARGE_TIMELINE_EVENTS = 80; +const LARGE_TIMELINE_BYTES = 8000; + /** Keep only the most recent `maxEvents` events; report how many older ones were dropped. */ export function applyEventBudget( events: IrisEvent[], @@ -28,8 +39,13 @@ export function applyEventBudget( /** Build a cost hint from a payload + the event count it carries. */ export function costHint(payload: unknown, events: number, droppedOldest = 0): CostHint { - const bytes = JSON.stringify(payload)?.length ?? 0; - return droppedOldest > 0 ? { events, bytes, droppedOldest } : { events, bytes }; + const json = JSON.stringify(payload) ?? ''; + const bytes = json.length; + const base: CostHint = droppedOldest > 0 ? { events, bytes, droppedOldest } : { events, bytes }; + if (events >= LARGE_TIMELINE_EVENTS || bytes >= LARGE_TIMELINE_BYTES) { + base.recommendation = `large timeline (${String(events)} events, ~${String(estimateTokens(json))} tokens) — pass filters:[...] (e.g. 
["signal","net"]) or max_events to scope your next call and cut tokens`; + } + return base; } /** diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 675387b..3fe4daa 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -699,6 +699,12 @@ export const TOOLS: ToolDef[] = [ events: z.number(), bytes: z.number(), droppedOldest: z.number().optional(), + recommendation: z + .string() + .optional() + .describe( + 'Present when the timeline is large — scope your next call (filters/max_events).', + ), }), session: z .object({ lastSeenMs: z.number(), throttled: z.boolean(), focused: z.boolean() }) From 814bbf0873dab31c2923e4163546c754d90ec59c Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 14:55:14 +0530 Subject: [PATCH 18/33] fix(browser): restore original fetch identity on network observer teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The network observer stored window.fetch.bind(window) and assigned that bound copy back on teardown, so window.fetch was never restored to its original identity — violating the "fully reversible" contract in types.ts. Keep the true original for restore; use a separate window-bound copy only for invocation (fetch throws "Illegal invocation" on the wrong `this`). Adds the first test coverage for installNetwork, pinning GET->500 emission (the case a live MCP session appeared to miss; the observer itself is sound, so that miss was a long-session/window artifact, not a code bug), POST method capture, reject-path status:0, and teardown identity. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../browser/src/observers/network.test.ts | 86 +++++++++++++++++++ packages/browser/src/observers/network.ts | 7 +- 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 packages/browser/src/observers/network.test.ts diff --git a/packages/browser/src/observers/network.test.ts b/packages/browser/src/observers/network.test.ts new file mode 100644 index 0000000..68cf7d4 --- /dev/null +++ b/packages/browser/src/observers/network.test.ts @@ -0,0 +1,86 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { EventType } from '@syrin/iris-protocol'; +import { installNetwork } from './network.js'; +import type { Emit, Teardown } from './types.js'; + +interface Emitted { + type: EventType; + data: Record; +} + +function collect(): { emit: Emit; events: Emitted[] } { + const events: Emitted[] = []; + const emit: Emit = (type, data) => { + events.push({ type, data }); + }; + return { emit, events }; +} + +/** A minimal Response stand-in — jsdom does not always expose a usable global Response. */ +function fakeResponse(status: number): Response { + return { status, ok: status >= 200 && status < 300 } as Response; +} + +describe('installNetwork (fetch)', () => { + let teardown: Teardown | undefined; + const origFetch = window.fetch; + + beforeEach(() => { + // Ensure there is a fetch for the observer to wrap; each test overrides the behavior. 
+ window.fetch = vi.fn(() => Promise.resolve(fakeResponse(200))); + }); + + afterEach(() => { + teardown?.(); + teardown = undefined; + window.fetch = origFetch; + }); + + it('emits NET_REQUEST for a GET that resolves with a 500 (the regression that prompted this test)', async () => { + window.fetch = vi.fn(() => Promise.resolve(fakeResponse(500))); + const { emit, events } = collect(); + teardown = installNetwork(emit); + + const res = await window.fetch('http://localhost:8787/api/broken/500'); + + expect(res.status).toBe(500); + expect(events).toHaveLength(1); + expect(events[0]?.type).toBe(EventType.NET_REQUEST); + expect(events[0]?.data).toMatchObject({ + method: 'GET', + url: 'http://localhost:8787/api/broken/500', + status: 500, + ok: false, + initiator: 'fetch', + }); + }); + + it('captures the method from init for a POST', async () => { + window.fetch = vi.fn(() => Promise.resolve(fakeResponse(200))); + const { emit, events } = collect(); + teardown = installNetwork(emit); + + await window.fetch('http://localhost:8787/api/login', { method: 'POST' }); + + expect(events[0]?.data).toMatchObject({ method: 'POST', status: 200, ok: true }); + }); + + it('emits a NET_REQUEST with status 0 and rethrows when the fetch rejects', async () => { + const boom = new Error('network down'); + window.fetch = vi.fn(() => Promise.reject(boom)); + const { emit, events } = collect(); + teardown = installNetwork(emit); + + await expect(window.fetch('http://localhost:8787/api/x')).rejects.toBe(boom); + expect(events).toHaveLength(1); + expect(events[0]?.data).toMatchObject({ status: 0, ok: false, error: 'network down' }); + }); + + it('restores the original fetch on teardown', () => { + const before = window.fetch; + const t = installNetwork(collect().emit); + expect(window.fetch).not.toBe(before); + t(); + expect(window.fetch).toBe(before); + }); +}); diff --git a/packages/browser/src/observers/network.ts b/packages/browser/src/observers/network.ts index 5bca6ae..a2b4599 100644 
--- a/packages/browser/src/observers/network.ts +++ b/packages/browser/src/observers/network.ts @@ -21,14 +21,17 @@ function methodOf(input: RequestInfo | URL, init: RequestInit | undefined): stri /** Patch fetch + XMLHttpRequest to emit net.request events. Fully reversible. */ export function installNetwork(emit: Emit): Teardown { - const origFetch = window.fetch.bind(window); + // Keep the true original for teardown identity, plus a window-bound copy to invoke + // (fetch throws "Illegal invocation" if called with the wrong `this`). + const origFetch = window.fetch; + const callFetch = origFetch.bind(window); window.fetch = async (input: RequestInfo | URL, init?: RequestInit): Promise => { const start = performance.now(); const method = methodOf(input, init); const url = urlOf(input); try { - const res = await origFetch(input, init); + const res = await callFetch(input, init); emit(EventType.NET_REQUEST, { method, url, From e9a667082b4a4eda245888167ded56b1624fa030 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 14:58:41 +0530 Subject: [PATCH 19/33] fix(browser): restore original identity on route + console observer teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same bind-on-restore bug audited from the network observer fix: route.ts and console.ts stored bound copies (history.pushState.bind / console[m].bind) and assigned those back on teardown, so the globals were never restored to their original identity — breaking the "reversible" contract. Keep the true originals for restore; use bound copies only for invocation. Adds first-ever observer-level tests for both (emit + teardown identity). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../browser/src/observers/console.test.ts | 51 +++++++++++++++++++ packages/browser/src/observers/console.ts | 6 ++- packages/browser/src/observers/route.test.ts | 49 ++++++++++++++++++ packages/browser/src/observers/route.ts | 14 +++-- 4 files changed, 114 insertions(+), 6 deletions(-) create mode 100644 packages/browser/src/observers/console.test.ts create mode 100644 packages/browser/src/observers/route.test.ts diff --git a/packages/browser/src/observers/console.test.ts b/packages/browser/src/observers/console.test.ts new file mode 100644 index 0000000..ec3682a --- /dev/null +++ b/packages/browser/src/observers/console.test.ts @@ -0,0 +1,51 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { EventType } from '@syrin/iris-protocol'; +import { installConsole } from './console.js'; +import type { Emit, Teardown } from './types.js'; + +interface Emitted { + type: EventType; + data: Record; +} + +function collect(): { emit: Emit; events: Emitted[] } { + const events: Emitted[] = []; + const emit: Emit = (type, data) => { + events.push({ type, data }); + }; + return { emit, events }; +} + +describe('installConsole', () => { + let teardown: Teardown | undefined; + + afterEach(() => { + teardown?.(); + teardown = undefined; + }); + + it('emits CONSOLE_ERROR and still forwards to the original console', () => { + const { emit, events } = collect(); + teardown = installConsole(emit); + + console.error('boom', 42); + + expect(events).toHaveLength(1); + expect(events[0]?.type).toBe(EventType.CONSOLE_ERROR); + expect(events[0]?.data.message).toBe('boom 42'); + }); + + it('restores the original console methods (identity) on teardown', () => { + /* eslint-disable no-console -- asserting console.log identity, not logging */ + const beforeLog = console.log; + const beforeWarn = console.warn; + const beforeError = console.error; + const t = installConsole(collect().emit); + 
expect(console.error).not.toBe(beforeError); + t(); + expect(console.log).toBe(beforeLog); + expect(console.warn).toBe(beforeWarn); + expect(console.error).toBe(beforeError); + /* eslint-enable no-console */ + }); +}); diff --git a/packages/browser/src/observers/console.ts b/packages/browser/src/observers/console.ts index 2fa8203..79b951f 100644 --- a/packages/browser/src/observers/console.ts +++ b/packages/browser/src/observers/console.ts @@ -27,11 +27,13 @@ export function installConsole(emit: Emit): Teardown { const originals = new Map void>(); for (const method of methods) { - const original = console[method].bind(console) as (...args: unknown[]) => void; + // Store the true original for teardown identity; call through a bound copy. + const original = console[method] as (...args: unknown[]) => void; originals.set(method, original); + const callOriginal = original.bind(console); console[method] = (...args: unknown[]): void => { emit(METHOD_EVENT[method], { message: stringifyArgs(args) }); - original(...args); + callOriginal(...args); }; } diff --git a/packages/browser/src/observers/route.test.ts b/packages/browser/src/observers/route.test.ts new file mode 100644 index 0000000..594a196 --- /dev/null +++ b/packages/browser/src/observers/route.test.ts @@ -0,0 +1,49 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { EventType } from '@syrin/iris-protocol'; +import { installRoute } from './route.js'; +import type { Emit, Teardown } from './types.js'; + +interface Emitted { + type: EventType; + data: Record; +} + +function collect(): { emit: Emit; events: Emitted[] } { + const events: Emitted[] = []; + const emit: Emit = (type, data) => { + events.push({ type, data }); + }; + return { emit, events }; +} + +describe('installRoute', () => { + let teardown: Teardown | undefined; + + afterEach(() => { + teardown?.(); + teardown = undefined; + }); + + it('emits ROUTE_CHANGE on pushState to a new url', () => { + const { emit, events } = collect(); + 
teardown = installRoute(emit); + + history.pushState({}, '', '/next'); + + expect(events).toHaveLength(1); + expect(events[0]?.type).toBe(EventType.ROUTE_CHANGE); + expect(String(events[0]?.data.pathname)).toBe('/next'); + }); + + it('restores the original history methods (identity) on teardown', () => { + /* eslint-disable @typescript-eslint/unbound-method -- comparing method identity, not calling */ + const beforePush = history.pushState; + const beforeReplace = history.replaceState; + const t = installRoute(collect().emit); + expect(history.pushState).not.toBe(beforePush); + t(); + expect(history.pushState).toBe(beforePush); + expect(history.replaceState).toBe(beforeReplace); + /* eslint-enable @typescript-eslint/unbound-method */ + }); +}); diff --git a/packages/browser/src/observers/route.ts b/packages/browser/src/observers/route.ts index 5b36257..34d8ccd 100644 --- a/packages/browser/src/observers/route.ts +++ b/packages/browser/src/observers/route.ts @@ -12,8 +12,14 @@ function snapshotLocation(): { pathname: string; search: string; hash: string; h /** Patch the History API + listen to popstate/hashchange to emit route.change. */ export function installRoute(emit: Emit): Teardown { - const origPush = history.pushState.bind(history); - const origReplace = history.replaceState.bind(history); + // Keep the true originals for teardown identity; bound copies are used for invocation + // (History methods throw "Illegal invocation" if called with the wrong `this`). 
+ /* eslint-disable @typescript-eslint/unbound-method -- captured to restore exact identity on teardown */ + const origPush = history.pushState; + const origReplace = history.replaceState; + /* eslint-enable @typescript-eslint/unbound-method */ + const callPush = origPush.bind(history); + const callReplace = origReplace.bind(history); const fire = (from: string): void => { const to = snapshotLocation(); @@ -29,12 +35,12 @@ export function installRoute(emit: Emit): Teardown { history.pushState = (data: unknown, unused: string, url?: string | URL | null): void => { const from = location.href; - origPush(data, unused, url ?? null); + callPush(data, unused, url ?? null); fire(from); }; history.replaceState = (data: unknown, unused: string, url?: string | URL | null): void => { const from = location.href; - origReplace(data, unused, url ?? null); + callReplace(data, unused, url ?? null); fire(from); }; From a7775a67569ad446b61f5f6e57d7d91b01be6576 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:10:10 +0530 Subject: [PATCH 20/33] =?UTF-8?q?feat(server):=20add=20deterministic=20'se?= =?UTF-8?q?ttled'=20predicate=20(M3=20=E2=80=94=20kills=20sleep-based=20fl?= =?UTF-8?q?akiness)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #1 flake cause in agent-driven UI testing is async-wait + concurrency handled with fixed sleeps. Adds a { kind: 'settled', quietMs } predicate: the page is settled when no network/DOM/animation activity has occurred for quietMs (default 500). "No activity in the last N ms" is relative to now, so the buffer clock is injected via PredicateSession.elapsed() (CLAUDE.md rule 7) and the wait loop's poll interval flips it to pass once activity stops. Usable today via iris_wait_for and iris_act_and_wait's `until` (PredicateSchema already gates both), composes inside allOf with the consequence you expect, and both tool descriptions now steer agents to it over raw sleeps. 
FlowReplaySession gains elapsed(); test fakes across server + the public @syrin/iris-test package updated to match. All 16 packages green, 537 server tests (+5 settled cases). Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/server/src/events/predicate.test.ts | 75 +++++++++++++++++++ packages/server/src/events/predicate.ts | 60 +++++++++++++++ .../src/flows/flow-replay.dynamic.test.ts | 4 + packages/server/src/flows/flow-replay.test.ts | 4 + packages/server/src/flows/flow-replay.ts | 2 + .../server/src/flows/flow-success.test.ts | 1 + packages/server/src/flows/flows.heal.test.ts | 3 + packages/server/src/tools/tools.ts | 4 +- packages/test/src/flow-spec.test.ts | 2 + packages/test/src/register.test.ts | 2 + packages/test/src/success-assert.test.ts | 1 + 11 files changed, 156 insertions(+), 2 deletions(-) diff --git a/packages/server/src/events/predicate.test.ts b/packages/server/src/events/predicate.test.ts index 0d45770..67438a6 100644 --- a/packages/server/src/events/predicate.test.ts +++ b/packages/server/src/events/predicate.test.ts @@ -18,8 +18,13 @@ class FakeSession implements PredicateSession { count: 0, elements: [], }), + private readonly nowMs = 0, ) {} + elapsed(): number { + return this.nowMs; + } + command(name: string, args: Record = {}): Promise { if (name === IrisCommand.MATCH) { const result = this.matcher(args['query'] ?? 
{}); @@ -209,6 +214,7 @@ describe('predicate engine', () => { command: () => Promise.reject(new Error('session disconnected')), eventsSince: () => [], onEvent: () => () => undefined, + elapsed: () => 0, }; const result = await waitForPredicate( session, @@ -218,3 +224,72 @@ describe('predicate engine', () => { expect(result).toEqual({ pass: false, failureReason: 'session disconnected' }); }); }); + +describe('settled predicate (deterministic waiting)', () => { + it('passes when there has been no network/DOM/animation activity since the floor', async () => { + // Only a non-activity event (signal) in the buffer → nothing to settle → quiet. + const session = new FakeSession([ev(EventType.SIGNAL, { name: 'x' }, 100)], undefined, 1000); + const r = await evaluatePredicate(session, { kind: 'settled' }, 0); + expect(r.pass).toBe(true); + }); + + it('fails while the last activity is more recent than quietMs', async () => { + // Last network call at t=900, now=1000 → 100ms quiet < 200ms required. + const session = new FakeSession( + [ev(EventType.NET_REQUEST, { url: '/api/x', status: 200 }, 900)], + undefined, + 1000, + ); + const r = await evaluatePredicate(session, { kind: 'settled', quietMs: 200 }, 0); + expect(r.pass).toBe(false); + expect(r.failureReason).toContain('not settled'); + expect((r.evidence as { quietForMs: number }).quietForMs).toBe(100); + }); + + it('passes once the quiet gap reaches quietMs (DOM mutation long enough ago)', async () => { + // Last DOM text mutation at t=500, now=1000 → 500ms quiet ≥ 200ms required. + const session = new FakeSession([ev(EventType.DOM_TEXT, { text: 'hi' }, 500)], undefined, 1000); + const r = await evaluatePredicate(session, { kind: 'settled', quietMs: 200 }, 0); + expect(r.pass).toBe(true); + expect((r.evidence as { quietForMs: number }).quietForMs).toBe(500); + }); + + it('respects the since floor: activity before the floor does not count', async () => { + // A burst at t=100, then quiet. 
Asserting from floor=900 ignores the old burst → settled. + const session = new FakeSession( + [ev(EventType.DOM_ADDED, {}, 100), ev(EventType.ANIM_START, { name: 'spin' }, 100)], + undefined, + 1000, + ); + expect((await evaluatePredicate(session, { kind: 'settled', quietMs: 200 }, 900)).pass).toBe( + true, + ); + // From the start (floor 0) the burst is in scope but it is 900ms old → still settled. + expect((await evaluatePredicate(session, { kind: 'settled', quietMs: 200 }, 0)).pass).toBe( + true, + ); + }); + + it('composes inside allOf with a consequence predicate', async () => { + const session = new FakeSession( + [ + ev(EventType.SIGNAL, { name: 'deploy:shipped', data: {} }, 600), + ev(EventType.NET_REQUEST, { url: '/api/deploy', status: 200 }, 600), + ], + undefined, + 1000, + ); + const r = await evaluatePredicate( + session, + { + kind: 'allOf', + predicates: [ + { kind: 'signal', name: 'deploy:shipped' }, + { kind: 'settled', quietMs: 300 }, + ], + }, + 0, + ); + expect(r.pass).toBe(true); + }); +}); diff --git a/packages/server/src/events/predicate.ts b/packages/server/src/events/predicate.ts index 6990d7b..11c08d0 100644 --- a/packages/server/src/events/predicate.ts +++ b/packages/server/src/events/predicate.ts @@ -15,6 +15,8 @@ export interface PredicateSession { command(name: string, args?: Record): Promise; eventsSince(cursor: number): IrisEvent[]; onEvent(listener: (event: IrisEvent) => void): () => void; + /** Milliseconds since connect — the same clock that stamps event `t` (injected, testable). */ + elapsed(): number; } /** The predicate DSL (plan/06). A declarative description of what should be true. 
*/ @@ -26,6 +28,7 @@ export type Predicate = | { kind: 'console'; level?: string; absent?: boolean; since?: number } | { kind: 'animation'; name?: string; target?: string; completed?: boolean } | { kind: 'signal'; name?: string; dataMatches?: Record } + | { kind: 'settled'; quietMs?: number } | { kind: 'allOf'; predicates: Predicate[] } | { kind: 'anyOf'; predicates: Predicate[] } | { kind: 'not'; predicate: Predicate }; @@ -73,6 +76,7 @@ export const PredicateSchema = z.lazy(() => name: z.string().optional(), dataMatches: z.record(z.unknown()).optional(), }), + z.object({ kind: z.literal('settled'), quietMs: z.number().positive().optional() }), z.object({ kind: z.literal('allOf'), predicates: z.array(PredicateSchema) }), z.object({ kind: z.literal('anyOf'), predicates: z.array(PredicateSchema) }), z.object({ kind: z.literal('not'), predicate: PredicateSchema }), @@ -313,6 +317,60 @@ function evalSignal(events: IrisEvent[], p: Extract = new Set([ + EventType.NET_REQUEST, + EventType.DOM_ADDED, + EventType.DOM_REMOVED, + EventType.DOM_ATTR, + EventType.DOM_TEXT, + EventType.ANIM_START, + EventType.ANIM_END, +]); + +/** Default quiet window — enough to absorb a render+xhr settle without waiting on slow polls. */ +const DEFAULT_QUIET_MS = 500; + +/** + * "The page has gone quiet": no network/DOM/animation activity for at least `quietMs`. Needs the + * wall-clock `now` (in the buffer's time base) because "no activity in the last N ms" is relative to + * now, not to any buffered event — so `now` is injected (CLAUDE.md rule 7), and the wait loop's + * poll interval is what eventually flips this to pass once activity stops. + */ +function evalSettled( + events: IrisEvent[], + p: Extract, + now: number, +): EvalResult { + const quietMs = p.quietMs ?? 
DEFAULT_QUIET_MS; + let lastT = -1; + let lastType: EventType | undefined; + for (const e of events) { + if (SETTLE_ACTIVITY.has(e.type) && e.t > lastT) { + lastT = e.t; + lastType = e.type; + } + } + if (lastT < 0) { + return { + pass: true, + evidence: { settled: true, quietForMs: null, note: 'no activity to settle' }, + }; + } + const quietForMs = now - lastT; + if (quietForMs >= quietMs) { + return { pass: true, evidence: { settled: true, quietForMs, lastActivity: lastType } }; + } + return { + pass: false, + failureReason: `not settled: last activity (${String(lastType)}) ${String(quietForMs)}ms ago, need ${String(quietMs)}ms quiet`, + evidence: { quietForMs, lastActivity: lastType }, + }; +} + /** * Evaluate a predicate once against the session's current state + event buffer. * @@ -346,6 +404,8 @@ export async function evaluatePredicate( return evalAnimation(events, predicate); case 'signal': return evalSignal(events, predicate); + case 'settled': + return evalSettled(events, predicate, session.elapsed()); case 'allOf': { const results = await Promise.all( predicate.predicates.map((p) => evaluatePredicate(session, p, since)), diff --git a/packages/server/src/flows/flow-replay.dynamic.test.ts b/packages/server/src/flows/flow-replay.dynamic.test.ts index 0cbf1e0..9b42dd3 100644 --- a/packages/server/src/flows/flow-replay.dynamic.test.ts +++ b/packages/server/src/flows/flow-replay.dynamic.test.ts @@ -57,6 +57,10 @@ class FakeSession implements FlowReplaySession { onEvent(): () => void { return () => undefined; } + + elapsed(): number { + return 0; + } } function step(value: string, expectTestid?: string): FlowStep { diff --git a/packages/server/src/flows/flow-replay.test.ts b/packages/server/src/flows/flow-replay.test.ts index e937680..077484a 100644 --- a/packages/server/src/flows/flow-replay.test.ts +++ b/packages/server/src/flows/flow-replay.test.ts @@ -81,6 +81,10 @@ class FakeSession implements FlowReplaySession { onEvent(): () => void { return () => 
undefined; } + + elapsed(): number { + return 0; + } } function el(ref: string, testid: string): ElementDescriptor { diff --git a/packages/server/src/flows/flow-replay.ts b/packages/server/src/flows/flow-replay.ts index 8c2b21f..d27cf42 100644 --- a/packages/server/src/flows/flow-replay.ts +++ b/packages/server/src/flows/flow-replay.ts @@ -25,6 +25,8 @@ export interface FlowReplaySession { command(name: string, args?: Record): Promise; eventsSince(cursor: number): IrisEvent[]; onEvent(listener: (event: IrisEvent) => void): () => void; + /** Buffer clock (ms since connect) — required by the predicate engine's `settled` check. */ + elapsed(): number; } /** The injected predicate-waiter (the real waitForPredicate) — reused, never reimplemented. */ diff --git a/packages/server/src/flows/flow-success.test.ts b/packages/server/src/flows/flow-success.test.ts index 5915a49..8680f20 100644 --- a/packages/server/src/flows/flow-success.test.ts +++ b/packages/server/src/flows/flow-success.test.ts @@ -16,6 +16,7 @@ function session(events: IrisEvent[], elementPresent = true): FlowReplaySession } as CommandResult), eventsSince: () => events, onEvent: () => () => undefined, + elapsed: () => 0, }; } diff --git a/packages/server/src/flows/flows.heal.test.ts b/packages/server/src/flows/flows.heal.test.ts index d1346a4..7582d2e 100644 --- a/packages/server/src/flows/flows.heal.test.ts +++ b/packages/server/src/flows/flows.heal.test.ts @@ -86,6 +86,9 @@ class FakeSession { onEvent(): () => void { return () => undefined; } + elapsed(): number { + return 0; + } } /** A session where `old` resolves to 0 elements with `present`, and any other testid resolves to 1. 
*/ diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 3fe4daa..0ab848b 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -594,7 +594,7 @@ export const TOOLS: ToolDef[] = [ 'Action-specific arguments: { value } for fill/select, { text } for type/press, { confirmDangerous: true } for a potentially destructive control.', ), until: PredicateSchema.describe( - 'Predicate to wait for after the action completes. Same shape accepted by iris_assert.', + 'Predicate to wait for after the action completes. Same shape accepted by iris_assert. For deterministic waiting instead of a sleep, use { kind: "settled", quietMs } to block until network/DOM/animation activity has been quiet for quietMs (default 500) — or allOf it with the consequence you expect.', ), timeout_ms: z .number() @@ -741,7 +741,7 @@ export const TOOLS: ToolDef[] = [ 'Block until a predicate is satisfied (or already true in the recent buffer), else time out. Returns matching evidence or a near-miss diagnosis. By default it only counts events since your last act, so a signal buffered BEFORE the action can never fake a pass; pass `since` (an observe/act cursor) to widen or narrow that window explicitly.', inputSchema: { predicate: PredicateSchema.describe( - 'Predicate to wait for: { signal }, { net }, { element } or a combination.', + 'Predicate to wait for: { signal }, { net }, { element }, { kind: "settled", quietMs } (deterministic network/DOM/animation idle — prefer this over a fixed sleep), or a combination via allOf/anyOf.', ), timeout_ms: z.number().optional().describe('Maximum wait in milliseconds. 
Default: 4000.'), since: z diff --git a/packages/test/src/flow-spec.test.ts b/packages/test/src/flow-spec.test.ts index 502931f..5b643f3 100644 --- a/packages/test/src/flow-spec.test.ts +++ b/packages/test/src/flow-spec.test.ts @@ -108,6 +108,7 @@ function fakeSession(config: FakeSessionConfig): FlowReplaySessionLike { command, eventsSince: () => events, onEvent: () => () => {}, + elapsed: () => 0, }; } @@ -116,6 +117,7 @@ interface FlowReplaySessionLike { command(name: string, args?: Record): Promise; eventsSince(cursor: number): IrisEvent[]; onEvent(listener: (event: IrisEvent) => void): () => void; + elapsed(): number; } /** A waiter that synchronously evaluates a signal predicate against the fake event buffer. */ diff --git a/packages/test/src/register.test.ts b/packages/test/src/register.test.ts index 1d4c253..8e4f8f8 100644 --- a/packages/test/src/register.test.ts +++ b/packages/test/src/register.test.ts @@ -57,6 +57,7 @@ interface SessionLike { command(name: string, args?: Record): Promise; eventsSince(cursor: number): IrisEvent[]; onEvent(listener: (event: IrisEvent) => void): () => void; + elapsed(): number; } function ok(result: unknown): CommandResult { @@ -82,6 +83,7 @@ function fakeSession(testids: string[]): SessionLike { }, eventsSince: () => [], onEvent: () => () => {}, + elapsed: () => 0, }; } diff --git a/packages/test/src/success-assert.test.ts b/packages/test/src/success-assert.test.ts index cef093a..3397745 100644 --- a/packages/test/src/success-assert.test.ts +++ b/packages/test/src/success-assert.test.ts @@ -8,6 +8,7 @@ const fakeSession: FlowReplaySession = { command: () => Promise.resolve({ kind: 'command_result', id: 'x', ok: true, result: {} }), eventsSince: () => [], onEvent: () => () => {}, + elapsed: () => 0, }; /** Records the predicate + timeout it was handed, and answers with a scripted verdict. 
*/ From 20b4007797f3289c023c79596062020422dc2155 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:15:42 +0530 Subject: [PATCH 21/33] feat(server): act_and_wait defaults to settle when `until` is omitted (M3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes "act then wait for the page to go quiet" a single zero-config call: omit `until` on iris_act_and_wait and it now waits for the { kind: 'settled' } predicate (network/DOM/animation idle) instead of requiring a predicate — the documented alternative to a fixed sleep. Explicit predicates are unchanged; to assert a consequence AND settle, allOf them. Tool + field descriptions updated to steer agents here. Adds onEvent to the live-control test fake so the wait loop is exercised. 538 server tests green (+1). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/session/tools.live-control.test.ts | 14 ++++++++++++++ packages/server/src/tools/tools.ts | 11 ++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/packages/server/src/session/tools.live-control.test.ts b/packages/server/src/session/tools.live-control.test.ts index 1a0458f..7877dd8 100644 --- a/packages/server/src/session/tools.live-control.test.ts +++ b/packages/server/src/session/tools.live-control.test.ts @@ -47,6 +47,7 @@ function fakeSession(opts: { state?: SessionState; inbox?: string[] }): FakeSess markActCursor: () => undefined, lastActCursor: () => undefined, eventsSince: () => [], + onEvent: () => () => undefined, health: () => ({ lastSeenMs: 0, throttled: false, focused: true }), throttled: () => false, command: (name: string, args?: Record): Promise => { @@ -206,6 +207,19 @@ describe('live-control: piggyback', () => { expect(res.control?.guidance).toHaveLength(1); }); + it('iris_act_and_wait with no `until` defaults to waiting for the page to settle', async () => { + const session = fakeSession({ state: SessionState.ACTIVE, inbox: [] }); + const 
res = (await tool(IrisTool.ACT_AND_WAIT).handler(fakeDeps(session), { + ...ACT_ARGS, + timeout_ms: 50, + })) as ControlShape; + // No buffered activity (eventsSince → []) → the implicit `settled` predicate passes at once. + const verdict = res.verdict as { pass: boolean; evidence?: { settled?: boolean } }; + expect(verdict.pass).toBe(true); + expect(verdict.evidence?.settled).toBe(true); + expect(session.__sent.filter((c) => c.name === 'act')).toHaveLength(1); + }); + it('iris_assert piggybacks control while paused (observe-only)', async () => { const session = fakeSession({ state: SessionState.PAUSED, inbox: ['note'] }); const res = (await tool(IrisTool.ASSERT).handler(fakeDeps(session), { diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 0ab848b..7067943 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -577,6 +577,7 @@ export const TOOLS: ToolDef[] = [ name: IrisTool.ACT_AND_WAIT, description: 'Act on a ref, then wait for a predicate to hold — one hop for the act->observe->assert loop. ' + + 'Omit `until` to wait for the page to settle (network/DOM/animation idle) — use this instead of a fixed sleep. ' + 'Returns { effect } (the action result), { verdict } (predicate pass/evidence/near-miss), ' + 'and { trace } (the reaction report of everything the app did after the action). ' + 'timeout_ms 0 evaluates the predicate once without waiting.', @@ -593,8 +594,8 @@ export const TOOLS: ToolDef[] = [ .describe( 'Action-specific arguments: { value } for fill/select, { text } for type/press, { confirmDangerous: true } for a potentially destructive control.', ), - until: PredicateSchema.describe( - 'Predicate to wait for after the action completes. Same shape accepted by iris_assert. 
For deterministic waiting instead of a sleep, use { kind: "settled", quietMs } to block until network/DOM/animation activity has been quiet for quietMs (default 500) — or allOf it with the consequence you expect.', + until: PredicateSchema.optional().describe( + 'Predicate to wait for after the action completes (same shape as iris_assert). OMIT to wait for the page to SETTLE — network/DOM/animation idle — the deterministic default instead of a sleep. To assert a consequence AND settle, allOf them: { kind: "allOf", predicates: [, { kind: "settled" }] }.', ), timeout_ms: z .number() @@ -626,7 +627,11 @@ export const TOOLS: ToolDef[] = [ const paused = pausedShortCircuit(session); if (paused !== undefined) return paused; refuseIfThrottled(session, args['refuseWhenThrottled']); - const until = PredicateSchema.parse(args['until']); + // Omitting `until` waits for the page to settle (idle) — the deterministic default vs a sleep. + const until = + args['until'] !== undefined + ? PredicateSchema.parse(args['until']) + : ({ kind: 'settled' } as const); const timeout = asNumber(args['timeout_ms']) ?? 4000; const since = session.elapsed(); From f86c676143cfaaa4f33f3b702e4ca4840e95c759 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:27:00 +0530 Subject: [PATCH 22/33] =?UTF-8?q?feat(server):=20flow=5Fheal=20verifies=20?= =?UTF-8?q?the=20consequence=20before=20persisting=20(M5=20=E2=80=94=20hea?= =?UTF-8?q?l=20the=20locator,=20never=20the=20intent)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit iris_flow_heal apply:true rewrote a drifted testid to its nearest match and wrote it to disk WITHOUT checking that the healed flow still does what it was testing. A rebind can resolve to a real but wrong element (a look-alike control) that no longer triggers the flow's success signal/net — persisting that ships a green test that asserts nothing. 
The maintenance-tax research is explicit: locator-only healing covers <1/3 of failures and can ship bugs. Now, before writing, apply re-replays the healed flow in memory and re-asserts its success consequence. If the consequence no longer fires, the write is REFUSED (status:consequence_broken, file untouched) and the proposal is still surfaced for a human. Flows with no declared success heal but say so loudly (the rebind couldn't be verified — add a success-state assertion). Extracts the pure applyHealChanges (shared by FlowStore.heal's on-disk writer and the in-memory verification so both rewrite identically). 541 server tests green (+3: verified-heal, refused-broken-consequence, unverified-no-success). Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/protocol/src/constants.ts | 1 + packages/server/src/flows/flow-tools.ts | 57 +++++++++++++-- packages/server/src/flows/flows.heal.test.ts | 73 +++++++++++++++++++- packages/server/src/flows/flows.ts | 20 +----- packages/server/src/flows/heal.ts | 32 +++++++++ 5 files changed, 159 insertions(+), 24 deletions(-) diff --git a/packages/protocol/src/constants.ts b/packages/protocol/src/constants.ts index e2c17aa..e2ffd24 100644 --- a/packages/protocol/src/constants.ts +++ b/packages/protocol/src/constants.ts @@ -269,6 +269,7 @@ export const HealStatus = { DRIFT: 'drift', // apply:false: confident proposal(s) returned, file untouched UNHEALABLE: 'unhealable', // drift exists but no proposal cleared the confidence floor NOTHING_TO_HEAL: 'nothing_to_heal', // replay was green + CONSEQUENCE_BROKEN: 'consequence_broken', // rebind resolves a locator but the flow's success consequence no longer fires — REFUSED (file untouched) ERROR: 'error', // flow missing/malformed/invalid-name, or a resolved action failed } as const; export type HealStatus = (typeof HealStatus)[keyof typeof HealStatus]; diff --git a/packages/server/src/flows/flow-tools.ts b/packages/server/src/flows/flow-tools.ts index 34c7eed..338a1b0 100644 --- 
a/packages/server/src/flows/flow-tools.ts +++ b/packages/server/src/flows/flow-tools.ts @@ -22,7 +22,7 @@ import { replayFlow } from './flow-replay.js'; import { classifyFlowAssertions } from './flow-classify.js'; import { assertSuccess, dynamicTestids, successLabel } from './flow-success.js'; import { flowPath } from '../project/iris-dir.js'; -import { collectProposals } from './heal.js'; +import { applyHealChanges, collectProposals } from './heal.js'; import type { FlowStepResult } from '@syrin/iris-protocol'; import { waitForPredicate } from '../events/predicate.js'; import type { FlowAnnotations } from './flows.js'; @@ -331,8 +331,11 @@ export const FLOW_TOOLS: ToolDef[] = [ 'Self-healing replay. Re-runs iris_flow_replay; on testid DRIFT computes confidence-scored ' + 'nearest-match rebind PROPOSALS. With apply:false (default) returns the proposed diff WITHOUT ' + 'writing. With apply:true, writes the confident rebind(s) back into .iris/flows/.json and ' + - 'returns what changed — never silently. A drift with no proposal above the confidence floor is ' + - 'status:unhealable (file untouched). Returns { name, status: healed|drift|unhealable|' + + 'returns what changed — never silently. Before writing, apply re-replays the healed flow and ' + + 're-asserts its success consequence: if the rebound locator resolves but the consequence no ' + + 'longer fires, the write is REFUSED (status:consequence_broken) — it heals the locator, never ' + + 'the intent. A drift with no proposal above the confidence floor is status:unhealable (file ' + + 'untouched). 
Returns { name, status: healed|drift|unhealable|consequence_broken|' + 'nothing_to_heal|error, applied, proposals[], changed[], message }.', inputSchema: { flowName: z.string().describe('Flow file name to heal (from iris_flow_list).'), @@ -364,9 +367,14 @@ export const FLOW_TOOLS: ToolDef[] = [ const HEAL_MESSAGES = { NOTHING: 'nothing to heal — every anchor resolved on replay', - HEALED: 'rewrote drifted testid anchors to their nearest surviving match', + HEALED: + "rewrote drifted testid anchors to their nearest surviving match and re-verified the flow's success consequence still fires", DRIFT_DRY: 'confident rebind(s) proposed — re-run with apply:true to write them to disk', UNHEALABLE: `drift found, but no nearest match cleared the confidence floor (HEAL_CONFIDENCE_MIN=${HEAL_CONFIDENCE_MIN}); file left untouched — add a data-testid or fix the flow by hand`, + HEALED_UNVERIFIED: + 'rewrote drifted testid anchors — but this flow declares no success consequence, so the rebind resolves a locator without proving the intent still holds. Add a success-state assertion (iris_annotate) so future heals can be verified.', + CONSEQUENCE_BROKEN: + 'rebind resolves the drifted locator to a surviving element, but the healed flow no longer satisfies its success consequence — refusing to write (a heal that loses the intent would ship a green-but-dead test). Fix by hand and verify', } as const; function toChange(proposal: HealProposal): HealChange { @@ -450,6 +458,44 @@ async function healFlow(deps: ToolDeps, args: Record): Promise< }; } + // M5 invariant — "heal the locator, never the intent." Before persisting, verify the rebind on a + // healed in-memory copy: a rebound testid can resolve to a real but WRONG element that no longer + // triggers the flow's success consequence (e.g. a look-alike control). Persisting that would ship + // a green flow that tests nothing. Re-replay the healed flow and assert its success; refuse the + // write if the consequence no longer fires. 
Flows with no declared success can't be verified — we + // still heal them but say so loudly so the gap is visible. + const { flow: healed } = applyHealChanges(loaded.value, proposals.map(toChange)); + if (healed.success !== undefined) { + const verifySteps = await replayFlow( + session, + healed, + waitForPredicate, + FLOW_SIGNAL_TIMEOUT_MS, + args['confirmDangerous'] === true, + ); + const verifyClean = + verifySteps.length > 0 && verifySteps.every((s) => s.ok && s.drift === undefined); + const verdict = verifyClean + ? await assertSuccess( + session, + healed.success, + dynamicTestids(healed), + waitForPredicate, + FLOW_SIGNAL_TIMEOUT_MS, + ) + : { pass: false, failureReason: 'healed flow did not replay cleanly' }; + if (!verdict.pass) { + return { + name, + status: HealStatus.CONSEQUENCE_BROKEN, + applied: false, + proposals, + changed: [], + message: `${HEAL_MESSAGES.CONSEQUENCE_BROKEN} (${successLabel(healed.success)}: ${verdict.failureReason ?? 'not satisfied'})`, + }; + } + } + const written = await deps.flows.heal(name, proposals.map(toChange)); if (!written.ok) { return { @@ -468,6 +514,7 @@ async function healFlow(deps: ToolDeps, args: Record): Promise< applied: written.value.changed.length > 0, proposals, changed: written.value.changed, - message: HEAL_MESSAGES.HEALED, + message: + loaded.value.success !== undefined ? 
HEAL_MESSAGES.HEALED : HEAL_MESSAGES.HEALED_UNVERIFIED, }; } diff --git a/packages/server/src/flows/flows.heal.test.ts b/packages/server/src/flows/flows.heal.test.ts index 7582d2e..25744c4 100644 --- a/packages/server/src/flows/flows.heal.test.ts +++ b/packages/server/src/flows/flows.heal.test.ts @@ -6,6 +6,7 @@ import { ActionType, AnchorKind, DANGEROUS_ACTION_CONFIRM_ARG, + EventType, FLOW_FILE_VERSION, FLOW_SIGNAL_TIMEOUT_MS, FlowErrorCode, @@ -58,6 +59,7 @@ class FakeSession { constructor( private readonly script: (testid: string) => QueryScript, private readonly actOk = true, + private readonly events: IrisEvent[] = [], ) {} command(name: string, args: Record = {}): Promise { if (name === IrisCommand.QUERY) { @@ -81,7 +83,7 @@ class FakeSession { return Promise.resolve({ kind: 'command_result', id: 'x', ok: true, result: {} }); } eventsSince(): IrisEvent[] { - return []; + return this.events; } onEvent(): () => void { return () => undefined; @@ -91,6 +93,27 @@ class FakeSession { } } +/** A SIGNAL event the success oracle can match (data.name === the flow's success.signal). */ +function signalEvent(name: string): IrisEvent { + return { t: 0, type: EventType.SIGNAL, sessionId: 's', data: { name, data: {} } }; +} + +/** Like renamedSession, but the page also emits `signal` — so a healed flow's consequence holds. */ +function renamedSessionWithSignal( + old: string, + presentTestids: string[], + signal: string, +): FakeSession { + return new FakeSession( + (testid) => + testid === old + ? { elements: [], hint: present(presentTestids) } + : { elements: [el(`e-${testid}`, testid)] }, + true, + [signalEvent(signal)], + ); +} + /** A session where `old` resolves to 0 elements with `present`, and any other testid resolves to 1. 
*/ function renamedSession(old: string, presentTestids: string[]): FakeSession { return new FakeSession((testid) => @@ -175,6 +198,54 @@ describe('FlowStore.heal + iris_flow_heal', () => { expect(steps.some((s) => s.drift !== undefined)).toBe(false); }); + it('heal apply re-verifies the success consequence and writes when it still fires', async () => { + await store.saveFlow({ + ...flowFile('chat', [clickStep('old-id')]), + success: { signal: 'done' }, + }); + const session = renamedSessionWithSignal('old-id', ['new-id'], 'done'); + + const res = await heal(store, session, { flowName: 'chat', apply: true }); + expect(res.status).toBe(HealStatus.HEALED); + expect(res.applied).toBe(true); + expect(res.changed).toEqual([{ step: 0, from: 'old-id', to: 'new-id' }]); + + const loaded = await store.load('chat'); + if (!loaded.ok) throw new Error('expected ok'); + expect(loaded.value.steps[0]?.anchor).toEqual({ kind: AnchorKind.TESTID, value: 'new-id' }); + }); + + it('REFUSES to persist a heal when the rebind breaks the success consequence', async () => { + await store.saveFlow({ + ...flowFile('chat', [clickStep('old-id')]), + success: { signal: 'done' }, + }); + const before = await readFile(flowPath(root, 'chat'), 'utf8'); + // The locator heals (old-id → new-id resolves), but the page never emits the 'done' signal, + // so the healed flow no longer satisfies its intent. The write must be refused. 
+ const session = renamedSession('old-id', ['new-id']); + + const res = await heal(store, session, { flowName: 'chat', apply: true }); + expect(res.status).toBe(HealStatus.CONSEQUENCE_BROKEN); + expect(res.applied).toBe(false); + expect(res.changed).toEqual([]); + expect(res.proposals).toHaveLength(1); // the proposal is still surfaced for a human + expect(res.message).toContain('done'); + + const after = await readFile(flowPath(root, 'chat'), 'utf8'); + expect(after).toEqual(before); // file untouched — never ship a green-but-dead flow + }, 10_000); + + it('heals a flow with no declared success but says the rebind is unverified', async () => { + await store.saveFlow(flowFile('chat', [clickStep('old-id')])); + const session = renamedSession('old-id', ['new-id']); + + const res = await heal(store, session, { flowName: 'chat', apply: true }); + expect(res.status).toBe(HealStatus.HEALED); + expect(res.applied).toBe(true); + expect(res.message.toLowerCase()).toContain('no success consequence'); + }); + it('heal apply:false returns the proposal but does NOT modify the file', async () => { await store.saveFlow(flowFile('chat', [clickStep('old-id')])); const before = await readFile(flowPath(root, 'chat'), 'utf8'); diff --git a/packages/server/src/flows/flows.ts b/packages/server/src/flows/flows.ts index e3268c8..c5468b7 100644 --- a/packages/server/src/flows/flows.ts +++ b/packages/server/src/flows/flows.ts @@ -15,6 +15,7 @@ import type { HealChange, } from '@syrin/iris-protocol'; import { IrisTool } from '../tools/tool-names.js'; +import { applyHealChanges } from './heal.js'; import type { CompiledProgram, RecordedStep } from './recordings.js'; import type { FileSystemPort } from '../project/fs-port.js'; import { flowPath, irisDirPaths, isValidFlowName } from '../project/iris-dir.js'; @@ -235,24 +236,7 @@ export class FlowStore { if (!loaded.ok) return { ok: false, code: loaded.code }; const flow = loaded.value; - const byStep = new Map(); - for (const change of changes) 
byStep.set(change.step, change); - - const applied: HealChange[] = []; - const steps = flow.steps.map((step, index): FlowStep => { - const change = byStep.get(index); - if ( - change === undefined || - step.anchor.kind !== AnchorKind.TESTID || - step.anchor.value !== change.from - ) { - return step; - } - applied.push(change); - return { ...step, anchor: { kind: AnchorKind.TESTID, value: change.to } }; - }); - - const next: FlowFile = { ...flow, steps }; + const { flow: next, applied } = applyHealChanges(flow, changes); await this.#fs.writeFile(flowPath(this.#root, name), this.#serialize(next)); return { ok: true, value: { name, changed: applied } }; } diff --git a/packages/server/src/flows/heal.ts b/packages/server/src/flows/heal.ts index e805120..367f2ac 100644 --- a/packages/server/src/flows/heal.ts +++ b/packages/server/src/flows/heal.ts @@ -1,8 +1,12 @@ import { + AnchorKind, DriftReason, HEAL_CONFIDENCE_MIN, type Drift, + type FlowFile, + type FlowStep, type FlowStepResult, + type HealChange, type HealProposal, } from '@syrin/iris-protocol'; import { editDistance } from './flow-replay.js'; @@ -37,6 +41,34 @@ export function confidenceFor(from: string, to: string): number { return raw; } +/** + * Pure: rewrite the named steps' testid anchors (from→to) and return the new flow plus the changes + * that actually applied. A change whose `from` no longer matches that step's testid anchor is + * skipped (idempotent / defensive), never throwing. Shared by the on-disk writer (FlowStore.heal) + * and the in-memory pre-write verification in iris_flow_heal, so both rewrite identically. 
+ */ +export function applyHealChanges( + flow: FlowFile, + changes: HealChange[], +): { flow: FlowFile; applied: HealChange[] } { + const byStep = new Map(); + for (const change of changes) byStep.set(change.step, change); + const applied: HealChange[] = []; + const steps = flow.steps.map((step, index): FlowStep => { + const change = byStep.get(index); + if ( + change === undefined || + step.anchor.kind !== AnchorKind.TESTID || + step.anchor.value !== change.from + ) { + return step; + } + applied.push(change); + return { ...step, anchor: { kind: AnchorKind.TESTID, value: change.to } }; + }); + return { flow: { ...flow, steps }, applied }; +} + /** Internal: propose with a caller-supplied floor (enables the tunable-confidence API). */ function proposeRebindWith( drift: Drift, From 112c3661ea2555ecbd9cc2a369df0bc4d3dd121f Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:32:45 +0530 Subject: [PATCH 23/33] feat(server): iris_query limit + count_only for token efficiency (M2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A broad query (by=role value=button) on a busy page returns every descriptor, spending agent context on data it may not need. Adds two opt-in trims to the result the AGENT sees: - count_only: return just { count }, drop the element array (when you only need "how many match?"). - limit: keep the first N descriptors; if more matched, carry total + truncated:true so the trim is never silent — the agent narrows with name/scope rather than assuming it saw everything. Pure, result-shape-tolerant paginateQueryResult (non-{elements} results pass through). 548 server tests green (+7). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/tools/query-paginate.test.ts | 71 +++++++++++++++++++ packages/server/src/tools/query-paginate.ts | 35 +++++++++ packages/server/src/tools/tools.ts | 51 +++++++++---- 3 files changed, 144 insertions(+), 13 deletions(-) create mode 100644 packages/server/src/tools/query-paginate.test.ts create mode 100644 packages/server/src/tools/query-paginate.ts diff --git a/packages/server/src/tools/query-paginate.test.ts b/packages/server/src/tools/query-paginate.test.ts new file mode 100644 index 0000000..48e7329 --- /dev/null +++ b/packages/server/src/tools/query-paginate.test.ts @@ -0,0 +1,71 @@ +import { describe, it, expect } from 'vitest'; +import { paginateQueryResult } from './query-paginate.js'; + +function elements(n: number): { ref: string }[] { + return Array.from({ length: n }, (_v, i) => ({ ref: `e${String(i)}` })); +} + +describe('paginateQueryResult', () => { + it('returns the result unchanged when no limit and not count_only', () => { + const r = { elements: elements(3), hint: undefined }; + expect(paginateQueryResult(r, undefined, false)).toBe(r); + }); + + it('count_only drops the elements array and reports the count', () => { + const r = paginateQueryResult({ elements: elements(12) }, undefined, true) as { + count: number; + elements?: unknown; + }; + expect(r.count).toBe(12); + expect('elements' in r).toBe(false); + }); + + it('count_only preserves other fields (e.g. 
hint)', () => { + const r = paginateQueryResult( + { elements: elements(0), hint: { route: '/' } }, + undefined, + true, + ) as { + count: number; + hint: { route: string }; + }; + expect(r.count).toBe(0); + expect(r.hint.route).toBe('/'); + }); + + it('limit truncates and flags total + truncated when over the limit', () => { + const r = paginateQueryResult({ elements: elements(10) }, 3, false) as { + elements: unknown[]; + total: number; + truncated: boolean; + }; + expect(r.elements).toHaveLength(3); + expect(r.total).toBe(10); + expect(r.truncated).toBe(true); + }); + + it('limit is a no-op (no truncated flag) when the count is within the limit', () => { + const r = paginateQueryResult({ elements: elements(2) }, 5, false) as { + elements: unknown[]; + truncated?: boolean; + }; + expect(r.elements).toHaveLength(2); + expect(r.truncated).toBeUndefined(); + }); + + it('count_only takes precedence over limit', () => { + const r = paginateQueryResult({ elements: elements(10) }, 3, true) as { + count: number; + elements?: unknown; + }; + expect(r.count).toBe(10); + expect('elements' in r).toBe(false); + }); + + it('passes non-object / element-less results through untouched', () => { + expect(paginateQueryResult(null, 5, false)).toBeNull(); + expect(paginateQueryResult('err', 5, true)).toBe('err'); + const hintOnly = { hint: { route: '/x' } }; + expect(paginateQueryResult(hintOnly, 5, true)).toBe(hintOnly); + }); +}); diff --git a/packages/server/src/tools/query-paginate.ts b/packages/server/src/tools/query-paginate.ts new file mode 100644 index 0000000..948da16 --- /dev/null +++ b/packages/server/src/tools/query-paginate.ts @@ -0,0 +1,35 @@ +/** + * Token-efficiency for iris_query: a `by=role value=button` on a busy page can return dozens of + * element descriptors the agent must read in full. This trims the result the AGENT sees — the most + * expensive part of a tool call is the bytes that land in its context. 
+ * + * - count_only: drop the elements array entirely, return just `count` (the agent often only needs + * "how many?" — e.g. "are there 3 rows now?"). + * - limit: keep the first N descriptors, report `total` + `truncated:true` so the trim is never + * silent (the agent knows to narrow with name/scope rather than assume it saw everything). + * + * Pure and result-shape-tolerant: anything that is not a `{ elements: [...] }` object passes + * through untouched (a thrown-error envelope, a zero-match hint result, etc.). + */ +export function paginateQueryResult( + result: unknown, + limit: number | undefined, + countOnly: boolean, +): unknown { + if (typeof result !== 'object' || result === null) return result; + const record = result as Record; + const elements = record['elements']; + if (!Array.isArray(elements)) return result; + const total = elements.length; + + if (countOnly) { + const { elements: _dropped, ...rest } = record; + return { ...rest, count: total }; + } + + if (limit !== undefined && limit >= 0 && total > limit) { + return { ...record, elements: elements.slice(0, limit), total, truncated: true }; + } + + return result; +} diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 7067943..223cd50 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -32,6 +32,7 @@ import { applyEventBudget, costHint, withSizeCost } from '../session/output-budg import { applySnapshotDelta, SnapshotCache } from './snapshot-delta.js'; import { selectPath, capDepth } from '../session/state-select.js'; import { asString, asNumber, asRecord, parseInteractive } from './tools-helpers.js'; +import { paginateQueryResult } from './query-paginate.js'; import type { FileSystemPort } from '../project/fs-port.js'; import type { FlowStore } from '../flows/flows.js'; import type { ProjectStore } from '../project/project-store.js'; @@ -359,7 +360,7 @@ export const TOOLS: ToolDef[] = [ { name: IrisTool.QUERY, 
description: - 'Find elements by Testing-Library semantics. Pass `by` (role|text|label|placeholder|testid|alt) and `value` (the query string). Returns matching refs + descriptors + visibility. On zero matches, also returns hint:{ route, presentTestids[], knownEmptyState } so you can distinguish an empty state from a missing element WITHOUT taking a snapshot.', + 'Find elements by Testing-Library semantics. Pass `by` (role|text|label|placeholder|testid|alt) and `value` (the query string). Returns matching refs + descriptors + visibility. Pass `limit` to cap descriptors (broad role queries can be large) or `count_only:true` for just the match count — both cut tokens. On zero matches, also returns hint:{ route, presentTestids[], knownEmptyState } so you can distinguish an empty state from a missing element WITHOUT taking a snapshot.', inputSchema: { by: z.string().describe('Query strategy: role | text | label | placeholder | testid | alt'), value: z @@ -377,19 +378,39 @@ export const TOOLS: ToolDef[] = [ .string() .optional() .describe('CSS selector or element ref to restrict the search to a subtree.'), + limit: z + .number() + .optional() + .describe( + 'Cap the returned descriptors to the first N (cuts tokens on broad queries). If more matched, the result carries total + truncated:true so the trim is never silent — narrow with name/scope.', + ), + count_only: z + .boolean() + .optional() + .describe( + 'Return just { count } (no element descriptors) — use when you only need "how many match?" 
and not their refs.', + ), ...sessionIdShape, }, outputSchema: { - elements: z.array( - z.object({ - ref: z.string(), - role: z.string(), - name: z.string(), - value: z.string().optional(), - states: z.array(z.string()), - visible: z.boolean(), - }), - ), + elements: z + .array( + z.object({ + ref: z.string(), + role: z.string(), + name: z.string(), + value: z.string().optional(), + states: z.array(z.string()), + visible: z.boolean(), + }), + ) + .optional(), + count: z.number().optional().describe('Match count — present when count_only is set.'), + total: z + .number() + .optional() + .describe('Total matches before `limit` truncation — present only when truncated.'), + truncated: z.boolean().optional().describe('True when `limit` dropped some matches.'), hint: z .object({ route: z.string(), @@ -403,7 +424,7 @@ export const TOOLS: ToolDef[] = [ cost: z .object({ bytes: z.number(), tokens: z.number() }) .optional() - .describe('Estimated size of this result — narrow with `name`/`scope` if large.'), + .describe('Estimated size of this result — narrow with `name`/`scope`/`limit` if large.'), }, handler: (deps, args) => commandOrThrow(deps, asString(args['sessionId']), IrisCommand.QUERY, { @@ -411,7 +432,11 @@ export const TOOLS: ToolDef[] = [ value: args['value'], name: args['name'], scope: args['scope'], - }).then(withSizeCost), + }).then((result) => + withSizeCost( + paginateQueryResult(result, asNumber(args['limit']), args['count_only'] === true), + ), + ), }, { name: IrisTool.INSPECT, From c8278dbdb65027a3a745f968de88d33bc0115746 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:37:53 +0530 Subject: [PATCH 24/33] feat(server): limit + cost hint on iris_network and iris_console (M2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both tools returned every matching call/log uncapped and carried no size hint — unlike iris_observe/iris_query/iris_snapshot. 
A buggy page spams the console and a wide `since` window returns many calls, both spending agent context. Adds a `limit` (most-recent-N via the already-tested applyEventBudget, reporting total + droppedOldest so a trim is never silent) and a `cost:{bytes,tokens}` hint so the agent can self-budget — consistent with the other read tools. 551 server tests green (+3). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/tools/tools.near-miss.test.ts | 50 +++++++++++++++++++ packages/server/src/tools/tools.ts | 50 ++++++++++++++++--- 2 files changed, 92 insertions(+), 8 deletions(-) diff --git a/packages/server/src/tools/tools.near-miss.test.ts b/packages/server/src/tools/tools.near-miss.test.ts index dfa4de9..110856e 100644 --- a/packages/server/src/tools/tools.near-miss.test.ts +++ b/packages/server/src/tools/tools.near-miss.test.ts @@ -75,3 +75,53 @@ describe('near-miss on iris_network / iris_console', () => { expect(r.hint?.byLevel).toEqual({ log: 1, warn: 1, error: 0 }); }); }); + +describe('token budget on iris_network / iris_console', () => { + it('iris_network: limit keeps the most recent N, reporting total + droppedOldest + cost', async () => { + const deps = depsWith([ + ev(EventType.NET_REQUEST, { url: '/1', status: 200 }), + ev(EventType.NET_REQUEST, { url: '/2', status: 200 }), + ev(EventType.NET_REQUEST, { url: '/3', status: 200 }), + ]); + const r = (await tool(IrisTool.NETWORK).handler(deps, { limit: 2 })) as { + calls: { data: { url: string } }[]; + total?: number; + droppedOldest?: number; + cost?: { bytes: number }; + }; + expect(r.calls.map((c) => c.data.url)).toEqual(['/2', '/3']); + expect(r.total).toBe(3); + expect(r.droppedOldest).toBe(1); + expect(r.cost?.bytes).toBeGreaterThan(0); + }); + + it('iris_network: no limit returns all matches + a cost hint, no total/droppedOldest', async () => { + const deps = depsWith([ev(EventType.NET_REQUEST, { url: '/1', status: 200 })]); + const r = (await tool(IrisTool.NETWORK).handler(deps, {})) as 
{ + calls: unknown[]; + total?: number; + droppedOldest?: number; + cost?: { bytes: number }; + }; + expect(r.calls).toHaveLength(1); + expect(r.total).toBeUndefined(); + expect(r.droppedOldest).toBeUndefined(); + expect(r.cost?.bytes).toBeGreaterThan(0); + }); + + it('iris_console: limit keeps the most recent N entries', async () => { + const deps = depsWith([ + ev(EventType.CONSOLE_ERROR, { message: 'a' }), + ev(EventType.CONSOLE_ERROR, { message: 'b' }), + ev(EventType.CONSOLE_ERROR, { message: 'c' }), + ]); + const r = (await tool(IrisTool.CONSOLE).handler(deps, { level: 'error', limit: 1 })) as { + logs: { data: { message: string } }[]; + total?: number; + droppedOldest?: number; + }; + expect(r.logs.map((l) => l.data.message)).toEqual(['c']); + expect(r.total).toBe(3); + expect(r.droppedOldest).toBe(2); + }); +}); diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 223cd50..928fa69 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -861,11 +861,23 @@ export const TOOLS: ToolDef[] = [ .describe('HTTP method filter: GET | POST | PUT | DELETE | PATCH etc.'), urlContains: z.string().optional().describe('Substring that the request URL must contain.'), status: z.number().optional().describe('HTTP status code filter (e.g. 
200, 404, 500).'), + limit: z + .number() + .optional() + .describe( + 'Keep only the most recent N matching calls (older are dropped and counted in droppedOldest) — cuts tokens on a wide window.', + ), ...sessionIdShape, }, outputSchema: { calls: z.array(z.unknown()), + total: z + .number() + .optional() + .describe('Total matches before `limit` — present only when capped.'), + droppedOldest: z.number().optional().describe('How many older matches `limit` dropped.'), hint: z.object({ totalInWindow: z.number(), present: z.array(z.string()) }).optional(), + cost: z.object({ bytes: z.number(), tokens: z.number() }).optional(), }, handler: (deps, args) => { const session = deps.sessions.resolve(asString(args['sessionId'])); @@ -873,13 +885,19 @@ export const TOOLS: ToolDef[] = [ const method = asString(args['method']); const urlContains = asString(args['urlContains']); const status = asNumber(args['status']); + const limit = asNumber(args['limit']); const allNet = session.eventsSince(since).filter((e) => e.type === EventType.NET_REQUEST); - const calls = allNet.filter((e) => matchNet(e, method, urlContains, status)); + const matched = allNet.filter((e) => matchNet(e, method, urlContains, status)); // zero-match filter returns what DID fire, not a bare []. - if (calls.length === 0 && allNet.length > 0) { - return Promise.resolve({ calls, hint: netEmptyHint(allNet) }); + if (matched.length === 0 && allNet.length > 0) { + return Promise.resolve(withSizeCost({ calls: matched, hint: netEmptyHint(allNet) })); } - return Promise.resolve({ calls }); + const { events: calls, droppedOldest } = applyEventBudget(matched, limit); + return Promise.resolve( + withSizeCost( + droppedOldest > 0 ? 
{ calls, total: matched.length, droppedOldest } : { calls }, + ), + ); }, }, { @@ -895,23 +913,39 @@ export const TOOLS: ToolDef[] = [ .number() .optional() .describe('Cursor from a prior iris_act — scopes the query to log entries after that act.'), + limit: z + .number() + .optional() + .describe( + 'Keep only the most recent N matching entries (older are dropped and counted in droppedOldest) — cuts tokens when a page spams the console.', + ), ...sessionIdShape, }, outputSchema: { logs: z.array(z.unknown()), + total: z + .number() + .optional() + .describe('Total matches before `limit` — present only when capped.'), + droppedOldest: z.number().optional().describe('How many older matches `limit` dropped.'), hint: z.object({ totalInWindow: z.number(), byLevel: z.record(z.number()) }).optional(), + cost: z.object({ bytes: z.number(), tokens: z.number() }).optional(), }, handler: (deps, args) => { const session = deps.sessions.resolve(asString(args['sessionId'])); const since = asNumber(args['since']) ?? 0; const level = asString(args['level']); + const limit = asNumber(args['limit']); const allConsole = session.eventsSince(since).filter(isConsoleEvent); - const logs = allConsole.filter((e) => matchConsole(e, level)); + const matched = allConsole.filter((e) => matchConsole(e, level)); // zero matches at this level → report what levels ARE present (not a bare []). - if (logs.length === 0 && allConsole.length > 0) { - return Promise.resolve({ logs, hint: consoleEmptyHint(allConsole) }); + if (matched.length === 0 && allConsole.length > 0) { + return Promise.resolve(withSizeCost({ logs: matched, hint: consoleEmptyHint(allConsole) })); } - return Promise.resolve({ logs }); + const { events: logs, droppedOldest } = applyEventBudget(matched, limit); + return Promise.resolve( + withSizeCost(droppedOldest > 0 ? 
{ logs, total: matched.length, droppedOldest } : { logs }), + ); }, }, { From 63e6edea0c1213de657dce69404279e6a0fce0b8 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:42:12 +0530 Subject: [PATCH 25/33] =?UTF-8?q?feat(server):=20domain=20model=20surfaces?= =?UTF-8?q?=20mustHold=20=E2=80=94=20what=20must=20hold=20per=20flow=20(M4?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit iris_domain told an agent that a flow asserts SOMETHING (boolean) but not WHAT its success depends on. The M4 bar is that the agent can answer "what are the critical flows and what must hold for each?" from Iris alone — the "what must hold" was missing. Each DomainFlowSummary now carries mustHold: the human label of the flow's success consequence (signal name / net URL), or undefined when the flow declares no success (then asserts:false — it tests nothing observable). 552 server tests green (+1). Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/server/src/domain/domain-model.test.ts | 15 +++++++++++++++ packages/server/src/domain/domain-model.ts | 9 +++++++++ packages/server/src/domain/domain-tools.ts | 8 +++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/packages/server/src/domain/domain-model.test.ts b/packages/server/src/domain/domain-model.test.ts index a086d7c..ce6e858 100644 --- a/packages/server/src/domain/domain-model.test.ts +++ b/packages/server/src/domain/domain-model.test.ts @@ -41,6 +41,21 @@ describe('buildDomainModel', () => { expect(m.coverage.asserted).toBe(1); }); + it('surfaces mustHold — what must hold for each flow — from its success consequence', () => { + const m = buildDomainModel( + [ + flow('checkout', [testidStep('pay')], { signal: 'order:placed' }), + flow('browse', [testidStep('nav')]), // no success declared + ], + contract(), + ); + const checkout = m.flows.find((f) => f.name === 'checkout'); + const browse = m.flows.find((f) => f.name === 'browse'); + 
expect(checkout?.mustHold).toBe('order:placed'); // the consequence that must hold + expect(browse?.mustHold).toBeUndefined(); // tests nothing observable + expect(browse?.asserts).toBe(false); + }); + it('flags declared signals that NO flow asserts (untested intent — the differentiator)', () => { const m = buildDomainModel( [flow('checkout', [testidStep('pay')], { signal: 'order:placed' })], diff --git a/packages/server/src/domain/domain-model.ts b/packages/server/src/domain/domain-model.ts index 4ab750d..81c3035 100644 --- a/packages/server/src/domain/domain-model.ts +++ b/packages/server/src/domain/domain-model.ts @@ -21,6 +21,7 @@ import { type RunRecord, } from '@syrin/iris-protocol'; import { classifyFlowAssertions, FlowAssertionGrade } from '../flows/flow-classify.js'; +import { successLabel } from '../flows/flow-success.js'; import { flowRisk, latestRun, rankByRisk, RiskLevel, type FlowRisk } from './flow-risk.js'; export interface DomainFlowSummary { @@ -29,6 +30,13 @@ export interface DomainFlowSummary { grade: string; /** True when the flow asserts a real consequence (signal/net), not just presence. */ asserts: boolean; + /** + * The success consequence that MUST hold for this flow to count as passing — the human-readable + * label of flow.success (e.g. a signal name or net URL). Undefined when the flow declares no + * success condition (then `asserts` is false: it tests nothing observable). This is the + * "what must hold for each flow" an agent needs before testing. 
+ */ + mustHold?: string; warning?: string; signals: string[]; testids: string[]; @@ -106,6 +114,7 @@ export function buildDomainModel( signals: flowSignals(flow), testids: flowTestids(flow), }; + if (flow.success !== undefined) summary.mustHold = successLabel(flow.success); if (c.warning !== undefined) summary.warning = c.warning; if (hasHistory) summary.risk = flowRisk(c.grade, latestRun(flow.name, runs)); return summary; diff --git a/packages/server/src/domain/domain-tools.ts b/packages/server/src/domain/domain-tools.ts index aed2a46..dce332a 100644 --- a/packages/server/src/domain/domain-tools.ts +++ b/packages/server/src/domain/domain-tools.ts @@ -15,7 +15,7 @@ export const DOMAIN_TOOLS: ToolDef[] = [ { name: IrisTool.DOMAIN, description: - 'Read the app domain model BEFORE testing: every saved flow with its assertion grade + the anchors/signals it exercises, plus GAPS — declared signals/testids that NO flow asserts (untested intent), and flows that assert no observable consequence. Use this to decide what to test and where the real risk is, instead of crawling the whole app. Reads .iris/flows/ + .iris/contract.json (no browser needed).', + 'Read the app domain model BEFORE testing: every saved flow with its assertion grade, the consequence that MUST hold for it (mustHold = what it actually tests), the anchors/signals it exercises, plus GAPS — declared signals/testids that NO flow asserts (untested intent), and flows that assert no observable consequence. Use this to decide what to test and where the real risk is, instead of crawling the whole app. 
Reads .iris/flows/ + .iris/contract.json (no browser needed).', inputSchema: {}, outputSchema: { flowCount: z.number(), @@ -25,6 +25,12 @@ export const DOMAIN_TOOLS: ToolDef[] = [ steps: z.number(), grade: z.string(), asserts: z.boolean(), + mustHold: z + .string() + .optional() + .describe( + 'The success consequence that must hold for this flow (what it actually tests).', + ), warning: z.string().optional(), signals: z.array(z.string()), testids: z.array(z.string()), From c2bd0c8f5ea4b7392b04edc48272ebec421ee2b5 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:45:47 +0530 Subject: [PATCH 26/33] docs(changelog): capture the Unreleased deterministic-waiting, token-control, safe-heal, and domain features Records this session's user-facing changes under [Unreleased]: the settled predicate + act_and_wait auto-settle (M3), iris_query/network/console token controls + cost hints (M2), iris_domain mustHold (M4), consequence-verified flow_heal (M5), and the observer teardown-identity fix. (Also folds in a pre-existing cosmetic prettier reflow of the 0.5.0 section.) Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 52 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c157d7d..4ffc522 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,40 @@ All notable changes to **`@syrin/iris`** are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + +- **Deterministic waiting — the `settled` predicate** (`packages/server`). A new predicate + `{ kind: "settled", quietMs }` passes once network/DOM/animation activity has been quiet for + `quietMs` (default 500ms). Usable in `iris_wait_for` and `iris_assert`, and composable inside + `allOf` with the consequence you expect. 
Replaces fixed sleeps — the #1 cause of flaky agent tests. +- **`iris_act_and_wait` auto-settle** (`packages/server`). Omit `until` and the tool waits for the page + to settle instead of requiring a predicate — "act, then wait for quiet" is now a single zero-config + call, the documented alternative to a sleep. +- **`iris_query` token controls** (`packages/server`) — `limit` (cap returned descriptors; reports + `total` + `truncated` so a trim is never silent) and `count_only` (return just the match count). +- **`iris_network` / `iris_console` token controls** (`packages/server`) — `limit` (keep the most + recent N matches, reporting `total` + `droppedOldest`) and a `cost:{bytes,tokens}` hint, matching the + other read tools so the agent can self-budget everywhere. +- **`iris_domain` `mustHold` per flow** (`packages/server`) — each flow now reports the success + consequence that must hold for it (signal name / net URL), so an agent can answer "what are the + critical flows and what must hold for each?" from the domain model alone. + +### Changed + +- **Self-healing now verifies the consequence before persisting** (`packages/server`). `iris_flow_heal` + with `apply:true` re-replays the healed flow and re-asserts its success consequence; if a rebound + locator resolves but the flow no longer satisfies its intent, the write is **refused** + (`status:consequence_broken`, file untouched). It heals the locator, never the intent. + +### Fixed + +- **Browser observers fully restore patched globals on teardown** (`packages/browser`). The network, + route, and console observers stored a bound copy and assigned it back on teardown, so `window.fetch` + / `history.pushState` / `console.*` were never restored to their original identity. They now keep the + true original for restore and a bound copy only for invocation. + ## [0.5.0] — 2026-06-15 ### Added @@ -25,24 +59,18 @@ All notable changes to **`@syrin/iris`** are documented here. 
The format follows dev-only HUD overlay that the agent can control: `iris_narrate` shows a caption, `iris_highlight` draws a ring around any element. The HUD is excluded from snapshots and tree-shaken in production. - **Unified `SKILL.md` at repo root** — a single skill file auto-detects mode: setup wizard on first - run (no `.iris.json`), live-app testing on every run after. Covers Claude Code, OpenCode, Codex CLI, - Cursor, Windsurf, VS Code, and Zed MCP config formats. + run (no `.iris.json`), live-app testing on every run after. Covers Claude Code, OpenCode, Codex CLI, Cursor, Windsurf, VS Code, and Zed MCP config formats. - **`.iris.json` project config** — written after first-run setup; persists `port`, `headed`, `framework`, and `harnesses` so subsequent runs need zero questions. -- **`dev:iris` script** in `apps/demo` — second Vite dev server on port 4310, isolated from the user's - normal dev port. +- **`dev:iris` script** in `apps/demo` — second Vite dev server on port 4310, isolated from the user's normal dev port. ### Fixed - **All-throttled session auto-selection** (`packages/server`). When every connected tab is hidden - (e.g. user is in VS Code with Chrome on another desktop), `SessionManager.resolve()` now picks the - session with the freshest heartbeat instead of throwing `"multiple sessions connected"`. -- **Presenter HUD shows on bridge connect** — the overlay now mounts as soon as the SDK connects to the - bridge, not only after the first `iris_narrate` call. -- **`iris_narrate` MCP schema validation** — relaxed the output schema so the tool no longer rejects - responses from narration calls. -- **`iris_inspect` / `iris_clock` output schemas** — relaxed to pass through extra fields instead of - stripping them, fixing spurious validation errors. + (e.g. user is in VS Code with Chrome on another desktop), `SessionManager.resolve()` now picks the session with the freshest heartbeat instead of throwing `"multiple sessions connected"`. 
+- **Presenter HUD shows on bridge connect** — the overlay now mounts as soon as the SDK connects to the bridge, not only after the first `iris_narrate` call. +- **`iris_narrate` MCP schema validation** — relaxed the output schema so the tool no longer rejects responses from narration calls. +- **`iris_inspect` / `iris_clock` output schemas** — relaxed to pass through extra fields instead of stripping them, fixing spurious validation errors. --- From 131ddeb11733e6f2ca612d33e006576d4c6dde2f Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:51:06 +0530 Subject: [PATCH 27/33] feat(server): iris_assert nudges presence-only assertions toward a consequence (rigor) The success-oracle thesis ("green means the feature worked") was enforced for saved flows (classifyFlowAssertions) but not for ad-hoc iris_assert. An agent that asserts only { element }/{ text } presence gets a green verdict even though a healed-but-wrong element or a stale render can satisfy it (Fowler, Assertion-Free Testing; Dodds, Make Your Test Fail). iris_assert now attaches `advice` to a PASSING presence-only assertion, steering toward a { signal }/{ net } consequence (or an allOf with one). Never on a failing verdict or when a consequence is already asserted. Pure isPresenceOnlyAssertion walks allOf/anyOf/not. 563 server tests green (+11). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../server/src/tools/assert-grade.test.ts | 60 ++++++++++++++ packages/server/src/tools/assert-grade.ts | 61 ++++++++++++++ .../src/tools/tools.assert-advice.test.ts | 79 +++++++++++++++++++ packages/server/src/tools/tools.ts | 13 ++- 4 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 packages/server/src/tools/assert-grade.test.ts create mode 100644 packages/server/src/tools/assert-grade.ts create mode 100644 packages/server/src/tools/tools.assert-advice.test.ts diff --git a/packages/server/src/tools/assert-grade.test.ts b/packages/server/src/tools/assert-grade.test.ts new file mode 100644 index 0000000..f9ff66e --- /dev/null +++ b/packages/server/src/tools/assert-grade.test.ts @@ -0,0 +1,60 @@ +import { describe, it, expect } from 'vitest'; +import { isPresenceOnlyAssertion } from './assert-grade.js'; +import type { Predicate } from '../events/predicate.js'; + +describe('isPresenceOnlyAssertion', () => { + it('flags a bare element predicate', () => { + expect(isPresenceOnlyAssertion({ kind: 'element', query: { role: 'button' } })).toBe(true); + }); + + it('flags a bare text predicate', () => { + expect(isPresenceOnlyAssertion({ kind: 'text', contains: 'Saved' })).toBe(true); + }); + + it('does NOT flag a signal consequence', () => { + expect(isPresenceOnlyAssertion({ kind: 'signal', name: 'order:placed' })).toBe(false); + }); + + it('does NOT flag a net consequence', () => { + expect(isPresenceOnlyAssertion({ kind: 'net', urlContains: '/api/order', status: 200 })).toBe( + false, + ); + }); + + it('does NOT flag presence when a consequence is allOf-ed in', () => { + const p: Predicate = { + kind: 'allOf', + predicates: [ + { kind: 'element', query: { text: 'Done' } }, + { kind: 'signal', name: 'order:placed' }, + ], + }; + expect(isPresenceOnlyAssertion(p)).toBe(false); + }); + + it('flags an allOf of only presence checks', () => { + const p: Predicate = { + kind: 'allOf', + predicates: [ + { kind: 
'element', query: { role: 'dialog' } }, + { kind: 'text', contains: 'Welcome' }, + ], + }; + expect(isPresenceOnlyAssertion(p)).toBe(true); + }); + + it('does NOT flag non-presence predicates (route / settled / console)', () => { + expect(isPresenceOnlyAssertion({ kind: 'route', pathname: '/success' })).toBe(false); + expect(isPresenceOnlyAssertion({ kind: 'settled' })).toBe(false); + expect(isPresenceOnlyAssertion({ kind: 'console', level: 'error', absent: true })).toBe(false); + }); + + it('flags a negated presence check (still presence-shaped)', () => { + expect( + isPresenceOnlyAssertion({ + kind: 'not', + predicate: { kind: 'element', query: { text: 'x' } }, + }), + ).toBe(true); + }); +}); diff --git a/packages/server/src/tools/assert-grade.ts b/packages/server/src/tools/assert-grade.ts new file mode 100644 index 0000000..ca110b8 --- /dev/null +++ b/packages/server/src/tools/assert-grade.ts @@ -0,0 +1,61 @@ +/** + * Grade a single assertion predicate as a CONSEQUENCE or a mere PRESENCE check — the same distinction + * classifyFlowAssertions applies to saved flows, now applied to ad-hoc iris_assert calls. + * + * Why (grounded, same sources as flow-classify): a test that only checks an element is present is + * weak — a locator healed to the wrong element, or a stale render, can satisfy it while the feature + * is broken (Fowler, *Assertion-Free Testing*; Dodds, *Make Your Test Fail*). A signal/net assertion + * verifies an observable consequence a wrong element cannot fake. When an agent asserts only + * presence, we pass the verdict but nudge it toward a consequence — the success-oracle thesis. + * + * Pure: no IO, no clock. + */ + +import type { Predicate } from '../events/predicate.js'; + +export const PRESENCE_ONLY_ADVICE = + 'This predicate only checks element/text presence, not an observable consequence. A locator healed to the wrong element (or a stale render) can satisfy it while the feature is broken. 
Prefer a { signal } or { net } assertion — or allOf it with one — so green means the feature actually worked.'; + +interface PredicateKinds { + /** A signal/net leaf is present — proves the app did something a wrong element cannot fake. */ + consequence: boolean; + /** An element/text leaf is present — a weak presence check. */ + presence: boolean; +} + +function walk(predicate: Predicate): PredicateKinds { + switch (predicate.kind) { + case 'signal': + case 'net': + return { consequence: true, presence: false }; + case 'element': + case 'text': + return { consequence: false, presence: true }; + case 'route': + case 'console': + case 'animation': + case 'settled': + // Observable but not the weak presence pattern we nudge — neither flags the advice. + return { consequence: false, presence: false }; + case 'allOf': + case 'anyOf': { + const subs = predicate.predicates.map(walk); + return { + consequence: subs.some((s) => s.consequence), + presence: subs.some((s) => s.presence), + }; + } + case 'not': + return walk(predicate.predicate); + } +} + +/** + * True when the predicate asserts ONLY element/text presence with no signal/net consequence anywhere + * — the weak pattern worth nudging. A predicate that mixes in a consequence, or that checks something + * other than presence (route/console/settled), is not flagged. 
+ */ +export function isPresenceOnlyAssertion(predicate: Predicate): boolean { + const kinds = walk(predicate); + return kinds.presence && !kinds.consequence; +} diff --git a/packages/server/src/tools/tools.assert-advice.test.ts b/packages/server/src/tools/tools.assert-advice.test.ts new file mode 100644 index 0000000..fea80ef --- /dev/null +++ b/packages/server/src/tools/tools.assert-advice.test.ts @@ -0,0 +1,79 @@ +import { describe, it, expect } from 'vitest'; +import { + EventType, + IrisCommand, + SessionState, + type CommandResult, + type IrisEvent, +} from '@syrin/iris-protocol'; +import { TOOLS, type ToolDeps } from './tools.js'; +import { IrisTool } from './tool-names.js'; +import type { Session, SessionManager } from '../session/session.js'; + +/** A session whose MATCH answers `matched`, and whose buffer is a fixed event list. */ +function depsWith(opts: { matched?: boolean; events?: IrisEvent[] }): ToolDeps { + const matchResult = { + matched: opts.matched ?? false, + count: opts.matched === true ? 1 : 0, + elements: + opts.matched === true + ? [{ ref: 'e1', role: 'button', name: 'X', states: [], visible: true }] + : [], + }; + const stub: Partial<Session> = { + id: 'demo', + command: (name: string): Promise<CommandResult> => + Promise.resolve({ + kind: 'command_result', + id: 'c', + ok: true, + result: name === IrisCommand.MATCH ? matchResult : {}, + }), + eventsSince: () => opts.events ?? 
[], + lastActCursor: () => 0, + health: () => ({ lastSeenMs: 0, throttled: false, focused: true }), + getState: () => SessionState.ACTIVE, + drainInbox: () => [], + }; + const sessions: Partial<SessionManager> = { resolve: () => stub as Session }; + return { sessions: sessions as SessionManager } as ToolDeps; +} + +function assertTool() { + const t = TOOLS.find((x) => x.name === IrisTool.ASSERT); + if (t === undefined) throw new Error('no iris_assert tool'); + return t; +} + +const signal = (name: string): IrisEvent => ({ + t: 1, + type: EventType.SIGNAL, + sessionId: 's', + data: { name, data: {} }, +}); + +describe('iris_assert presence-only advice', () => { + it('attaches advice to a PASSING presence-only (element) assertion', async () => { + const r = (await assertTool().handler(depsWith({ matched: true }), { + predicate: { kind: 'element', query: { role: 'button' } }, + })) as { pass: boolean; advice?: string }; + expect(r.pass).toBe(true); + expect(r.advice).toContain('consequence'); + }); + + it('does NOT attach advice to a signal consequence assertion', async () => { + const r = (await assertTool().handler(depsWith({ events: [signal('order:placed')] }), { + predicate: { kind: 'signal', name: 'order:placed' }, + })) as { pass: boolean; advice?: string }; + expect(r.pass).toBe(true); + expect(r.advice).toBeUndefined(); + }); + + it('does NOT attach advice to a FAILING presence assertion (moot)', async () => { + const r = (await assertTool().handler(depsWith({ matched: false }), { + predicate: { kind: 'element', query: { role: 'button' } }, + })) as { pass: boolean; advice?: string }; + expect(r.pass).toBe(false); + expect(r.advice).toBeUndefined(); + }); +}); diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 928fa69..6a4f0b1 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -33,6 +33,7 @@ import { applySnapshotDelta, SnapshotCache } from './snapshot-delta.js'; import { selectPath, capDepth } 
from '../session/state-select.js'; import { asString, asNumber, asRecord, parseInteractive } from './tools-helpers.js'; import { paginateQueryResult } from './query-paginate.js'; +import { isPresenceOnlyAssertion, PRESENCE_ONLY_ADVICE } from './assert-grade.js'; import type { FileSystemPort } from '../project/fs-port.js'; import type { FlowStore } from '../flows/flows.js'; import type { ProjectStore } from '../project/project-store.js'; @@ -806,7 +807,7 @@ export const TOOLS: ToolDef[] = [ { name: IrisTool.ASSERT, description: - 'Evaluate a predicate (optionally waiting up to timeout_ms). Returns { pass, evidence, failureReason? }. The end of every verify loop. By default it only counts events since your last act, so a stale buffered signal can never fake a pass; pass `since` (an observe/act cursor) to set the window explicitly.', + 'Evaluate a predicate (optionally waiting up to timeout_ms). Returns { pass, evidence, failureReason? }. The end of every verify loop. Prefer a { signal } or { net } consequence over { element }/{ text } presence — a passing presence-only assertion returns `advice` because a wrong/healed element can fake it. By default it only counts events since your last act, so a stale buffered signal can never fake a pass; pass `since` (an observe/act cursor) to set the window explicitly.', inputSchema: { predicate: PredicateSchema.describe( 'Predicate to evaluate: { signal }, { net }, { element } or a combination.', @@ -827,6 +828,10 @@ export const TOOLS: ToolDef[] = [ pass: z.boolean(), evidence: z.unknown().optional(), failureReason: z.string().optional(), + advice: z + .string() + .optional() + .describe('Present on a PASSING presence-only assertion — nudges toward a consequence.'), session: z .object({ lastSeenMs: z.number(), throttled: z.boolean(), focused: z.boolean() }) .optional(), @@ -841,7 +846,11 @@ export const TOOLS: ToolDef[] = [ timeout > 0 ? 
await waitForPredicate(session, predicate, timeout, since) : await evaluatePredicate(session, predicate, since); - return withControl(session, { ...verdict, ...healthEnvelope(session) }); + // A GREEN presence-only assertion is the dangerous case (a wrong element can fake it) — nudge + // toward a consequence. Never on a failing verdict (moot) or when a signal/net is asserted. + const advice = + verdict.pass && isPresenceOnlyAssertion(predicate) ? { advice: PRESENCE_ONLY_ADVICE } : {}; + return withControl(session, { ...verdict, ...advice, ...healthEnvelope(session) }); }, }, { From 3284f0aed2828b0ba428c7d1ac06891242ec8ce0 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:54:12 +0530 Subject: [PATCH 28/33] docs(cheatsheet): teach deterministic waiting, consequence-first asserts, and read caps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces tonight's habits in the canonical quick-reference so they're discoverable to agents and humans, not just buried in tool descriptions: - Never sleep — iris_act_and_wait with no `until` settles; { kind:"settled" } waits for network/DOM/animation idle; allOf a consequence with settled. - Assert a consequence ({signal}/{net}) over presence ({element}/{text}); a passing presence-only assert returns `advice`. - Cap broad reads: iris_query limit/count_only, iris_network/iris_console limit + cost. - flow_heal refuses to write a rebind that breaks the success consequence. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/agent-cheatsheet.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/agent-cheatsheet.md b/docs/agent-cheatsheet.md index 6d78d3f..29fb8b5 100644 --- a/docs/agent-cheatsheet.md +++ b/docs/agent-cheatsheet.md @@ -23,6 +23,19 @@ pointer sequence on the element (no coordinate gesture for the HUD to intercept) `occluded:true` when something covers the target, and stays synthetic even with CDP configured (use `args:{ native:true }` for a trusted native click). +**Never sleep — wait deterministically.** Fixed sleeps are the #1 cause of flaky agent tests. Instead: + +- `iris_act_and_wait({ ref, action })` with **no `until`** waits for the page to _settle_ (network + + DOM + animation idle) before returning — the one-call replacement for "click then sleep 500ms". +- Need to wait without acting? `iris_wait_for({ predicate: { kind: "settled", quietMs } })`. +- Waiting for a specific outcome? Pass that consequence as the predicate (`{ signal }` / `{ net }`), + or `allOf` it with `{ kind: "settled" }` to wait for both the event _and_ the page going quiet. + +**Assert a consequence, not just presence.** `{ signal }` / `{ net }` prove the feature actually did +something; `{ element }` / `{ text }` only prove something is on screen — which a stale render or a +locator healed to the wrong element can fake. A _passing_ presence-only `iris_assert` returns +`advice` nudging you to a consequence; heed it on anything that matters. + ## The 4-layer cross-check — never trust a green the state contradicts A claim is real only when the layers agree. Check more than the UI: @@ -104,10 +117,16 @@ Both need a **driven browser** (`iris drive ` / `IRIS_CDP_URL`); without on (`mode:delta`/`unchanged`), ~99% fewer tokens than a full re-snapshot and no stale tree to mis-read. Every snapshot/query result carries `cost:{ bytes, tokens }` — re-scope before reading if it's large. 
+- **Cap broad reads.** `iris_query` takes `limit` (caps descriptors; reports `total`/`truncated`) and + `count_only` (just the match count). `iris_network` / `iris_console` take `limit` (most-recent-N, + reports `droppedOldest`) and carry the same `cost` hint — so a busy page or wide window never floods + your context unnoticed. - **A saved flow tells you if it's a real test.** `iris_flow_save` returns `assertions.grade` (`asserted` / `presence-only` / `assertion-free`); if it's not `asserted`, add a consequence (`iris_annotate` assert-signal/assert-net or a success-state) so it can't pass while broken. On - replay, an ambiguous heal (two testids tie) is surfaced, never auto-applied. + replay, an ambiguous heal (two testids tie) is surfaced, never auto-applied — and an `apply` heal + re-replays the rebound flow and **refuses to write** if the success consequence no longer fires + (`status:consequence_broken`): it heals the locator, never the intent. - **Predicate schema is not bloated.** The recursive predicate DSL used by `iris_assert` / `iris_wait_for` / `iris_act_and_wait` is **factored, not inlined**: when converted to the JSON Schema MCP sends, the predicate body is emitted **once** (~2.7k chars ≈ **~685 tokens** From 80f2d8071c4c84fe3869ee859062ca5a1641336f Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 15:58:45 +0530 Subject: [PATCH 29/33] fix(server): settled predicate ignores ambient dom.text/animation churn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review caught a real bug in tonight's settled predicate: it counted dom.text and animation events as activity, so any page with an ambient effect — a count-up counter, spinner, or pulsing dot — emits an event every frame and NEVER goes quiet, making act->settle time out at 4s on a healthy page. Observed live in iter 18: one login flooded 319 dom.text events from the dashboard's count-up animations. 
Same trap that deprecated Playwright's networkidle. SETTLE_ACTIVITY is now network + structural DOM (added/removed/attr) only — the real "app is doing work" signals. Ambient text/anim churn is ignored; for an animation-gated outcome, assert the specific consequence instead. Tool descriptions, cheatsheet, and CHANGELOG corrected to "network + DOM idle". 564 server tests green (+1 ambient-animation regression test). Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 5 +++-- docs/agent-cheatsheet.md | 3 ++- packages/server/src/events/predicate.test.ts | 22 +++++++++++++++++--- packages/server/src/events/predicate.ts | 13 +++++++----- packages/server/src/tools/tools.ts | 6 +++--- 5 files changed, 35 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ffc522..d2b8bc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,9 @@ All notable changes to **`@syrin/iris`** are documented here. The format follows ### Added - **Deterministic waiting — the `settled` predicate** (`packages/server`). A new predicate - `{ kind: "settled", quietMs }` passes once network/DOM/animation activity has been quiet for - `quietMs` (default 500ms). Usable in `iris_wait_for` and `iris_assert`, and composable inside + `{ kind: "settled", quietMs }` passes once network + structural-DOM activity has been quiet for + `quietMs` (default 500ms); ambient `dom.text`/animation churn (count-ups, spinners) is ignored so + an animated page can still settle. Usable in `iris_wait_for` and `iris_assert`, and composable inside `allOf` with the consequence you expect. Replaces fixed sleeps — the #1 cause of flaky agent tests. - **`iris_act_and_wait` auto-settle** (`packages/server`). 
Omit `until` and the tool waits for the page to settle instead of requiring a predicate — "act, then wait for quiet" is now a single zero-config diff --git a/docs/agent-cheatsheet.md b/docs/agent-cheatsheet.md index 29fb8b5..bd5db48 100644 --- a/docs/agent-cheatsheet.md +++ b/docs/agent-cheatsheet.md @@ -26,7 +26,8 @@ pointer sequence on the element (no coordinate gesture for the HUD to intercept) **Never sleep — wait deterministically.** Fixed sleeps are the #1 cause of flaky agent tests. Instead: - `iris_act_and_wait({ ref, action })` with **no `until`** waits for the page to _settle_ (network + - DOM + animation idle) before returning — the one-call replacement for "click then sleep 500ms". + structural DOM idle; ambient count-up/spinner churn is ignored so an animated page still settles) + before returning — the one-call replacement for "click then sleep 500ms". - Need to wait without acting? `iris_wait_for({ predicate: { kind: "settled", quietMs } })`. - Waiting for a specific outcome? Pass that consequence as the predicate (`{ signal }` / `{ net }`), or `allOf` it with `{ kind: "settled" }` to wait for both the event _and_ the page going quiet. diff --git a/packages/server/src/events/predicate.test.ts b/packages/server/src/events/predicate.test.ts index 67438a6..c7fd278 100644 --- a/packages/server/src/events/predicate.test.ts +++ b/packages/server/src/events/predicate.test.ts @@ -246,14 +246,30 @@ describe('settled predicate (deterministic waiting)', () => { expect((r.evidence as { quietForMs: number }).quietForMs).toBe(100); }); - it('passes once the quiet gap reaches quietMs (DOM mutation long enough ago)', async () => { - // Last DOM text mutation at t=500, now=1000 → 500ms quiet ≥ 200ms required. 
- const session = new FakeSession([ev(EventType.DOM_TEXT, { text: 'hi' }, 500)], undefined, 1000); + it('passes once the quiet gap reaches quietMs (structural DOM mutation long enough ago)', async () => { + // Last DOM node added at t=500, now=1000 → 500ms quiet ≥ 200ms required. + const session = new FakeSession([ev(EventType.DOM_ADDED, {}, 500)], undefined, 1000); const r = await evaluatePredicate(session, { kind: 'settled', quietMs: 200 }, 0); expect(r.pass).toBe(true); expect((r.evidence as { quietForMs: number }).quietForMs).toBe(500); }); + it('ignores ambient dom.text / animation frames so an animated page can still settle', async () => { + // A count-up counter + spinner emit a text/anim event EVERY frame — here at t=995/998, only + // 2-5ms ago. If these counted as activity the page would never go quiet; they must not. + const session = new FakeSession( + [ + ev(EventType.DOM_TEXT, { text: '42' }, 995), + ev(EventType.ANIM_START, { name: 'spin' }, 996), + ev(EventType.ANIM_END, { name: 'pulse' }, 998), + ], + undefined, + 1000, + ); + const r = await evaluatePredicate(session, { kind: 'settled', quietMs: 200 }, 0); + expect(r.pass).toBe(true); // settled despite very recent text/anim churn + }); + it('respects the since floor: activity before the floor does not count', async () => { // A burst at t=100, then quiet. Asserting from floor=900 ignores the old burst → settled. const session = new FakeSession( diff --git a/packages/server/src/events/predicate.ts b/packages/server/src/events/predicate.ts index 11c08d0..4f96ee5 100644 --- a/packages/server/src/events/predicate.ts +++ b/packages/server/src/events/predicate.ts @@ -318,17 +318,20 @@ function evalSignal(events: IrisEvent[], p: Extract = new Set([ EventType.NET_REQUEST, EventType.DOM_ADDED, EventType.DOM_REMOVED, EventType.DOM_ATTR, - EventType.DOM_TEXT, - EventType.ANIM_START, - EventType.ANIM_END, ]); /** Default quiet window — enough to absorb a render+xhr settle without waiting on slow polls. 
*/ diff --git a/packages/server/src/tools/tools.ts b/packages/server/src/tools/tools.ts index 6a4f0b1..a34bc99 100644 --- a/packages/server/src/tools/tools.ts +++ b/packages/server/src/tools/tools.ts @@ -603,7 +603,7 @@ export const TOOLS: ToolDef[] = [ name: IrisTool.ACT_AND_WAIT, description: 'Act on a ref, then wait for a predicate to hold — one hop for the act->observe->assert loop. ' + - 'Omit `until` to wait for the page to settle (network/DOM/animation idle) — use this instead of a fixed sleep. ' + + 'Omit `until` to wait for the page to settle (network + DOM idle) — use this instead of a fixed sleep. ' + 'Returns { effect } (the action result), { verdict } (predicate pass/evidence/near-miss), ' + 'and { trace } (the reaction report of everything the app did after the action). ' + 'timeout_ms 0 evaluates the predicate once without waiting.', @@ -621,7 +621,7 @@ export const TOOLS: ToolDef[] = [ 'Action-specific arguments: { value } for fill/select, { text } for type/press, { confirmDangerous: true } for a potentially destructive control.', ), until: PredicateSchema.optional().describe( - 'Predicate to wait for after the action completes (same shape as iris_assert). OMIT to wait for the page to SETTLE — network/DOM/animation idle — the deterministic default instead of a sleep. To assert a consequence AND settle, allOf them: { kind: "allOf", predicates: [, { kind: "settled" }] }.', + 'Predicate to wait for after the action completes (same shape as iris_assert). OMIT to wait for the page to SETTLE — network + DOM idle — the deterministic default instead of a sleep. To assert a consequence AND settle, allOf them: { kind: "allOf", predicates: [, { kind: "settled" }] }.', ), timeout_ms: z .number() @@ -772,7 +772,7 @@ export const TOOLS: ToolDef[] = [ 'Block until a predicate is satisfied (or already true in the recent buffer), else time out. Returns matching evidence or a near-miss diagnosis. 
By default it only counts events since your last act, so a signal buffered BEFORE the action can never fake a pass; pass `since` (an observe/act cursor) to widen or narrow that window explicitly.', inputSchema: { predicate: PredicateSchema.describe( - 'Predicate to wait for: { signal }, { net }, { element }, { kind: "settled", quietMs } (deterministic network/DOM/animation idle — prefer this over a fixed sleep), or a combination via allOf/anyOf.', + 'Predicate to wait for: { signal }, { net }, { element }, { kind: "settled", quietMs } (deterministic network + DOM idle — prefer this over a fixed sleep), or a combination via allOf/anyOf.', ), timeout_ms: z.number().optional().describe('Maximum wait in milliseconds. Default: 4000.'), since: z From 0b43710259931ccfd94c969ea87952f6e3264c33 Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 16:24:48 +0530 Subject: [PATCH 30/33] fix(server): floor the flow success oracle at each replay (no stale-signal false pass) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review of the M5 heal verification found a real honesty gap: assertSuccess evaluated the success predicate against the WHOLE session buffer (no since floor). In flow_heal this is acute — one heal call does two replays (the pre-heal drift replay's prefix, then the verify replay), so a success signal emitted by the first could satisfy the verify even when the rebound step never fired it: a false-positive heal, the exact thing the feature exists to prevent. flow_replay had the same staleness across runs in one session. assertSuccess now takes a `since` floor (WaitForSignal gains the param, already honored by waitForPredicate); flow_replay and the heal verify each capture the cursor before their replay so only that replay's events count — the same since-floor honesty principle used by iris_assert/wait_for. 565 server tests green (+1 floor regression test); flow-replay test fake gains elapsed(). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/server/src/flows/flow-replay.ts | 7 ++++++- .../server/src/flows/flow-success.test.ts | 21 +++++++++++++++++++ packages/server/src/flows/flow-success.ts | 5 ++++- packages/server/src/flows/flow-tools.ts | 8 +++++++ .../src/flows/tools.flow-replay.test.ts | 8 ++++++- 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/packages/server/src/flows/flow-replay.ts b/packages/server/src/flows/flow-replay.ts index d27cf42..377a8c8 100644 --- a/packages/server/src/flows/flow-replay.ts +++ b/packages/server/src/flows/flow-replay.ts @@ -29,11 +29,16 @@ export interface FlowReplaySession { elapsed(): number; } -/** The injected predicate-waiter (the real waitForPredicate) — reused, never reimplemented. */ +/** + * The injected predicate-waiter (the real waitForPredicate) — reused, never reimplemented. + * `since` is the event-time floor (default 0 = whole buffer): pass the cursor captured before a + * replay so the success oracle can't be satisfied by a stale signal from a prior replay/run. + */ export type WaitForSignal = ( session: FlowReplaySession, predicate: Predicate, timeoutMs: number, + since?: number, ) => Promise; /** diff --git a/packages/server/src/flows/flow-success.test.ts b/packages/server/src/flows/flow-success.test.ts index 8680f20..6e1fa54 100644 --- a/packages/server/src/flows/flow-success.test.ts +++ b/packages/server/src/flows/flow-success.test.ts @@ -87,6 +87,27 @@ describe('assertSuccess — green only when the consequence holds', () => { ); expect(r.pass).toBe(true); }); + + it('honors the since floor: a success signal from a PRIOR replay does not fake a pass', async () => { + // A success signal fired at t=10 (a previous replay / the pre-heal drift replay's prefix). 
+ const filtering: FlowReplaySession = { + command: () => Promise.resolve({ kind: 'command_result', id: 'q', ok: true, result: {} }), + eventsSince: (cursor: number) => + [{ t: 10, type: EventType.SIGNAL, sessionId: 's', data: { name: 'done' } }].filter( + (e) => e.t >= cursor, + ), + onEvent: () => () => undefined, + elapsed: () => 1000, + }; + // floor 0 (whole buffer) → the stale signal matches (legacy behavior). + expect( + (await assertSuccess(filtering, { signal: 'done' }, NONE, waitForPredicate, 0, 0)).pass, + ).toBe(true); + // floor 20 (this replay started after the stale signal) → excluded, so it FAILS. + expect( + (await assertSuccess(filtering, { signal: 'done' }, NONE, waitForPredicate, 0, 20)).pass, + ).toBe(false); + }); }); describe('dynamicTestids', () => { diff --git a/packages/server/src/flows/flow-success.ts b/packages/server/src/flows/flow-success.ts index f3d87ce..10c599f 100644 --- a/packages/server/src/flows/flow-success.ts +++ b/packages/server/src/flows/flow-success.ts @@ -87,9 +87,12 @@ export async function assertSuccess( dynamic: ReadonlySet, waitForSignal: WaitForSignal, timeoutMs: number, + since = 0, ): Promise { if (success === undefined) return { pass: true }; const predicate = successToPredicate(success, dynamic); if (predicate === undefined) return { pass: true }; - return waitForSignal(session, predicate, timeoutMs); + // `since` floors the window at the start of THIS replay so a success signal left in the buffer by + // a prior replay/run (or, in heal, by the pre-heal drift replay) cannot fake a pass. 
+ return waitForSignal(session, predicate, timeoutMs, since); } diff --git a/packages/server/src/flows/flow-tools.ts b/packages/server/src/flows/flow-tools.ts index 338a1b0..b966233 100644 --- a/packages/server/src/flows/flow-tools.ts +++ b/packages/server/src/flows/flow-tools.ts @@ -230,6 +230,9 @@ export const FLOW_TOOLS: ToolDef[] = [ }; } const session = deps.sessions.resolve(asString(args['sessionId'])); + // Floor the success oracle at the start of THIS replay so a stale signal from a prior run + // in the same session can't fake a pass. + const replayFloor = session.elapsed(); const steps = await replayFlow( session, loaded.value, @@ -248,6 +251,7 @@ export const FLOW_TOOLS: ToolDef[] = [ dynamicTestids(loaded.value), waitForPredicate, FLOW_SIGNAL_TIMEOUT_MS, + replayFloor, ); const row: FlowStepResult = { step: steps.length, @@ -466,6 +470,9 @@ async function healFlow(deps: ToolDeps, args: Record): Promise< // still heal them but say so loudly so the gap is visible. const { flow: healed } = applyHealChanges(loaded.value, proposals.map(toChange)); if (healed.success !== undefined) { + // Floor the success oracle at the start of the VERIFY replay so the success signal emitted by the + // earlier drift replay's prefix (this same heal call) cannot fake the verification. 
+ const verifyFloor = session.elapsed(); const verifySteps = await replayFlow( session, healed, @@ -482,6 +489,7 @@ async function healFlow(deps: ToolDeps, args: Record): Promise< dynamicTestids(healed), waitForPredicate, FLOW_SIGNAL_TIMEOUT_MS, + verifyFloor, ) : { pass: false, failureReason: 'healed flow did not replay cleanly' }; if (!verdict.pass) { diff --git a/packages/server/src/flows/tools.flow-replay.test.ts b/packages/server/src/flows/tools.flow-replay.test.ts index 2079b52..d0a5030 100644 --- a/packages/server/src/flows/tools.flow-replay.test.ts +++ b/packages/server/src/flows/tools.flow-replay.test.ts @@ -61,7 +61,13 @@ function scriptedSession( } return Promise.resolve({ kind: 'command_result', id: 'x', ok: true, result: {} }); }; - return { id: 'demo', command, eventsSince: () => [], onEvent: () => () => undefined }; + return { + id: 'demo', + command, + eventsSince: () => [], + onEvent: () => () => undefined, + elapsed: () => 0, + }; } function fakeDeps( From 84c296c8996dee7b6c9f604b85854f76e565c3ea Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 16:52:55 +0530 Subject: [PATCH 31/33] test(e2e): live verification of the new features against the real demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Positively exercises tonight's additions end-to-end in a real browser (the existing battery only proves no-regression): iris_query count_only/limit, the settled predicate + act_and_wait auto-settle, and the iris_assert presence-only advice. The settled check is run against the demo's count-up animations on purpose — the exact ambient dom.text churn that the iter-29 fix must tolerate; it now settles (quietForMs ~307) instead of timing out. 7/7 live. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/e2e/specs/new-features-test.mjs | 91 ++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 apps/e2e/specs/new-features-test.mjs diff --git a/apps/e2e/specs/new-features-test.mjs b/apps/e2e/specs/new-features-test.mjs new file mode 100644 index 0000000..232d1ca --- /dev/null +++ b/apps/e2e/specs/new-features-test.mjs @@ -0,0 +1,91 @@ +// Live verification of the features added in the [Unreleased] CHANGELOG section, against the real +// showcase dashboard (apps/demo :4310 + apps/api :8787). The existing battery proves no regression; +// this spec positively exercises the NEW surfaces end-to-end in a real browser: +// - settled predicate + iris_act_and_wait auto-settle (incl. the ambient-animation fix: the demo's +// count-up counters emit dom.text every frame, which must NOT prevent settling) +// - iris_query limit / count_only token controls +// - iris_assert presence-only `advice` nudge +import { chromium } from 'playwright'; +import { + start, + TOOLS, + BaselineStore, + RecordingStore, + FlowStore, + ProjectStore, + AnnotationStore, + createNodeFileSystem, +} from '@syrin/iris-server'; +import os from 'node:os'; +import path from 'node:path'; +const sleep = (ms) => new Promise((r) => setTimeout(r, ms)); +let pass = 0, + fail = 0; +const chk = (l, o, d = '') => { + console.log(` ${o ? '✅' : '❌'} ${l}${d ? ' — ' + d : ''}`); + o ? 
pass++ : fail++; +}; + +const irisRoot = path.join(os.tmpdir(), `iris-nf-${process.pid}`, '.iris'); +const fsp = createNodeFileSystem(); +const now = () => Date.now(); +const server = await start({ port: 4400, mcp: false }); +const deps = { + sessions: server.bridge.sessions, + baselines: new BaselineStore(), + recordings: new RecordingStore(), + flows: new FlowStore(fsp, irisRoot, { now }), + project: new ProjectStore(fsp, irisRoot, { now }), + annotations: new AnnotationStore(), + fs: fsp, + irisRoot, + now, +}; +const T = (n, a = {}) => TOOLS.find((t) => t.name === n).handler(deps, { sessionId: 'demo', ...a }); +const refOf = async (by, value) => { + for (let i = 0; i < 40; i++) { + const r = (await T('iris_query', { by, value })).elements?.[0]?.ref; + if (r) return r; + await sleep(100); + } + return null; +}; + +const b = await chromium.launch({ headless: true }); +const p = await b.newPage(); +await p.goto('http://localhost:4310/?session=demo', { waitUntil: 'networkidle' }); +for (let i = 0; i < 200 && server.bridge.sessions.count() === 0; i++) await sleep(50); + +console.log('\n=== Iris × new features (:4310) ==='); +chk('dashboard SDK connected', server.bridge.sessions.count() > 0); + +// count_only — just the match count, no descriptors. +const co = await T('iris_query', { by: 'role', value: 'button', count_only: true }); +chk('iris_query count_only returns a count, drops elements', typeof co.count === 'number' && co.count >= 1 && co.elements === undefined, `count=${co.count}`); + +// limit — cap descriptors; when more matched, total + truncated flag it. +const lim = await T('iris_query', { by: 'role', value: 'button', limit: 1 }); +const moreThanOne = (co.count ?? 0) > 1; +chk('iris_query limit caps descriptors (truncated when more)', (lim.elements?.length ?? 0) <= 1 && (!moreThanOne || (lim.truncated === true && lim.total === co.count)), `returned=${lim.elements?.length}, total=${lim.total ?? 
'n/a'}`); + +// Auth (pre-filled) → dashboard with its count-up animations. +await T('iris_act_and_wait', { ref: await refOf('testid', 'login-submit'), action: 'click', until: { kind: 'signal', name: 'auth:granted' }, timeout_ms: 5000 }); +chk('login → dashboard', (await refOf('testid', 'nav-deployments')) !== null); + +// settled wait — the dashboard's count-up counters emit dom.text every frame; settle must STILL +// resolve (the ambient-animation fix). Pre-fix this would time out at 4s with pass:false. +const settled = await T('iris_wait_for', { predicate: { kind: 'settled', quietMs: 300 }, timeout_ms: 4000 }); +chk('settled resolves despite count-up animation churn', settled.pass === true, JSON.stringify(settled.evidence ?? {})); + +// act_and_wait with NO `until` → auto-settle after a nav click; verdict carries settled evidence. +const aw = await T('iris_act_and_wait', { ref: await refOf('testid', 'nav-deployments'), action: 'click' }); +chk('act_and_wait (no until) auto-settles', aw.verdict?.pass === true && aw.verdict?.evidence?.settled === true, JSON.stringify(aw.verdict?.evidence ?? {})); + +// presence-only advice — a PASSING element assertion is nudged toward a consequence. +const adv = await T('iris_assert', { predicate: { kind: 'element', query: { testid: 'deploy-list' } } }); +chk('iris_assert presence-only attaches advice', adv.pass === true && typeof adv.advice === 'string' && adv.advice.includes('consequence'), adv.advice ? 'advice present' : 'no advice'); + +console.log(`\n${fail === 0 ? '✅ NEW FEATURES VERIFIED' : '❌ FAILED'} (${pass} passed, ${fail} failed)`); +await b.close(); +await server.close(); +process.exit(fail === 0 ? 
0 : 1); From 3b0e8e08ba9bdfda1acbd71553fd548a38ad775c Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 16:59:48 +0530 Subject: [PATCH 32/33] =?UTF-8?q?chore(release):=200.6.10=20=E2=80=94=20de?= =?UTF-8?q?terministic=20waiting,=20safe=20healing,=20token=20controls,=20?= =?UTF-8?q?domain=20mustHold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps all 10 publishable @syrin/iris-* packages 0.5.0 -> 0.6.10 and promotes the CHANGELOG [Unreleased] section to [0.6.10]. This is the live-verified feature release (NOT 1.0.0 — see plan/V1-ROADMAP.md for the 1.0 gate): full e2e battery 14/14 + a new-features spec 7/7 green on a real browser across the Vite/React demo, Next.js 15, and the api. Cleanup: removed a throwaway headless-launcher script (untracked) and its leaked browser process. No dead code to remove otherwise — noUnusedLocals + no-unused-vars are enforced and green; knip's "unused files" are all intentional dynamically-spawned e2e specs and standalone asset scripts. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 2 +- packages/babel-plugin/package.json | 2 +- packages/browser/package.json | 2 +- packages/eslint-plugin/package.json | 2 +- packages/iris/package.json | 2 +- packages/next/package.json | 2 +- packages/protocol/package.json | 2 +- packages/react/package.json | 2 +- packages/server/package.json | 2 +- packages/test/package.json | 2 +- packages/vite-plugin/package.json | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2b8bc1..fa02d50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to **`@syrin/iris`** are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [Unreleased] +## [0.6.10] — 2026-06-18 ### Added diff --git a/packages/babel-plugin/package.json b/packages/babel-plugin/package.json index 2c40e83..98b62d2 100644 --- a/packages/babel-plugin/package.json +++ b/packages/babel-plugin/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-babel-plugin", - "version": "0.5.0", + "version": "0.6.10", "private": true, "description": "Babel plugin that stamps data-iris-source=\"file:line:col\" on JSX host elements so @syrin/iris-react can map a DOM node to its source (works on React 19, which dropped _debugSource).", "type": "module", diff --git a/packages/browser/package.json b/packages/browser/package.json index cf768c4..d7a5727 100644 --- a/packages/browser/package.json +++ b/packages/browser/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-browser", - "version": "0.5.0", + "version": "0.6.10", "description": "Iris browser SDK — installs observers, builds semantic snapshots, executes actions, talks to the bridge.", "type": "module", "main": "./dist/index.js", diff --git a/packages/eslint-plugin/package.json b/packages/eslint-plugin/package.json index 5968bfb..27340ea 100644 --- a/packages/eslint-plugin/package.json +++ b/packages/eslint-plugin/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-eslint-plugin", - "version": "0.5.0", + "version": "0.6.10", "description": "Iris ESLint plugin — keeps the signal layer self-enforcing (mutation-without-signal).", "type": "module", "main": "./dist/index.js", diff --git a/packages/iris/package.json b/packages/iris/package.json index 2336abb..1cb1fd1 100644 --- a/packages/iris/package.json +++ b/packages/iris/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris", - "version": "0.5.0", + "version": "0.6.10", "description": "One-install Iris: the dev-only SDK (browser + React adapter) and the spec runner under one package, with subpaths for the source-mapping plugins and the MCP server.", "type": "module", "license": "MIT", diff --git a/packages/next/package.json 
b/packages/next/package.json index 095f602..c74e453 100644 --- a/packages/next/package.json +++ b/packages/next/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-next", - "version": "0.5.0", + "version": "0.6.10", "private": true, "description": "Next.js helper for Iris: keeps SWC, adds source-file mapping via a dev-only webpack pre-loader (data-iris-source).", "license": "MIT", diff --git a/packages/protocol/package.json b/packages/protocol/package.json index 4b2b19a..4fbf9fc 100644 --- a/packages/protocol/package.json +++ b/packages/protocol/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-protocol", - "version": "0.5.0", + "version": "0.6.10", "description": "Shared wire contract, constants, and zod schemas for the Iris browser SDK <-> bridge channel.", "type": "module", "main": "./dist/index.js", diff --git a/packages/react/package.json b/packages/react/package.json index bd7c9dc..9474a29 100644 --- a/packages/react/package.json +++ b/packages/react/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-react", - "version": "0.5.0", + "version": "0.6.10", "description": "Iris React adapter — maps a DOM ref to its component stack and source file via the fiber tree.", "type": "module", "main": "./dist/index.js", diff --git a/packages/server/package.json b/packages/server/package.json index 787f2fb..954c9f7 100644 --- a/packages/server/package.json +++ b/packages/server/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-server", - "version": "0.5.0", + "version": "0.6.10", "description": "Iris bridge + MCP server. 
Hosts the browser WS endpoint and exposes MCP tools to the coding agent.", "type": "module", "main": "./dist/index.js", diff --git a/packages/test/package.json b/packages/test/package.json index f7fc650..ee988a9 100644 --- a/packages/test/package.json +++ b/packages/test/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-test", - "version": "0.5.0", + "version": "0.6.10", "description": "Iris test runner: irisTest spec registry + a runner that drives Iris tools without MCP/stdio.", "type": "module", "main": "./dist/index.js", diff --git a/packages/vite-plugin/package.json b/packages/vite-plugin/package.json index 70305e4..60ae3b7 100644 --- a/packages/vite-plugin/package.json +++ b/packages/vite-plugin/package.json @@ -1,6 +1,6 @@ { "name": "@syrin/iris-vite-plugin", - "version": "0.5.0", + "version": "0.6.10", "private": true, "description": "Vite plugin for Iris: dev-only source-map stamping plus auto-injected iris.connect(). apply:'serve' guarantees it never ships to production.", "type": "module", From 777a60b8fea4f154ec5705390b1b6604adcefdcb Mon Sep 17 00:00:00 2001 From: Divyanshu Shekhar Date: Thu, 18 Jun 2026 17:12:39 +0530 Subject: [PATCH 33/33] fix(security): bump vite-plugin's dev vite to ^7 (clears GHSA-fx2h-pf6j-xcff + 2 moderates) and add build+audit to pre-commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit audit: vite-plugin's devDependency vite "^5" resolved to 5.4.21, flagged by 1 high (server.fs.deny bypass) + 2 moderate (optimized-deps path traversal, launch-editor NTLM disclosure) advisories (all vite <=6.4.2). Bumped to "^7" so it dedupes to the already-in-tree patched vite 7.3.5; peer stays >=4. `pnpm audit --audit-level high` now reports no known vulnerabilities; vite-plugin builds + 12 tests pass under vite 7. pre-commit: the hook was missing two CI steps. 
It now mirrors .github/workflows/ci.yml one-for-one — added `pnpm build` (step 2) and `pnpm audit --audit-level high` (step 7), so a green commit is a green CI run and a high+ vuln is caught before push, not in CI. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/vite-plugin/package.json | 2 +- pnpm-lock.yaml | 50 ++++--------------------------- pre-commit.sh | 23 ++++++++++---- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/packages/vite-plugin/package.json b/packages/vite-plugin/package.json index 60ae3b7..9b3da5e 100644 --- a/packages/vite-plugin/package.json +++ b/packages/vite-plugin/package.json @@ -46,7 +46,7 @@ }, "devDependencies": { "@types/babel__core": "^7.20.5", - "vite": "^5" + "vite": "^7" }, "peerDependencies": { "vite": ">=4" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 91ca74a..bfb2268 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -372,8 +372,8 @@ importers: specifier: ^7.20.5 version: 7.20.5 vite: - specifier: ^5 - version: 5.4.21(@types/node@22.19.20)(lightningcss@1.32.0) + specifier: ^7 + version: 7.3.5(@types/node@22.19.20)(lightningcss@1.32.0) packages: @@ -3102,37 +3102,6 @@ packages: engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} hasBin: true - vite@5.4.21: - resolution: {integrity: sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==} - engines: {node: ^18.0.0 || >=20.0.0} - hasBin: true - peerDependencies: - '@types/node': ^18.0.0 || >=20.0.0 - less: '*' - lightningcss: ^1.21.0 - sass: '*' - sass-embedded: '*' - stylus: '*' - sugarss: '*' - terser: ^5.4.0 - peerDependenciesMeta: - '@types/node': - optional: true - less: - optional: true - lightningcss: - optional: true - sass: - optional: true - sass-embedded: - optional: true - stylus: - optional: true - sugarss: - optional: true - terser: - optional: true - vite@7.3.5: resolution: {integrity: sha512-KuOaNhcnGFN2zIPGA7wRmzF+lJA1sea7rHq17aiJ++9lzY1WWG6Jpwqwe1KNbRVPIqHmr8GLYx7jbrQcN/7/ww==} engines: {node: 
^20.19.0 || >=22.12.0} @@ -6010,9 +5979,10 @@ snapshots: debug: 4.4.3 es-module-lexer: 1.7.0 pathe: 2.0.3 - vite: 5.4.21(@types/node@22.19.20)(lightningcss@1.32.0) + vite: 7.3.5(@types/node@22.19.20)(lightningcss@1.32.0) transitivePeerDependencies: - '@types/node' + - jiti - less - lightningcss - sass @@ -6021,16 +5991,8 @@ snapshots: - sugarss - supports-color - terser - - vite@5.4.21(@types/node@22.19.20)(lightningcss@1.32.0): - dependencies: - esbuild: 0.28.1 - postcss: 8.5.15 - rollup: 4.61.1 - optionalDependencies: - '@types/node': 22.19.20 - fsevents: 2.3.3 - lightningcss: 1.32.0 + - tsx + - yaml vite@7.3.5(@types/node@22.19.20)(lightningcss@1.32.0): dependencies: diff --git a/pre-commit.sh b/pre-commit.sh index e7f7b7b..4ae4662 100755 --- a/pre-commit.sh +++ b/pre-commit.sh @@ -80,23 +80,36 @@ done < <(ts_staged) [ "$fail" -eq 0 ] && note "${GREEN}✓ safety${NC}" -# ----- 2. FORMAT ----------------------------------------------------------- +# Steps 2-7 mirror CI (.github/workflows/ci.yml) one-for-one so a green commit is a green CI run: +# build -> format -> lint -> types -> tests -> audit. Keep this list in sync with ci.yml. + +# ----- 2. BUILD ------------------------------------------------------------ +step "Build (turbo)" +if ! pnpm -s build; then note "${RED}✗ build${NC}"; fail=1; else note "${GREEN}✓ build${NC}"; fi + +# ----- 3. FORMAT ----------------------------------------------------------- step "Prettier (format check)" if ! pnpm -s format:check; then note "${RED}✗ prettier${NC}"; fail=1; else note "${GREEN}✓ format${NC}"; fi -# ----- 3. LINT ------------------------------------------------------------- +# ----- 4. LINT ------------------------------------------------------------- step "ESLint" if ! pnpm -s lint; then note "${RED}✗ eslint${NC}"; fail=1; else note "${GREEN}✓ lint${NC}"; fi -# ----- 4. TYPES ------------------------------------------------------------ +# ----- 5. 
TYPES ------------------------------------------------------------ step "TypeScript (tsc --build)" if ! pnpm -s typecheck; then note "${RED}✗ types${NC}"; fail=1; else note "${GREEN}✓ types${NC}"; fi -# ----- 5. TESTS ------------------------------------------------------------ +# ----- 6. TESTS ------------------------------------------------------------ step "Unit tests (vitest)" if ! pnpm -s test:unit; then note "${RED}✗ tests${NC}"; fail=1; else note "${GREEN}✓ tests${NC}"; fi -# ----- 6. SUMMARY ---------------------------------------------------------- +# ----- 7. AUDIT ------------------------------------------------------------ +# Same gate as CI. Needs network (queries the advisory DB); an offline commit will fail here — that +# is the intended CI parity, so a high+ vuln is caught before push, not after. +step "Security audit (--audit-level high)" +if ! pnpm audit --audit-level high; then note "${RED}✗ audit (high+ vulnerability)${NC}"; fail=1; else note "${GREEN}✓ audit${NC}"; fi + +# ----- 8. SUMMARY ---------------------------------------------------------- if [ "$fail" -ne 0 ]; then note "\n${RED}✗ pre-commit FAILED — commit blocked${NC}" exit 1