diff --git a/skills/browser-trace/package.json b/skills/browser-trace/package.json index 566b63c..1327023 100644 --- a/skills/browser-trace/package.json +++ b/skills/browser-trace/package.json @@ -2,5 +2,8 @@ "name": "browser-trace", "version": "0.1.0", "private": true, - "type": "module" + "type": "module", + "scripts": { + "test": "node --test scripts/*.test.mjs" + } } diff --git a/skills/browser-trace/scripts/snapshot-loop.mjs b/skills/browser-trace/scripts/snapshot-loop.mjs index 00b297b..e4bd4d2 100755 --- a/skills/browser-trace/scripts/snapshot-loop.mjs +++ b/skills/browser-trace/scripts/snapshot-loop.mjs @@ -4,62 +4,160 @@ // // Each tick opens a one-shot CDP connection via `browse --ws ...` // (bypasses the `browse` daemon so it doesn't fight the main automation). +// +// Lifecycle: stop-capture sends SIGTERM and then waits up to ~3 seconds +// before falling back to SIGKILL. The loop must therefore wake from its +// inter-iteration sleep promptly when SIGTERM arrives, otherwise long +// `interval-seconds` settings cause SIGKILL to fire mid-iteration and the +// run loses its last DOM/screenshot pair. import fs from 'node:fs'; import path from 'node:path'; +import { fileURLToPath } from 'node:url'; import { spawnSync } from 'node:child_process'; -import { isoStampForFilename, sleepMs } from './lib.mjs'; +import { isoStampForFilename } from './lib.mjs'; -const [target, RD, intervalArg] = process.argv.slice(2); -if (!target || !RD) { - console.error('usage: snapshot-loop.mjs [interval-seconds]'); - process.exit(2); -} +// Per-call timeout for `browse --ws ...` invocations. A hung browse CLI +// would otherwise block this loop indefinitely until the parent SIGKILL +// arrives, leaving the run truncated. Tunable via env so tests / heavy +// pages can extend it. +const SNAPSHOT_TIMEOUT_MS = Number(process.env.O11Y_SNAPSHOT_TIMEOUT_MS) || 30_000; -const intervalMs = (Number(intervalArg) || 2) * 1000; -const indexPath = path.join(RD, 'index.jsonl'); +// Run as a CLI only when invoked directly. Letting the module be imported +// (e.g. from snapshot-loop.test.mjs) keeps the stop-signal helper unit- +// testable without booting the full sampler loop. +const isEntry = (() => { + if (!process.argv[1]) return false; + try { + return fileURLToPath(import.meta.url) === path.resolve(process.argv[1]); + } catch { + return false; + } +})(); +if (isEntry) await runSampler(); -let stopping = false; -process.on('SIGTERM', () => { stopping = true; }); -process.on('SIGINT', () => { stopping = true; }); +async function runSampler() { + const [target, RD, intervalArg] = process.argv.slice(2); + if (!target || !RD) { + console.error('usage: snapshot-loop.mjs [interval-seconds]'); + process.exit(2); + } -while (!stopping) { - const ts = isoStampForFilename(); - const png = path.join(RD, 'screenshots', `${ts}.png`); - const html = path.join(RD, 'dom', `${ts}.html`); - const tmp = `${html}.partial`; + const intervalMs = (Number(intervalArg) || 2) * 1000; + const indexPath = path.join(RD, 'index.jsonl'); - // Best-effort screenshot. If browse fails we just don't get one this tick. - spawnSync('browse', ['--ws', target, 'screenshot', png], { stdio: 'ignore' }); - if (fs.existsSync(png) && fs.statSync(png).size === 0) { - fs.unlinkSync(png); - } + // Single shared stop signal. SIGTERM/SIGINT both flip `stopping` and + // resolve the promise so any in-flight wait can short-circuit. + const stop = createStopSignal(); - // DOM dump via temp file → rename, so we never leave a 0-byte HTML behind. - try { - const r = spawnSync('browse', ['--ws', target, 'get', 'html', 'body'], { encoding: 'utf8' }); - if (r.stdout && r.stdout.length) { - fs.writeFileSync(tmp, r.stdout); - fs.renameSync(tmp, html); + // Synchronous calls inside the iteration body cannot be interrupted by + // SIGTERM mid-tick: Node only drains the signal callback queue between + // event-loop turns, and the only yield point we hit is the awaited + // sleep at the bottom. So the iteration body always runs to completion; + // SIGTERM responsiveness comes from the per-spawnSync timeout (each + // browse call returns within SNAPSHOT_TIMEOUT_MS) plus the abortable + // sleep that resolves immediately when the signal arrives. + while (!stop.stopping) { + const ts = isoStampForFilename(); + const png = path.join(RD, 'screenshots', `${ts}.png`); + const html = path.join(RD, 'dom', `${ts}.html`); + const tmp = `${html}.partial`; + + // Best-effort screenshot. If browse fails we just don't get one this tick. + spawnSync('browse', ['--ws', target, 'screenshot', png], { + stdio: 'ignore', + timeout: SNAPSHOT_TIMEOUT_MS, + }); + if (fs.existsSync(png) && fs.statSync(png).size === 0) { + fs.unlinkSync(png); + } + + // DOM dump via temp file → rename, so we never leave a 0-byte HTML behind. + try { + const r = spawnSync('browse', ['--ws', target, 'get', 'html', 'body'], { + encoding: 'utf8', + timeout: SNAPSHOT_TIMEOUT_MS, + }); + if (r.stdout && r.stdout.length) { + fs.writeFileSync(tmp, r.stdout); + fs.renameSync(tmp, html); + } + } catch { /* best-effort */ } + // Cleanup any leftover .partial from a previous interrupted iteration. + if (fs.existsSync(tmp)) { + try { fs.unlinkSync(tmp); } catch {} } - } catch { /* best-effort */ } - // Cleanup any leftover .partial from a previous interrupted iteration. - if (fs.existsSync(tmp)) { - try { fs.unlinkSync(tmp); } catch {} - } - // URL via the daemon-bypassing one-shot. Returns {"url": "..."}. - let urlValue = ''; - const u = spawnSync('browse', ['--ws', target, '--json', 'get', 'url'], { encoding: 'utf8' }); - if (u.stdout) { - try { urlValue = JSON.parse(u.stdout).url || ''; } catch {} + // URL via the daemon-bypassing one-shot. Returns {"url": "..."}. + let urlValue = ''; + const u = spawnSync('browse', ['--ws', target, '--json', 'get', 'url'], { + encoding: 'utf8', + timeout: SNAPSHOT_TIMEOUT_MS, + }); + if (u.stdout) { + try { urlValue = JSON.parse(u.stdout).url || ''; } catch {} + } + + const screenshotRel = fs.existsSync(png) ? `screenshots/${ts}.png` : ''; + const domRel = fs.existsSync(html) ? `dom/${ts}.html` : ''; + fs.appendFileSync(indexPath, + JSON.stringify({ ts, screenshot: screenshotRel, dom: domRel, url: urlValue }) + '\n'); + + await stop.sleep(intervalMs); } +} + +// --------------------------------------------------------------------------- - const screenshotRel = fs.existsSync(png) ? `screenshots/${ts}.png` : ''; - const domRel = fs.existsSync(html) ? `dom/${ts}.html` : ''; - fs.appendFileSync(indexPath, - JSON.stringify({ ts, screenshot: screenshotRel, dom: domRel, url: urlValue }) + '\n'); +// Build a stop signal that listens once for SIGTERM/SIGINT, exposes a +// boolean view, and provides an abortable sleep so the inter-iteration +// pause wakes immediately when the user calls stop-capture. +// +// Exported as a factory rather than a module-level mutable so a test can +// drive it deterministically without messing with the live process's +// signal handlers. +export function createStopSignal() { + let stopping = false; + // Pending sleeps register here so `trigger` can resolve them in one + // pass. Using an explicit set rather than `stopPromise.then(...)` per + // sleep keeps memory bounded: a 24-hour capture at the default 2 s + // interval is ~43,000 sleeps, and a chained .then handler closure per + // sleep would pin all of them on the long-lived stop promise's + // reaction list until SIGTERM finally fired. + const pending = new Set(); + + const trigger = () => { + if (stopping) return; + stopping = true; + for (const entry of pending) { + clearTimeout(entry.timer); + entry.resolve(); + } + pending.clear(); + }; + process.on('SIGTERM', trigger); + process.on('SIGINT', trigger); + + function sleep(ms) { + if (stopping) return Promise.resolve(); + return new Promise((resolve) => { + const entry = { resolve, timer: null }; + entry.timer = setTimeout(() => { + // Self-clean on natural wake-up so the closure becomes + // unreachable as soon as the timer fires. + pending.delete(entry); + resolve(); + }, ms); + pending.add(entry); + }); + } - await sleepMs(intervalMs); + return { + get stopping() { return stopping; }, + sleep, + // Test-only entry points. Production code relies on the signal handlers. + _trigger: trigger, + _pendingCount: () => pending.size, + }; } diff --git a/skills/browser-trace/scripts/snapshot-loop.test.mjs b/skills/browser-trace/scripts/snapshot-loop.test.mjs new file mode 100644 index 0000000..2e499a7 --- /dev/null +++ b/skills/browser-trace/scripts/snapshot-loop.test.mjs @@ -0,0 +1,88 @@ +// Unit tests for the stop-signal helper used by snapshot-loop.mjs. +// +// Run with: node --test scripts/snapshot-loop.test.mjs +// +// The factory exposes `_trigger` so each test can stand in for the live +// SIGTERM/SIGINT handlers without touching the surrounding process. + +import { test } from 'node:test'; +import assert from 'node:assert/strict'; + +import { createStopSignal } from './snapshot-loop.mjs'; + +test('stopping is false until trigger fires', () => { + const stop = createStopSignal(); + assert.equal(stop.stopping, false); + stop._trigger(); + assert.equal(stop.stopping, true); +}); + +test('sleep wakes immediately when trigger fires mid-wait', async () => { + const stop = createStopSignal(); + const startedAt = Date.now(); + const sleeping = stop.sleep(60_000); + // Fire the stop on the next tick so the sleep is actually pending. + setTimeout(() => stop._trigger(), 5); + await sleeping; + const waited = Date.now() - startedAt; + assert.ok( + waited < 1_000, + `sleep should have aborted in well under a second, but waited ${waited} ms`, + ); + assert.equal(stop.stopping, true); +}); + +test('sleep resolves immediately when trigger has already fired', async () => { + const stop = createStopSignal(); + stop._trigger(); + const startedAt = Date.now(); + await stop.sleep(60_000); + const waited = Date.now() - startedAt; + assert.ok( + waited < 100, + `sleep should have returned synchronously after stopping, but waited ${waited} ms`, + ); +}); + +test('sleep without a trigger waits for the requested interval', async () => { + const stop = createStopSignal(); + const startedAt = Date.now(); + await stop.sleep(80); + const waited = Date.now() - startedAt; + assert.ok( + waited >= 70, + `sleep should honor its interval when no trigger fires, but waited ${waited} ms`, + ); + assert.equal(stop.stopping, false); +}); + +test('repeated triggers are idempotent', () => { + const stop = createStopSignal(); + stop._trigger(); + stop._trigger(); + stop._trigger(); + assert.equal(stop.stopping, true); +}); + +test('completed sleeps deregister from the pending set so closures can be GC\'d', async () => { + const stop = createStopSignal(); + assert.equal(stop._pendingCount(), 0); + // Run many sleeps back-to-back so a per-sleep .then handler would + // accumulate visibly on the stop promise's reaction list. + for (let i = 0; i < 200; i++) { + const p = stop.sleep(1); + assert.equal(stop._pendingCount(), 1, `pending should be 1 mid-sleep on iteration ${i}`); + await p; + assert.equal(stop._pendingCount(), 0, `pending should drain to 0 after iteration ${i}`); + } +}); + +test('trigger drains every still-pending sleep without leaking', async () => { + const stop = createStopSignal(); + const sleeps = [stop.sleep(60_000), stop.sleep(60_000), stop.sleep(60_000)]; + assert.equal(stop._pendingCount(), 3); + stop._trigger(); + assert.equal(stop._pendingCount(), 0); + // All three resolve with no further input. + await Promise.all(sleeps); +}); diff --git a/skills/browser-trace/scripts/start-capture.mjs b/skills/browser-trace/scripts/start-capture.mjs index ab10185..33a533f 100755 --- a/skills/browser-trace/scripts/start-capture.mjs +++ b/skills/browser-trace/scripts/start-capture.mjs @@ -65,8 +65,11 @@ const loop = spawn(process.execPath, [loopScript, target, RD, String(interval)], loop.unref(); fs.writeFileSync(path.join(RD, '.loop.pid'), String(loop.pid)); -// Give browse cdp a beat to fail loudly on bad targets so the user sees the -// real error instead of a silent zero-event capture. +// Give both children a beat to fail loudly on bad targets so the user sees +// the real error instead of a silent zero-event capture. The snapshot loop +// is checked too: a syntax error or missing dep there would otherwise leave +// the run with CDP events but no DOM/screenshots, and the user would only +// notice after stop-capture. await sleepMs(1000); if (!isAlive(cdp.pid)) { console.error(`browse cdp exited immediately — check ${RD}/cdp/stderr.log`); @@ -74,6 +77,12 @@ if (!isAlive(cdp.pid)) { try { process.kill(loop.pid); } catch {} process.exit(1); } +if (!isAlive(loop.pid)) { + console.error(`snapshot-loop exited immediately — check ${RD}/snapshot-loop.log`); + try { console.error(fs.readFileSync(path.join(RD, 'snapshot-loop.log'), 'utf8')); } catch {} + try { process.kill(cdp.pid); } catch {} + process.exit(1); +} console.log(`run_id=${runId}`); console.log(`run_dir=${RD}`);