From c4dc0665862a203f308e3afb8475bb8a21529858 Mon Sep 17 00:00:00 2001 From: Michael Sober Date: Tue, 23 Jun 2026 10:20:03 +0000 Subject: [PATCH 1/6] fix(dev-server): auto-respawn frontend and kill process group on restart (fixes :3100 502 race) The dev server spawns the frontend (Vite) with `shell: true`, so the real Vite process is a grandchild (shell -> npx -> node vite). On a `tsx watch` restart, cleanup sent SIGTERM to only the shell parent, orphaning the Vite grandchild, which survived still bound to :3100. The new Vite then hit `--strictPort`, failed to bind, and exited; the exit handler only logged, so `/` returned a permanent 502 "Frontend server unavailable" with no recovery. Fix (packages/core/src/scripts/dev-server.ts): - Spawn the frontend `detached` on POSIX (own process group) and kill the whole group via process.kill(-pid, ...) on cleanup/restart so the Vite grandchild is reaped and :3100 is freed. Windows falls back to a direct child kill. - Bounded auto-respawn on unexpected exit (exponential backoff, max 5 / 10s), suppressed during shutdown via an isShuttingDown guard. - Idempotent, leak-free cleanup wired to SIGINT/SIGTERM/SIGHUP that waits (bounded, SIGTERM->SIGKILL) for the group to die before exit, plus a synchronous process 'exit' safety net. - Retain --strictPort: the proxy target is hardcoded to :3100, so the port is reliably freed rather than letting Vite drift to a port the proxy won't follow. Adds dev-server-supervisor.test.ts: unit tests for the respawn policy and process-group signal routing, plus a real-process integration test proving a direct child kill leaks the port while a group kill frees it. --- .changeset/dev-server-restart-race-502.md | 30 +++ .../src/scripts/dev-server-supervisor.test.ts | 203 ++++++++++++++ packages/core/src/scripts/dev-server.ts | 249 ++++++++++++++++-- 3 files changed, 456 insertions(+), 26 deletions(-) create mode 100644 .changeset/dev-server-restart-race-502.md create mode 100644 packages/core/src/scripts/dev-server-supervisor.test.ts diff --git a/.changeset/dev-server-restart-race-502.md b/.changeset/dev-server-restart-race-502.md new file mode 100644 index 00000000..5759d5c1 --- /dev/null +++ b/.changeset/dev-server-restart-race-502.md @@ -0,0 +1,30 @@ +--- +"@aws-blocks/core": patch +--- + +fix(dev-server): auto-respawn frontend and kill the whole process group on restart + +The dev server spawns the frontend (Vite) with `shell: true`, making the real +Vite process a **grandchild** (shell → npx → node vite). On a `tsx watch` +restart, cleanup sent `SIGTERM` to only the shell parent, orphaning the Vite +grandchild — it survived still bound to `:3100`. The freshly launched Vite then +hit `--strictPort`, failed to bind, and exited; the `exit` handler only logged, +so `/` served a permanent `502 Frontend server unavailable` with no recovery. + +Fixes: + +- **Process-group kill** — the frontend is spawned `detached` on POSIX (its own + process group) and cleanup/restart now signal the entire group via + `process.kill(-pid, …)`, reaping the Vite grandchild and freeing `:3100`. + Windows (no POSIX groups) falls back to a direct child kill. +- **Bounded auto-respawn** — an unexpected frontend exit now respawns Vite with + exponential backoff, capped at 5 restarts / 10s to avoid hot loops, and is + suppressed during intentional shutdown via an `isShuttingDown` guard. +- **Robust shutdown** — cleanup is idempotent, wired to `SIGINT`/`SIGTERM`/ + `SIGHUP`, removes its own listeners, waits (bounded) for the group to die + before exiting (SIGTERM→SIGKILL escalation), and keeps a synchronous + `process.on('exit')` safety net so a detached Vite is never left orphaned. + +`--strictPort` is intentionally retained: the proxy target is hardcoded to +`:3100`, so the port is reliably freed rather than letting Vite drift to another +port the proxy wouldn't follow. diff --git a/packages/core/src/scripts/dev-server-supervisor.test.ts b/packages/core/src/scripts/dev-server-supervisor.test.ts new file mode 100644 index 00000000..5a955c7c --- /dev/null +++ b/packages/core/src/scripts/dev-server-supervisor.test.ts @@ -0,0 +1,203 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +import { describe, it } from 'node:test'; +import assert from 'node:assert'; +import { spawn } from 'node:child_process'; +import { createConnection, createServer } from 'node:net'; +import { + evaluateFrontendRespawn, + DEFAULT_FRONTEND_RESPAWN_POLICY, + killFrontendTree, + type KillableProcess, +} from './dev-server.js'; + +const isWindows = process.platform === 'win32'; + +function getFreePort(): Promise { + return new Promise((resolve, reject) => { + const srv = createServer(); + srv.on('error', reject); + srv.listen(0, '127.0.0.1', () => { + const addr = srv.address(); + const port = typeof addr === 'object' && addr ? addr.port : 0; + srv.close(() => resolve(port)); + }); + }); +} + +function isPortOpen(port: number): Promise { + return new Promise((resolve) => { + const sock = createConnection({ port, host: '127.0.0.1' }, () => { + sock.destroy(); + resolve(true); + }); + sock.on('error', () => { sock.destroy(); resolve(false); }); + sock.setTimeout(300, () => { sock.destroy(); resolve(false); }); + }); +} + +async function waitFor(predicate: () => Promise, timeoutMs: number): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (await predicate()) return true; + await new Promise((r) => setTimeout(r, 50)); + } + return predicate(); +} + +const delay = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +// ── evaluateFrontendRespawn ──────────────────────────────────────────────── +describe('evaluateFrontendRespawn — bounded auto-respawn policy', () => { + it('allows the first restart with the base backoff', () => { + const now = 1_000_000; + const d = evaluateFrontendRespawn([], now); + assert.strictEqual(d.restart, true); + assert.strictEqual(d.delayMs, DEFAULT_FRONTEND_RESPAWN_POLICY.backoffMs); + assert.deepStrictEqual(d.recent, [now]); + }); + + it('backs off exponentially with the number of recent restarts', () => { + const now = 1_000_000; + // one recent restart already → 500 * 2^1 + assert.strictEqual(evaluateFrontendRespawn([now - 100], now).delayMs, 1000); + // three recent → 500 * 2^3 + assert.strictEqual(evaluateFrontendRespawn([now - 30, now - 20, now - 10], now).delayMs, 4000); + }); + + it('caps the backoff at maxBackoffMs', () => { + const now = 1_000_000; + // four recent → 500 * 2^4 = 8000, capped to 5000 + const d = evaluateFrontendRespawn([now - 4, now - 3, now - 2, now - 1], now); + assert.strictEqual(d.restart, true); + assert.strictEqual(d.delayMs, DEFAULT_FRONTEND_RESPAWN_POLICY.maxBackoffMs); + }); + + it('stops restarting once the budget within the window is exhausted', () => { + const now = 1_000_000; + const recent = [now - 5, now - 4, now - 3, now - 2, now - 1]; // 5 == maxRestarts + const d = evaluateFrontendRespawn(recent, now); + assert.strictEqual(d.restart, false); + assert.strictEqual(d.delayMs, 0); + assert.strictEqual(d.recent.length, DEFAULT_FRONTEND_RESPAWN_POLICY.maxRestarts); + }); + + it('forgets restarts that fall outside the sliding window', () => { + const now = 1_000_000; + const { windowMs } = DEFAULT_FRONTEND_RESPAWN_POLICY; + const recent = [ + now - windowMs - 1, // stale + now - windowMs - 2, // stale + now - windowMs - 3, // stale + now - windowMs - 4, // stale + now - 100, // in-window + ]; + const d = evaluateFrontendRespawn(recent, now); + assert.strictEqual(d.restart, true); + // only the one in-window timestamp survives → backoff 500 * 2^1 + assert.strictEqual(d.delayMs, 1000); + assert.deepStrictEqual(d.recent, [now - 100, now]); + }); + + it('honors a custom policy', () => { + const now = 0; + const policy = { maxRestarts: 1, windowMs: 1000, backoffMs: 100, maxBackoffMs: 200 }; + assert.strictEqual(evaluateFrontendRespawn([], now, policy).delayMs, 100); + assert.strictEqual(evaluateFrontendRespawn([now], now, policy).restart, false); + }); +}); + +// ── killFrontendTree (unit, injected spies) ───────────────────────────────── +describe('killFrontendTree — signal routing', () => { + function makeChild(pid: number | undefined) { + const calls: Array = []; + const child: KillableProcess = { + pid, + kill(signal) { calls.push(signal); return true; }, + }; + return { child, calls }; + } + + it('signals the whole process group on POSIX (negative pid)', () => { + const { child, calls } = makeChild(4242); + const groupCalls: Array<[number, NodeJS.Signals]> = []; + killFrontendTree(child, 'SIGTERM', 'linux', (pid, sig) => { groupCalls.push([pid, sig]); }); + assert.deepStrictEqual(groupCalls, [[-4242, 'SIGTERM']]); + assert.deepStrictEqual(calls, []); // direct child.kill not used on POSIX + }); + + it('falls back to a direct child kill on Windows (no process groups)', () => { + const { child, calls } = makeChild(4242); + let groupCalled = false; + killFrontendTree(child, 'SIGTERM', 'win32', () => { groupCalled = true; }); + assert.strictEqual(groupCalled, false); + assert.deepStrictEqual(calls, ['SIGTERM']); + }); + + it('falls back to a direct child kill when the group signal throws', () => { + const { child, calls } = makeChild(4242); + killFrontendTree(child, 'SIGKILL', 'linux', () => { + throw Object.assign(new Error('ESRCH'), { code: 'ESRCH' }); + }); + assert.deepStrictEqual(calls, ['SIGKILL']); + }); + + it('uses a direct child kill when there is no pid', () => { + const { child, calls } = makeChild(undefined); + let groupCalled = false; + killFrontendTree(child, 'SIGTERM', 'linux', () => { groupCalled = true; }); + assert.strictEqual(groupCalled, false); + assert.deepStrictEqual(calls, ['SIGTERM']); + }); + + it('never group-signals pid <= 1 (defensive)', () => { + const { child, calls } = makeChild(1); + let groupCalled = false; + killFrontendTree(child, 'SIGTERM', 'linux', () => { groupCalled = true; }); + assert.strictEqual(groupCalled, false); + assert.deepStrictEqual(calls, ['SIGTERM']); + }); +}); + +// ── killFrontendTree (integration, real shell + grandchild) ───────────────── +describe('killFrontendTree — reaps a real detached shell tree', () => { + it('frees a port held by a grandchild that survives a direct child kill', + { skip: isWindows, timeout: 30000 }, + async () => { + const port = await getFreePort(); + const inner = + `const net=require('net');` + + `const s=net.createServer(c=>c.destroy());` + + `s.on('error',()=>process.exit(1));` + + `s.listen(${port},'127.0.0.1');` + + `setInterval(()=>{},1e9);`; + // Run node as a backgrounded child of the shell (then `wait`): this gives + // the shell→node parent/grandchild topology of `shell: true` without any + // exec-optimization. SIGTERM to the shell does NOT propagate to the + // backgrounded node, so the node is orphaned and keeps the port — exactly + // the leak the fix must reap via a process-group kill. + const cmd = `${JSON.stringify(process.execPath)} -e ${JSON.stringify(inner)} & wait`; + const child = spawn(cmd, { shell: true, detached: true, stdio: 'ignore' }); + + try { + assert.ok(await waitFor(() => isPortOpen(port), 10000), + 'frontend grandchild should bind the port'); + + // Direct kill of only the shell parent (what the old cleanup did): the + // backgrounded node grandchild is orphaned, survives, and keeps the + // port bound — this is exactly the :3100 502 leak. + child.kill('SIGTERM'); + await delay(600); + assert.strictEqual(await isPortOpen(port), true, + 'direct child kill must NOT free the port (demonstrates the orphan bug)'); + + // Group kill reaps the entire tree and frees the port — the fix. + killFrontendTree(child, 'SIGKILL'); + assert.ok(await waitFor(async () => !(await isPortOpen(port)), 10000), + 'group kill must free the port (the fix)'); + } finally { + // Belt-and-suspenders: never leak the node grandchild if an assert throws. + if (child.pid) { try { process.kill(-child.pid, 'SIGKILL'); } catch { /* gone */ } } + } + }); +}); diff --git a/packages/core/src/scripts/dev-server.ts b/packages/core/src/scripts/dev-server.ts index a7397e02..27daf396 100644 --- a/packages/core/src/scripts/dev-server.ts +++ b/packages/core/src/scripts/dev-server.ts @@ -119,6 +119,103 @@ async function waitForPort(port: number, maxAttempts = 60): Promise { throw new Error(`Frontend server on port ${port} did not start within ${maxAttempts * 500}ms`); } +/** Bounded auto-respawn policy for the frontend dev server. */ +export interface FrontendRespawnPolicy { + /** Max restarts allowed within `windowMs` before giving up (prevents hot loops). */ + maxRestarts: number; + /** Sliding window (ms) over which restarts are counted. */ + windowMs: number; + /** Base backoff (ms); doubles for each restart already in the window. */ + backoffMs: number; + /** Upper bound (ms) on any single backoff delay. */ + maxBackoffMs: number; +} + +/** Default frontend respawn budget: 5 restarts / 10s, 500ms→5s exponential backoff. */ +export const DEFAULT_FRONTEND_RESPAWN_POLICY: FrontendRespawnPolicy = { + maxRestarts: 5, + windowMs: 10_000, + backoffMs: 500, + maxBackoffMs: 5_000, +}; + +/** Outcome of {@link evaluateFrontendRespawn}. */ +export interface RespawnDecision { + /** Whether the frontend should be respawned now. */ + restart: boolean; + /** Delay (ms) to wait before respawning when `restart` is true. */ + delayMs: number; + /** + * Restart timestamps still inside the window — plus the new attempt when + * restarting. The caller persists this for the next decision. + */ + recent: number[]; +} + +/** + * Decide whether to auto-respawn the frontend dev server after an unexpected + * exit. Restarts outside the sliding window are forgotten; once the budget is + * exhausted the frontend is left down (no hot restart loop). Otherwise the + * delay grows exponentially with the number of recent restarts, capped at + * `maxBackoffMs`. + */ +export function evaluateFrontendRespawn( + recentRestarts: number[], + now: number, + policy: FrontendRespawnPolicy = DEFAULT_FRONTEND_RESPAWN_POLICY, +): RespawnDecision { + const recent = recentRestarts.filter((t) => now - t < policy.windowMs); + if (recent.length >= policy.maxRestarts) { + return { restart: false, delayMs: 0, recent }; + } + const delayMs = Math.min(policy.backoffMs * 2 ** recent.length, policy.maxBackoffMs); + return { restart: true, delayMs, recent: [...recent, now] }; +} + +/** Minimal child-process surface needed to terminate a frontend dev server. */ +export interface KillableProcess { + pid?: number; + kill(signal?: NodeJS.Signals | number): boolean; +} + +/** + * Terminate a frontend dev server spawned with `shell: true`, including its + * descendants. + * + * Under a shell the real dev server (e.g. Vite) is a **grandchild**: the direct + * child is the shell, so signalling only the shell (`child.kill`) orphans the + * grandchild, which keeps holding its port (`:3100`) and wedges the next + * restart. On POSIX the frontend is spawned `detached` (its own process group, + * pgid === child.pid), so we signal the whole group with + * `process.kill(-pid, signal)` and every descendant dies, freeing the port. + * Windows has no POSIX process groups, so we fall back to `child.kill`. + * + * Best-effort and never throws: a missing/invalid pid, an already-dead group + * (ESRCH), or a failed group signal all degrade to a direct `child.kill`. + */ +export function killFrontendTree( + child: KillableProcess, + signal: NodeJS.Signals = 'SIGTERM', + platform: NodeJS.Platform = process.platform, + killFn: (pid: number, signal: NodeJS.Signals) => void = process.kill, +): void { + const { pid } = child; + // pid > 1 guards against signalling the whole current group (-0) or init (-1). + if (pid && pid > 1 && platform !== 'win32') { + try { + killFn(-pid, signal); + return; + } catch { + // Group already gone or signal failed — fall through to a direct kill. + } + } + try { + child.kill(signal); + } catch { + // Process already exited; nothing to do. + } +} + export async function startDevServer(options: DevServerOptions) { const { port = 3000, @@ -200,6 +297,103 @@ export async function startDevServer(options: DevServerOptions) { (res as ServerResponse).end('Frontend server unavailable'); }); + // ── Frontend supervisor ───────────────────────────────────────────────── + // The frontend runs under `shell: true`, so the real dev server (Vite) is a + // grandchild of this process. We spawn it `detached` (its own process group) + // on POSIX so cleanup/restart can signal the *whole* tree and free the port; + // otherwise the orphaned grandchild keeps `:3100` and every `/` request 502s + // forever (the proxy target is hardcoded to `frontendPort`). We also bound- + // respawn it on unexpected death and suppress all of this during shutdown. + const usePosixProcessGroups = process.platform !== 'win32'; + let isShuttingDown = false; + let frontendRestarts: number[] = []; + let respawnTimer: ReturnType | null = null; + + const announceFrontendReady = async (suffix = ''): Promise => { + try { + await waitForPort(frontendPort); + console.log(`\n ➜ http://localhost:${port}/${suffix}\n`); + } catch (e) { + console.error(`⚠️ Frontend did not start: ${(e as Error).message}`); + console.log(`\n ➜ http://localhost:${port}/ (API only — frontend unavailable)\n`); + } + }; + + const spawnFrontend = (command: string): ChildProcess => { + const child = spawn(command, { + shell: true, + // Own process group on POSIX so we can reap the Vite grandchild too. + detached: usePosixProcessGroups, + stdio: ['ignore', 'pipe', 'pipe'], + env: { ...process.env, NODE_OPTIONS: '' }, + }); + frontendProcess = child; + + // Suppress frontend output — only show errors. + child.stderr?.on('data', (d: Buffer) => { + const msg = d.toString(); + if (!msg.includes('DeprecationWarning')) process.stderr.write(msg); + }); + + child.on('exit', (code, signal) => { + // Ignore exits from a process we've already replaced or torn down. + if (child !== frontendProcess) return; + frontendProcess = null; + if (isShuttingDown) return; + // Reap any orphaned grandchild left in this child's group so `:3100` is + // free before we respawn — otherwise `--strictPort` makes the new Vite + // exit on bind and we'd spin until the restart budget is gone. + killFrontendTree(child, 'SIGKILL'); + + const decision = evaluateFrontendRespawn(frontendRestarts, Date.now()); + frontendRestarts = decision.recent; + const why = `code=${code ?? 'null'}, signal=${signal ?? 'null'}`; + if (!decision.restart) { + console.error( + `⚠️ Frontend dev server exited (${why}) and exceeded ` + + `${DEFAULT_FRONTEND_RESPAWN_POLICY.maxRestarts} restarts within ` + + `${DEFAULT_FRONTEND_RESPAWN_POLICY.windowMs / 1000}s — leaving it down. ` + + `Fix the error above, then restart \`npm run dev\`.`, + ); + return; + } + console.error(`⚠️ Frontend dev server exited (${why}); restarting in ${decision.delayMs}ms…`); + respawnTimer = setTimeout(() => { + respawnTimer = null; + if (isShuttingDown) return; + spawnFrontend(command); + void announceFrontendReady(' (frontend restarted)'); + }, decision.delayMs); + respawnTimer.unref?.(); + }); + + return child; + }; + + /** + * Gracefully terminate the frontend tree and wait (bounded) for it to die so + * the port is actually freed before this process exits. SIGTERM the group, + * then escalate to SIGKILL if it lingers. tsx-watch gives us ~5s before it + * force-kills us, so the ~2s budget here is safe. + */ + const terminateFrontend = async (child: ChildProcess | null): Promise => { + if (!child) return; + if (child.exitCode !== null || child.signalCode !== null) return; + const exited = new Promise((res) => child.once('exit', () => res())); + killFrontendTree(child, 'SIGTERM'); + const lingered = await Promise.race([ + exited.then(() => false), + new Promise((res) => { setTimeout(() => res(true), 1500).unref?.(); }), + ]); + if (lingered) { + killFrontendTree(child, 'SIGKILL'); + await Promise.race([ + exited, + new Promise((res) => { setTimeout(res, 500).unref?.(); }), + ]); + } + }; + // ── API Gateway proxy (sandbox mode) ─────────────────────────────────── // `changeOrigin: true` rewrites the outgoing `Host` to the execute-api target // (required for API Gateway's TLS SNI / host-based routing). That would make @@ -325,29 +519,8 @@ export async function startDevServer(options: DevServerOptions) { // Spawn frontend dev server after Blocks server is ready if (frontendCommand) { - frontendProcess = spawn(frontendCommand, { - shell: true, - stdio: ['ignore', 'pipe', 'pipe'], - env: { ...process.env, NODE_OPTIONS: '' }, - }); - // Suppress frontend output — only show errors - frontendProcess.stderr?.on('data', (d: Buffer) => { - const msg = d.toString(); - if (!msg.includes('DeprecationWarning')) process.stderr.write(msg); - }); - frontendProcess.on('exit', (code) => { - if (code !== 0 && code !== null) { - console.error(`⚠️ Frontend process exited with code ${code}`); - } - }); - - try { - await waitForPort(frontendPort); - console.log(`\n ➜ http://localhost:${port}/\n`); - } catch (e) { - console.error(`⚠️ Frontend did not start: ${(e as Error).message}`); - console.log(`\n ➜ http://localhost:${port}/ (API only — frontend unavailable)\n`); - } + spawnFrontend(frontendCommand); + await announceFrontendReady(); } else { console.log(`\n ➜ http://localhost:${port}/\n`); } @@ -359,9 +532,24 @@ export async function startDevServer(options: DevServerOptions) { }); // ── Cleanup ──────────────────────────────────────────────────────────── + const signals: NodeJS.Signals[] = ['SIGINT', 'SIGTERM', 'SIGHUP']; + let cleaningUp = false; const cleanup = async () => { + if (cleaningUp) return; // idempotent — a second signal must not re-enter + cleaningUp = true; + isShuttingDown = true; // stop the supervisor from respawning the frontend console.log('\nShutting down...'); - if (frontendProcess) frontendProcess.kill('SIGTERM'); + + if (respawnTimer) { clearTimeout(respawnTimer); respawnTimer = null; } + // Detach our own listeners so repeated signals can't pile up handlers. + for (const sig of signals) process.removeListener(sig, cleanup); + + // Kill the frontend process *group* and wait for the port to free before + // we exit, so a tsx-watch restart can rebind `:3100` cleanly. + const child = frontendProcess; + frontendProcess = null; + await terminateFrontend(child); + if (typeof backend.__cleanup === 'function') { try { await backend.__cleanup(); } catch {} } @@ -371,8 +559,17 @@ export async function startDevServer(options: DevServerOptions) { setTimeout(() => process.exit(0), 2000).unref(); }; - process.on('SIGINT', cleanup); - process.on('SIGTERM', cleanup); + for (const sig of signals) process.on(sig, cleanup); + + // Last-resort safety net for paths that bypass `cleanup` (e.g. an uncaught + // exception terminating the process): synchronously reap the detached + // frontend group so a `detached` Vite is never left orphaned on `:3100`. + process.once('exit', () => { + const pid = frontendProcess?.pid; + if (pid && pid > 1 && usePosixProcessGroups) { + try { process.kill(-pid, 'SIGKILL'); } catch { /* already gone */ } + } + }); } // ── Local API handler ──────────────────────────────────────────────────────── From 60ccf41d525f5c339e511576ce8336233f343389 Mon Sep 17 00:00:00 2001 From: Michael Sober Date: Tue, 23 Jun 2026 11:44:04 +0000 Subject: [PATCH 2/6] =?UTF-8?q?fix(dev-server):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20Windows=20taskkill=20reaping,=20failing-only=20rest?= =?UTF-8?q?art=20budget,=20live-child=20exit=20reap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - killFrontendTree reaps the Vite grandchild on Windows via `taskkill /T /F /PID` (POSIX keeps the process-group kill); degrades to child.kill only if taskkill can't spawn. - Reset the auto-respawn budget once a respawn rebinds the port, so only consecutive *failing* restarts count toward the give-up threshold; documented the semantics on evaluateFrontendRespawn. - Gate the process.on('exit') group reap on a still-live child (exitCode/signalCode null) to close the PID-reuse window. - Bind killFn default explicitly: (p, s) => process.kill(p, s). - Tests: add windowsTreeKill + Windows-routing unit tests; poll the "still bound" check; gate the real-process integration test behind RUN_SLOW_TESTS. --- .changeset/dev-server-restart-race-502.md | 9 +- .../src/scripts/dev-server-supervisor.test.ts | 78 ++++++++++-- packages/core/src/scripts/dev-server.ts | 111 ++++++++++++++---- 3 files changed, 168 insertions(+), 30 deletions(-) diff --git a/.changeset/dev-server-restart-race-502.md b/.changeset/dev-server-restart-race-502.md index 5759d5c1..645888d6 100644 --- a/.changeset/dev-server-restart-race-502.md +++ b/.changeset/dev-server-restart-race-502.md @@ -16,10 +16,15 @@ Fixes: - **Process-group kill** — the frontend is spawned `detached` on POSIX (its own process group) and cleanup/restart now signal the entire group via `process.kill(-pid, …)`, reaping the Vite grandchild and freeing `:3100`. - Windows (no POSIX groups) falls back to a direct child kill. + Windows (no POSIX groups) reaps the tree with `taskkill /T /F /PID `, + which walks the child tree by PID so the Vite grandchild is killed too; it + degrades to a direct child kill only if `taskkill` cannot be spawned. - **Bounded auto-respawn** — an unexpected frontend exit now respawns Vite with exponential backoff, capped at 5 restarts / 10s to avoid hot loops, and is - suppressed during intentional shutdown via an `isShuttingDown` guard. + suppressed during intentional shutdown via an `isShuttingDown` guard. The + budget counts only *consecutive failing* restarts: it resets once a respawn + rebinds the port, so a frontend that legitimately restarts many times (e.g. + editor-triggered full reloads) is never permanently left down. - **Robust shutdown** — cleanup is idempotent, wired to `SIGINT`/`SIGTERM`/ `SIGHUP`, removes its own listeners, waits (bounded) for the group to die before exiting (SIGTERM→SIGKILL escalation), and keeps a synchronous diff --git a/packages/core/src/scripts/dev-server-supervisor.test.ts b/packages/core/src/scripts/dev-server-supervisor.test.ts index 5a955c7c..1ca30661 100644 --- a/packages/core/src/scripts/dev-server-supervisor.test.ts +++ b/packages/core/src/scripts/dev-server-supervisor.test.ts @@ -8,10 +8,15 @@ import { evaluateFrontendRespawn, DEFAULT_FRONTEND_RESPAWN_POLICY, killFrontendTree, + windowsTreeKill, type KillableProcess, } from './dev-server.js'; const isWindows = process.platform === 'win32'; +// The real-process integration test spawns OS processes and depends on +// wall-clock timing; gate it behind RUN_SLOW_TESTS so the default test run +// stays deterministic and fast. The pure unit tests below always run. +const runSlowTests = !!process.env.RUN_SLOW_TESTS; function getFreePort(): Promise { return new Promise((resolve, reject) => { @@ -47,6 +52,18 @@ async function waitFor(predicate: () => Promise, timeoutMs: number): Pr const delay = (ms: number) => new Promise((r) => setTimeout(r, ms)); +// Poll across a bounded window, returning false the instant the port closes. +// More robust than a single post-`delay` snapshot on a busy/slow host: it +// asserts the port stays bound for the whole window rather than at one moment. +async function staysOpen(port: number, windowMs: number): Promise { + const deadline = Date.now() + windowMs; + while (Date.now() < deadline) { + if (!(await isPortOpen(port))) return false; + await delay(50); + } + return true; +} + // ── evaluateFrontendRespawn ──────────────────────────────────────────────── describe('evaluateFrontendRespawn — bounded auto-respawn policy', () => { it('allows the first restart with the base backoff', () => { @@ -126,11 +143,23 @@ describe('killFrontendTree — signal routing', () => { assert.deepStrictEqual(calls, []); // direct child.kill not used on POSIX }); - it('falls back to a direct child kill on Windows (no process groups)', () => { + it('reaps the tree via taskkill on Windows (no POSIX group, no direct kill)', () => { const { child, calls } = makeChild(4242); let groupCalled = false; - killFrontendTree(child, 'SIGTERM', 'win32', () => { groupCalled = true; }); - assert.strictEqual(groupCalled, false); + const winPids: number[] = []; + killFrontendTree(child, 'SIGTERM', 'win32', + () => { groupCalled = true; }, + (pid) => { winPids.push(pid); return true; }); + assert.strictEqual(groupCalled, false); // POSIX group kill not used on Windows + assert.deepStrictEqual(winPids, [4242]); // taskkill tree-kill invoked with the pid + assert.deepStrictEqual(calls, []); // no direct child.kill once taskkill succeeded + }); + + it('falls back to a direct child kill on Windows when taskkill cannot run', () => { + const { child, calls } = makeChild(4242); + killFrontendTree(child, 'SIGTERM', 'win32', + () => {}, + () => false); // taskkill unavailable → must degrade to child.kill assert.deepStrictEqual(calls, ['SIGTERM']); }); @@ -159,10 +188,44 @@ describe('killFrontendTree — signal routing', () => { }); }); +// ── windowsTreeKill (unit, injected runner) ───────────────────────────────── +describe('windowsTreeKill — taskkill tree-kill command', () => { + it('invokes `taskkill /T /F /PID ` and reports success', () => { + const runs: Array<[string, readonly string[]]> = []; + const ok = windowsTreeKill(4242, (cmd, args) => { runs.push([cmd, args]); return { status: 0 }; }); + assert.strictEqual(ok, true); + assert.deepStrictEqual(runs, [['taskkill', ['/T', '/F', '/PID', '4242']]]); + }); + + it('treats an already-gone tree (non-zero exit, no spawn error) as handled', () => { + const ok = windowsTreeKill(4242, () => ({ status: 128 })); // 128 = "process not found" + assert.strictEqual(ok, true); + }); + + it('reports failure when taskkill cannot be spawned (caller then falls back)', () => { + const ok = windowsTreeKill(4242, () => ({ status: null, error: new Error('ENOENT') })); + assert.strictEqual(ok, false); + }); + + it('never throws even if the runner throws', () => { + const ok = windowsTreeKill(4242, () => { throw new Error('boom'); }); + assert.strictEqual(ok, false); + }); +}); + // ── killFrontendTree (integration, real shell + grandchild) ───────────────── -describe('killFrontendTree — reaps a real detached shell tree', () => { +// Opt-in: spawns real OS processes and relies on wall-clock timing, so it is +// gated behind RUN_SLOW_TESTS to keep the default `npm test` deterministic. +// POSIX-only because it asserts process-group reaping. +const integrationSkip = !runSlowTests + ? 'set RUN_SLOW_TESTS=1 to run (spawns real processes)' + : isWindows + ? 'POSIX-only (relies on process groups)' + : false; + +describe('killFrontendTree — reaps a real detached shell tree', { skip: integrationSkip }, () => { it('frees a port held by a grandchild that survives a direct child kill', - { skip: isWindows, timeout: 30000 }, + { timeout: 30000 }, async () => { const port = await getFreePort(); const inner = @@ -187,8 +250,9 @@ describe('killFrontendTree — reaps a real detached shell tree', () => { // backgrounded node grandchild is orphaned, survives, and keeps the // port bound — this is exactly the :3100 502 leak. child.kill('SIGTERM'); - await delay(600); - assert.strictEqual(await isPortOpen(port), true, + // Poll across a bounded window instead of a single post-`delay` + // snapshot so a slow/busy host can't race the assertion. + assert.ok(await staysOpen(port, 600), 'direct child kill must NOT free the port (demonstrates the orphan bug)'); // Group kill reaps the entire tree and frees the port — the fix. diff --git a/packages/core/src/scripts/dev-server.ts b/packages/core/src/scripts/dev-server.ts index 27daf396..9fd46186 100644 --- a/packages/core/src/scripts/dev-server.ts +++ b/packages/core/src/scripts/dev-server.ts @@ -5,7 +5,7 @@ import { createServer, type IncomingMessage, type ServerResponse } from 'node:ht import { pathToFileURL, URL } from 'node:url'; import { resolve, dirname, join } from 'node:path'; import { writeFileSync, mkdirSync } from 'node:fs'; -import { spawn, type ChildProcess } from 'node:child_process'; +import { spawn, spawnSync, type ChildProcess } from 'node:child_process'; import { createConnection } from 'node:net'; import httpProxy from 'http-proxy'; import { writeClientCode } from './generate-client.js'; @@ -154,10 +154,24 @@ export interface RespawnDecision { /** * Decide whether to auto-respawn the frontend dev server after an unexpected - * exit. Restarts outside the sliding window are forgotten; once the budget is - * exhausted the frontend is left down (no hot restart loop). Otherwise the - * delay grows exponentially with the number of recent restarts, capped at - * `maxBackoffMs`. + * exit, given the timestamps of restarts not yet "forgiven". + * + * Semantics — the budget counts only *failing* restarts: + * - Timestamps older than `windowMs` are dropped from the sliding window. + * - If `maxRestarts` are still within the window, the budget is exhausted and + * the frontend is left down (no hot restart loop) — `restart: false`. + * - Otherwise `restart: true` with an exponential backoff (`backoffMs` doubled + * per in-window restart, capped at `maxBackoffMs`) and the new attempt + * appended to `recent`. + * + * This function is pure; the *meaning* of the budget is enforced by the caller, + * which **resets `recentRestarts` to `[]` once a respawn demonstrably succeeds** + * (the frontend port becomes bound — see `announceFrontendReady`). As a result + * only *consecutive failing* restarts accumulate toward `maxRestarts`: a + * frontend that legitimately restarts many times in a burst (e.g. + * editor-triggered Vite full reloads) refreshes its budget on each healthy bind + * and is never permanently left down — only a genuine crash loop that never + * rebinds the port trips the limit. */ export function evaluateFrontendRespawn( recentRestarts: number[], @@ -178,35 +192,82 @@ export interface KillableProcess { kill(signal?: NodeJS.Signals | number): boolean; } +/** Subset of {@link import('node:child_process').SpawnSyncReturns} that {@link windowsTreeKill} inspects. */ +interface TreeKillResult { + status: number | null; + error?: Error; +} + +/** + * Force-kill an entire process tree on Windows via `taskkill /T /F /PID `. + * + * Windows has no POSIX process groups, so a bare `child.kill()` only signals the + * spawned shell and orphans the real dev server (the Vite grandchild), which + * keeps holding `:3100` — the very wedge the POSIX process-group kill fixes. + * `taskkill /T` walks the live child tree by PID and terminates every + * descendant; `/F` is required because Windows cannot deliver a graceful + * shutdown to a non-console subtree anyway (Node maps SIGTERM/SIGKILL to + * `TerminateProcess`). + * + * Returns `true` when `taskkill` actually ran — whether it reaped the tree or + * found it already gone (exit 128) — and `false` only when the command could + * not be spawned at all (e.g. not on `PATH`), signalling the caller to fall + * back to a direct `child.kill`. Never throws. + */ +export function windowsTreeKill( + pid: number, + runner: (command: string, args: readonly string[]) => TreeKillResult = (command, args) => + spawnSync(command, args as string[], { stdio: 'ignore', windowsHide: true }), +): boolean { + try { + const { error } = runner('taskkill', ['/T', '/F', '/PID', String(pid)]); + return !error; + } catch { + return false; + } +} + /** * Terminate a frontend dev server spawned with `shell: true`, including its - * descendants. + * descendants, on every platform. * * Under a shell the real dev server (e.g. Vite) is a **grandchild**: the direct * child is the shell, so signalling only the shell (`child.kill`) orphans the * grandchild, which keeps holding its port (`:3100`) and wedges the next - * restart. On POSIX the frontend is spawned `detached` (its own process group, - * pgid === child.pid), so we signal the whole group with - * `process.kill(-pid, signal)` and every descendant dies, freeing the port. - * Windows has no POSIX process groups, so we fall back to `child.kill`. + * restart. + * + * - **POSIX**: the frontend is spawned `detached` (its own process group, + * pgid === child.pid), so we signal the whole group with + * `process.kill(-pid, signal)` and every descendant dies, freeing the port. + * - **Windows**: there are no process groups, so we reap the tree with + * `taskkill /T /F /PID ` (see {@link windowsTreeKill}), which walks the + * child tree by PID. A bare `child.kill` would leave the Vite grandchild + * bound to `:3100`, reproducing the POSIX wedge. * * Best-effort and never throws: a missing/invalid pid, an already-dead group - * (ESRCH), or a failed group signal all degrade to a direct `child.kill`. + * (ESRCH), a failed group signal, or an unavailable `taskkill` all degrade to a + * direct `child.kill`. */ export function killFrontendTree( child: KillableProcess, signal: NodeJS.Signals = 'SIGTERM', platform: NodeJS.Platform = process.platform, - killFn: (pid: number, signal: NodeJS.Signals) => void = process.kill, + killFn: (pid: number, signal: NodeJS.Signals) => void = (p, s) => process.kill(p, s), + winTreeKill: (pid: number) => boolean = windowsTreeKill, ): void { const { pid } = child; // pid > 1 guards against signalling the whole current group (-0) or init (-1). - if (pid && pid > 1 && platform !== 'win32') { - try { - killFn(-pid, signal); + if (pid && pid > 1) { + if (platform !== 'win32') { + try { + killFn(-pid, signal); + return; + } catch { + // Group already gone or signal failed — fall through to a direct kill. + } + } else if (winTreeKill(pid)) { + // taskkill walked the PID tree and reaped the Vite grandchild. return; - } catch { - // Group already gone or signal failed — fall through to a direct kill. } } try { @@ -312,6 +373,11 @@ export async function startDevServer(options: DevServerOptions) { const announceFrontendReady = async (suffix = ''): Promise => { try { await waitForPort(frontendPort); + // The respawn demonstrably succeeded — the port is bound — so reset the + // restart budget. Only *consecutive failing* restarts should count toward + // the give-up threshold; a frontend that legitimately restarts many times + // (e.g. editor-triggered Vite full reloads) thus never gets left down. + frontendRestarts = []; console.log(`\n ➜ http://localhost:${port}/${suffix}\n`); } catch (e) { console.error(`⚠️ Frontend did not start: ${(e as Error).message}`); @@ -565,10 +631,13 @@ export async function startDevServer(options: DevServerOptions) { // exception terminating the process): synchronously reap the detached // frontend group so a `detached` Vite is never left orphaned on `:3100`. process.once('exit', () => { - const pid = frontendProcess?.pid; - if (pid && pid > 1 && usePosixProcessGroups) { - try { process.kill(-pid, 'SIGKILL'); } catch { /* already gone */ } - } + const child = frontendProcess; + if (!child || !child.pid || child.pid <= 1 || !usePosixProcessGroups) return; + // Only reap a still-live child. Once it has exited (exitCode/signalCode set) + // the OS may recycle its PID, so kill(-pid) could signal an unrelated group + // — the classic PID-reuse window. Accepted here as a best-effort last resort. + if (child.exitCode !== null || child.signalCode !== null) return; + try { process.kill(-child.pid, 'SIGKILL'); } catch { /* already gone */ } }); } From b2080d7777d9ebab66b231caecf057d373b0f169 Mon Sep 17 00:00:00 2001 From: Michael Sober Date: Tue, 23 Jun 2026 13:05:50 +0000 Subject: [PATCH 3/6] fix(dev-server): reconcile post-exit group-kill policy across all three sites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer noted the respawn-path kill (inside child.on('exit')) group-kills after the shell has exited with no guard/comment, while the exit-net guards the PID-reuse window by skipping the kill once the child exited — yet that very skip would leave an orphaned grandchild holding :3100 (the bug this PR fixes). Reconcile to ONE coherent policy: best-effort SYNCHRONOUS post-exit group-kill at all three sites (respawn path, terminateFrontend, exit-net), documented once as POST-EXIT GROUP-KILL POLICY. A surviving grandchild keeps the process group non-empty, so POSIX keeps pid reserved as the pgid and kill(-pid) still targets our own group; kills are issued synchronously on observing the exit to keep the PID-reuse window minimal. terminateFrontend now reaps instead of returning early when the shell already exited; the exit-net drops its skip-if-exited guard so it agrees with the other two. Also harden the opt-in integration test: the spawned grandchild now retries on EADDRINUSE (fresh server per attempt) to close the getFreePort() rebind race, and the test asserts the shell parent has exited before the group kill so it exercises the post-exit reap the reconciled policy relies on. --- .changeset/dev-server-restart-race-502.md | 9 ++++ .../src/scripts/dev-server-supervisor.test.ts | 30 ++++++++++-- packages/core/src/scripts/dev-server.ts | 48 ++++++++++++++++--- 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/.changeset/dev-server-restart-race-502.md b/.changeset/dev-server-restart-race-502.md index 645888d6..d7bbe237 100644 --- a/.changeset/dev-server-restart-race-502.md +++ b/.changeset/dev-server-restart-race-502.md @@ -29,6 +29,15 @@ Fixes: `SIGHUP`, removes its own listeners, waits (bounded) for the group to die before exiting (SIGTERM→SIGKILL escalation), and keeps a synchronous `process.on('exit')` safety net so a detached Vite is never left orphaned. +- **Consistent post-exit reaping** — the failure being fixed is the *shell + exiting while the detached grandchild survives*, so every post-exit path + (the respawn handler, graceful shutdown, and the `exit` safety net) now + issues one best-effort process-group kill even after the shell has gone, + rather than skipping it. A surviving grandchild keeps the group's id reserved + on POSIX, so `process.kill(-pid)` still targets our own group; the kills are + issued synchronously on observing the exit to keep the PID-reuse window + minimal. The single rationale lives next to the supervisor as the + "POST-EXIT GROUP-KILL POLICY" so all three sites stay in agreement. `--strictPort` is intentionally retained: the proxy target is hardcoded to `:3100`, so the port is reliably freed rather than letting Vite drift to another diff --git a/packages/core/src/scripts/dev-server-supervisor.test.ts b/packages/core/src/scripts/dev-server-supervisor.test.ts index 1ca30661..2d0ee48a 100644 --- a/packages/core/src/scripts/dev-server-supervisor.test.ts +++ b/packages/core/src/scripts/dev-server-supervisor.test.ts @@ -228,11 +228,24 @@ describe('killFrontendTree — reaps a real detached shell tree', { skip: integr { timeout: 30000 }, async () => { const port = await getFreePort(); + // getFreePort() closes its probe listener before this grandchild binds, + // so on a busy host another process can grab the port in that gap. Retry + // briefly on EADDRINUSE (creating a fresh server each attempt) instead of + // exiting on the first error, so the shell→node→port topology under test + // is reliably established; bail only on a non-transient error or once the + // retry budget (~5s) is exhausted. const inner = `const net=require('net');` + - `const s=net.createServer(c=>c.destroy());` + - `s.on('error',()=>process.exit(1));` + - `s.listen(${port},'127.0.0.1');` + + `const port=${port};` + + `let tries=0;` + + `(function bind(){` + + `const s=net.createServer(c=>c.destroy());` + + `s.on('error',e=>{` + + `if(e.code==='EADDRINUSE'&&tries++<50){setTimeout(bind,100);return;}` + + `process.exit(1);` + + `});` + + `s.listen(port,'127.0.0.1');` + + `})();` + `setInterval(()=>{},1e9);`; // Run node as a backgrounded child of the shell (then `wait`): this gives // the shell→node parent/grandchild topology of `shell: true` without any @@ -255,7 +268,16 @@ describe('killFrontendTree — reaps a real detached shell tree', { skip: integr assert.ok(await staysOpen(port, 600), 'direct child kill must NOT free the port (demonstrates the orphan bug)'); - // Group kill reaps the entire tree and frees the port — the fix. + // The shell parent is now gone, so the group kill below is exercised as + // a POST-EXIT reap — the exact case the reconciled kill policy relies on: + // the orphaned grandchild keeps the process group alive (pgid === the + // exited shell's pid), so `process.kill(-pid)` still targets our group. + assert.ok( + await waitFor(async () => child.exitCode !== null || child.signalCode !== null, 5000), + 'shell parent should have exited after SIGTERM (sets up the post-exit reap)'); + + // Group kill reaps the entire tree and frees the port — the fix. It + // works even though the shell parent has already exited (asserted above). killFrontendTree(child, 'SIGKILL'); assert.ok(await waitFor(async () => !(await isPortOpen(port)), 10000), 'group kill must free the port (the fix)'); diff --git a/packages/core/src/scripts/dev-server.ts b/packages/core/src/scripts/dev-server.ts index 9fd46186..c07181b7 100644 --- a/packages/core/src/scripts/dev-server.ts +++ b/packages/core/src/scripts/dev-server.ts @@ -365,6 +365,29 @@ export async function startDevServer(options: DevServerOptions) { // otherwise the orphaned grandchild keeps `:3100` and every `/` request 502s // forever (the proxy target is hardcoded to `frontendPort`). We also bound- // respawn it on unexpected death and suppress all of this during shutdown. + // + // ── POST-EXIT GROUP-KILL POLICY ───────────────────────────────────────── + // The exact bug this supervisor fixes is the shell *exiting* while the + // detached grandchild survives, orphaned, still holding `:3100`. Reaping that + // orphan REQUIRES a group kill (`process.kill(-pid, …)`) issued *after* the + // shell has already exited — so all three post-exit kill sites below agree: + // the respawn path, `terminateFrontend`, and the `process.on('exit')` net all + // group-kill rather than skip when the shell is already gone. + // + // Why this is safe against the classic `-pid` PID-reuse hazard: + // 1. A surviving grandchild keeps the process group non-empty, so POSIX + // keeps `pid` reserved as the group id — it cannot be recycled as a new + // process id while it is still a live group's id. Hence `-pid` is + // guaranteed to target *our* group precisely when it matters (an orphan + // is still alive in it). + // 2. We only ever issue the kill synchronously, the instant we observe the + // shell's exit — there is no intervening `await` that could let the group + // drain and the pid be recycled — so the residual window is minimal. + // Residual accepted risk: if the ENTIRE group is already gone *and* `pid` has + // since been recycled into a brand-new group leader, `-pid` could signal an + // unrelated group. This is an accepted best-effort trade-off — there is then + // nothing of ours left to reap, whereas skipping the kill would otherwise + // leave `:3100` wedged, which is the failure this PR exists to prevent. const usePosixProcessGroups = process.platform !== 'win32'; let isShuttingDown = false; let frontendRestarts: number[] = []; @@ -408,7 +431,10 @@ export async function startDevServer(options: DevServerOptions) { if (isShuttingDown) return; // Reap any orphaned grandchild left in this child's group so `:3100` is // free before we respawn — otherwise `--strictPort` makes the new Vite - // exit on bind and we'd spin until the restart budget is gone. + // exit on bind and we'd spin until the restart budget is gone. The shell + // has already exited here (we are inside its `exit` handler), so this is a + // post-exit group kill; it is issued synchronously in this handler and is + // safe against PID reuse — see POST-EXIT GROUP-KILL POLICY above. killFrontendTree(child, 'SIGKILL'); const decision = evaluateFrontendRespawn(frontendRestarts, Date.now()); @@ -444,7 +470,15 @@ export async function startDevServer(options: DevServerOptions) { */ const terminateFrontend = async (child: ChildProcess | null): Promise => { if (!child) return; - if (child.exitCode !== null || child.signalCode !== null) return; + if (child.exitCode !== null || child.signalCode !== null) { + // The shell already exited, but a detached grandchild may still be + // orphaned on `:3100` — and the bounded SIGTERM→SIGKILL dance below can't + // run (there is no shell left to wait on). Issue one best-effort group + // kill so the orphan can't keep the port bound, instead of returning and + // leaving it — see POST-EXIT GROUP-KILL POLICY above. + killFrontendTree(child, 'SIGKILL'); + return; + } const exited = new Promise((res) => child.once('exit', () => res())); killFrontendTree(child, 'SIGTERM'); const lingered = await Promise.race([ @@ -633,10 +667,12 @@ export async function startDevServer(options: DevServerOptions) { process.once('exit', () => { const child = frontendProcess; if (!child || !child.pid || child.pid <= 1 || !usePosixProcessGroups) return; - // Only reap a still-live child. Once it has exited (exitCode/signalCode set) - // the OS may recycle its PID, so kill(-pid) could signal an unrelated group - // — the classic PID-reuse window. Accepted here as a best-effort last resort. - if (child.exitCode !== null || child.signalCode !== null) return; + // Reap the group even if the shell itself has already exited — a surviving + // grandchild could still hold `:3100`, and this net is the only thing that + // runs when `cleanup` was bypassed. This agrees with the respawn path and + // `terminateFrontend`: all three group-kill post-exit. The kill is + // synchronous (this is an `exit` handler), keeping the PID-reuse window + // minimal — see POST-EXIT GROUP-KILL POLICY above for the accepted trade-off. try { process.kill(-child.pid, 'SIGKILL'); } catch { /* already gone */ } }); } From 00f02c4103eb0e0dc6ce62b6285d39e735a5b237 Mon Sep 17 00:00:00 2001 From: Michael Sober Date: Thu, 25 Jun 2026 14:50:41 +0000 Subject: [PATCH 4/6] =?UTF-8?q?fix(dev-server):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20shared=20tree-kill,=20sandbox=20drain,=20port-free?= =?UTF-8?q?=20wait,=20budget-reset=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract POSIX group-kill / Windows taskkill / bounded SIGTERM->SIGKILL teardown into a shared process-tree.ts so the dev server, respawn handler, exit safety net, and sandbox all reap identically (no divergent copies). - exit safety net now calls killFrontendTree(child,'SIGKILL') so it reaps the Vite tree on Windows (taskkill) instead of early-returning and leaking it. - sandbox.ts cleanup is async and awaits terminateProcessTree(devServer) before exiting (dev server spawned detached), so its own shutdown drain reaps the detached Vite great-grandchild instead of orphaning it on :3100. - terminateFrontend waits for :3100 to free on BOTH the live and already-exited paths (was skipped post-exit), avoiding a --strictPort EADDRINUSE relaunch race. - announceFrontendReady credits the restart-budget reset only when our own spawned child is the live listener (shouldCreditFrontendReady), so a foreign listener can't neutralize the maxRestarts cap and hot-loop forever. - Tests: add shouldCreditFrontendReady / waitForPortFree / terminateProcessTree unit coverage; real-process integration test stays gated behind RUN_SLOW_TESTS. --- .changeset/dev-server-restart-race-502.md | 33 ++- .../src/scripts/dev-server-supervisor.test.ts | 134 ++++++++++- packages/core/src/scripts/dev-server.ts | 213 +++++++----------- packages/core/src/scripts/process-tree.ts | 154 +++++++++++++ packages/core/src/scripts/sandbox.ts | 20 +- 5 files changed, 417 insertions(+), 137 deletions(-) create mode 100644 packages/core/src/scripts/process-tree.ts diff --git a/.changeset/dev-server-restart-race-502.md b/.changeset/dev-server-restart-race-502.md index d7bbe237..4cfaeb2b 100644 --- a/.changeset/dev-server-restart-race-502.md +++ b/.changeset/dev-server-restart-race-502.md @@ -22,13 +22,22 @@ Fixes: - **Bounded auto-respawn** — an unexpected frontend exit now respawns Vite with exponential backoff, capped at 5 restarts / 10s to avoid hot loops, and is suppressed during intentional shutdown via an `isShuttingDown` guard. The - budget counts only *consecutive failing* restarts: it resets once a respawn - rebinds the port, so a frontend that legitimately restarts many times (e.g. - editor-triggered full reloads) is never permanently left down. + budget counts only *consecutive failing* restarts: it resets only when **our + own** freshly spawned child is the process now bound to the port. A liveness + probe alone cannot tell our Vite from a foreign listener (a leftover Vite or a + second dev server), and crediting a foreign one would make every + `--strictPort`-failing respawn look successful, neutralizing the cap and + hot-looping forever. A frontend that legitimately restarts many times (e.g. + editor-triggered full reloads) is still never permanently left down. - **Robust shutdown** — cleanup is idempotent, wired to `SIGINT`/`SIGTERM`/ - `SIGHUP`, removes its own listeners, waits (bounded) for the group to die - before exiting (SIGTERM→SIGKILL escalation), and keeps a synchronous - `process.on('exit')` safety net so a detached Vite is never left orphaned. + `SIGHUP`, removes its own listeners, and waits (bounded) for the group to die + **and for `:3100` to actually be released** before exiting: SIGTERM→SIGKILL + escalation, then a port-free poll that runs on *both* the live and the + already-exited paths (the post-exit path previously skipped it, so a relaunch + could race the kernel's socket teardown into `--strictPort` `EADDRINUSE`). A + synchronous `process.on('exit')` safety net remains for paths that bypass + cleanup — now routed through the shared tree-kill so it reaps on Windows + (`taskkill`) too instead of early-returning and leaking the Vite tree. - **Consistent post-exit reaping** — the failure being fixed is the *shell exiting while the detached grandchild survives*, so every post-exit path (the respawn handler, graceful shutdown, and the `exit` safety net) now @@ -38,6 +47,18 @@ Fixes: issued synchronously on observing the exit to keep the PID-reuse window minimal. The single rationale lives next to the supervisor as the "POST-EXIT GROUP-KILL POLICY" so all three sites stay in agreement. +- **Sandbox entrypoint parity** — `sandbox.ts` (the sibling dev entrypoint) now + spawns the dev server in its own process group and `await`s a bounded group + teardown on `SIGINT`/`SIGTERM`, replacing the synchronous `kill()` + + `process.exit(0)` that signalled only the shell and exited immediately. The + drain gives the dev server time to run its own shutdown and reap the + *detached* Vite great-grandchild, so the next `npm run sandbox` no longer + races a survivor on `:3100`. +- **Single tree-kill primitive** — the POSIX group-kill, the Windows `taskkill` + tree-kill, and the bounded SIGTERM→SIGKILL teardown now live in one shared + `process-tree.ts` module used by every entrypoint (dev server, respawn + handler, `exit` net, and sandbox), so the reaping behavior can no longer drift + between hand-rolled copies. `--strictPort` is intentionally retained: the proxy target is hardcoded to `:3100`, so the port is reliably freed rather than letting Vite drift to another diff --git a/packages/core/src/scripts/dev-server-supervisor.test.ts b/packages/core/src/scripts/dev-server-supervisor.test.ts index 2d0ee48a..a606a805 100644 --- a/packages/core/src/scripts/dev-server-supervisor.test.ts +++ b/packages/core/src/scripts/dev-server-supervisor.test.ts @@ -7,10 +7,16 @@ import { createConnection, createServer } from 'node:net'; import { evaluateFrontendRespawn, DEFAULT_FRONTEND_RESPAWN_POLICY, + waitForPortFree, + shouldCreditFrontendReady, +} from './dev-server.js'; +import { killFrontendTree, windowsTreeKill, + terminateProcessTree, type KillableProcess, -} from './dev-server.js'; + type AwaitableChild, +} from './process-tree.js'; const isWindows = process.platform === 'win32'; // The real-process integration test spawns OS processes and depends on @@ -287,3 +293,129 @@ describe('killFrontendTree — reaps a real detached shell tree', { skip: integr } }); }); + +// ── shouldCreditFrontendReady (unit) ──────────────────────────────────────── +// Guards the restart-budget reset: a liveness-only port probe must not credit a +// foreign listener, or the maxRestarts cap is neutralized and Vite hot-loops. +describe('shouldCreditFrontendReady — credit only our own live child', () => { + const liveChild = () => ({ exitCode: null as number | null, signalCode: null as NodeJS.Signals | null }); + + it('credits the probe when our spawned child is still the live frontend', () => { + const child = liveChild(); + assert.strictEqual(shouldCreditFrontendReady(child, child), true); + }); + + it('does NOT credit a foreign listener (a different or absent current child)', () => { + const child = liveChild(); + assert.strictEqual(shouldCreditFrontendReady(child, liveChild()), false); + assert.strictEqual(shouldCreditFrontendReady(child, null), false); + }); + + it('does NOT credit a child that already exited (lost the --strictPort bind race)', () => { + const exited = { exitCode: 1 as number | null, signalCode: null as NodeJS.Signals | null }; + assert.strictEqual(shouldCreditFrontendReady(exited, exited), false); + const killed = { exitCode: null as number | null, signalCode: 'SIGKILL' as NodeJS.Signals | null }; + assert.strictEqual(shouldCreditFrontendReady(killed, killed), false); + }); + + it('does NOT credit when there is no child', () => { + assert.strictEqual(shouldCreditFrontendReady(null, null), false); + }); +}); + +// ── terminateProcessTree (unit, injected killTree + sleep) ────────────────── +// The shared SIGTERM->SIGKILL escalation used by both the dev server and the +// sandbox entrypoint. Dependencies are injected so the policy is exercised +// without spawning real processes or real timers. +describe('terminateProcessTree — escalation + post-exit reap', () => { + const immediate = (_ms: number): Promise => Promise.resolve(); + const never = (_ms: number): Promise => new Promise(() => {}); + + function makeAwaitableChild( + initial: { exitCode?: number | null; signalCode?: NodeJS.Signals | null } = {}, + ): { child: AwaitableChild; fireExit: (code?: number | null, signal?: NodeJS.Signals | null) => void } { + let exitListener: (() => void) | null = null; + const child: AwaitableChild = { + pid: 4242, + exitCode: initial.exitCode ?? null, + signalCode: initial.signalCode ?? null, + kill() { return true; }, + once(_event: 'exit', listener: () => void) { exitListener = listener; return child; }, + }; + const fireExit = (code: number | null = 0, signal: NodeJS.Signals | null = null) => { + child.exitCode = code; + child.signalCode = signal; + exitListener?.(); + }; + return { child, fireExit }; + } + + it('issues a single best-effort group SIGKILL when the child already exited', async () => { + const { child } = makeAwaitableChild({ exitCode: 0 }); + const sent: NodeJS.Signals[] = []; + const ok = await terminateProcessTree(child, 1500, (_c, s) => { sent.push(s); }, never); + assert.strictEqual(ok, true); + assert.deepStrictEqual(sent, ['SIGKILL']); // post-exit reap: no SIGTERM, no await + }); + + it('SIGTERMs the tree and resolves without escalating when it exits in time', async () => { + const { child, fireExit } = makeAwaitableChild(); + const sent: NodeJS.Signals[] = []; + const ok = await terminateProcessTree(child, 1500, (_c, s) => { + sent.push(s); + if (s === 'SIGTERM') fireExit(0, null); // clean exit wins the grace race + }, never); + assert.strictEqual(ok, true); + assert.deepStrictEqual(sent, ['SIGTERM']); + }); + + it('escalates to a tree SIGKILL when the child lingers past the grace window', async () => { + const { child, fireExit } = makeAwaitableChild(); + const sent: NodeJS.Signals[] = []; + const ok = await terminateProcessTree(child, 1500, (_c, s) => { + sent.push(s); + if (s === 'SIGKILL') fireExit(null, 'SIGKILL'); // dies only on SIGKILL + }, immediate); + assert.deepStrictEqual(sent, ['SIGTERM', 'SIGKILL']); + assert.strictEqual(ok, true); + }); + + it('reports false when the child is still alive after the SIGKILL grace', async () => { + const { child } = makeAwaitableChild(); + const sent: NodeJS.Signals[] = []; + const ok = await terminateProcessTree(child, 1500, (_c, s) => { sent.push(s); }, immediate); + assert.deepStrictEqual(sent, ['SIGTERM', 'SIGKILL']); + assert.strictEqual(ok, false); + }); +}); + +// ── waitForPortFree (real sockets, fast + deterministic) ──────────────────── +// Asserts the "wait for the port to free before exiting" invariant that the +// post-exit terminate path used to drop. +describe('waitForPortFree — waits for the listener to release the port', () => { + it('returns promptly when nothing holds the port', async () => { + const port = await getFreePort(); + const t0 = Date.now(); + await waitForPortFree(port, 2000); + assert.ok(Date.now() - t0 < 1000, 'should return quickly when the port is already free'); + }); + + it('keeps polling while the port is held, then resolves once it closes', async () => { + const port = await getFreePort(); + const srv = createServer((c) => c.destroy()); + await new Promise((res) => srv.listen(port, '127.0.0.1', () => res())); + + // A short bounded wait must consume ~its whole budget while the port is held; + // it can only return early if it (wrongly) sees the held port as free. + const t0 = Date.now(); + await waitForPortFree(port, 300); + assert.ok(Date.now() - t0 >= 250, 'must keep polling while the port is held'); + + await new Promise((res) => srv.close(() => res())); + + // Once free, a generous-timeout call returns promptly. + const t1 = Date.now(); + await waitForPortFree(port, 3000); + assert.ok(Date.now() - t1 < 1500, 'should resolve soon after the port frees'); + }); +}); diff --git a/packages/core/src/scripts/dev-server.ts b/packages/core/src/scripts/dev-server.ts index c07181b7..7cf3b6e8 100644 --- a/packages/core/src/scripts/dev-server.ts +++ b/packages/core/src/scripts/dev-server.ts @@ -5,7 +5,7 @@ import { createServer, type IncomingMessage, type ServerResponse } from 'node:ht import { pathToFileURL, URL } from 'node:url'; import { resolve, dirname, join } from 'node:path'; import { writeFileSync, mkdirSync } from 'node:fs'; -import { spawn, spawnSync, type ChildProcess } from 'node:child_process'; +import { spawn, type ChildProcess } from 'node:child_process'; import { createConnection } from 'node:net'; import httpProxy from 'http-proxy'; import { writeClientCode } from './generate-client.js'; @@ -22,6 +22,7 @@ import { import { redactToJson } from '../redact.js'; import { buildAndSendEvent } from '../telemetry/client.js'; import { applyDevMigrations } from './external-migrations-step.js'; +import { killFrontendTree, terminateProcessTree } from './process-tree.js'; function toBodyStream(text: string): ReadableStream | null { if (!text) return null; @@ -186,95 +187,54 @@ export function evaluateFrontendRespawn( return { restart: true, delayMs, recent: [...recent, now] }; } -/** Minimal child-process surface needed to terminate a frontend dev server. */ -export interface KillableProcess { - pid?: number; - kill(signal?: NodeJS.Signals | number): boolean; -} - -/** Subset of {@link import('node:child_process').SpawnSyncReturns} that {@link windowsTreeKill} inspects. */ -interface TreeKillResult { - status: number | null; - error?: Error; -} - /** - * Force-kill an entire process tree on Windows via `taskkill /T /F /PID `. - * - * Windows has no POSIX process groups, so a bare `child.kill()` only signals the - * spawned shell and orphans the real dev server (the Vite grandchild), which - * keeps holding `:3100` — the very wedge the POSIX process-group kill fixes. - * `taskkill /T` walks the live child tree by PID and terminates every - * descendant; `/F` is required because Windows cannot deliver a graceful - * shutdown to a non-console subtree anyway (Node maps SIGTERM/SIGKILL to - * `TerminateProcess`). - * - * Returns `true` when `taskkill` actually ran — whether it reaped the tree or - * found it already gone (exit 128) — and `false` only when the command could - * not be spawned at all (e.g. not on `PATH`), signalling the caller to fall - * back to a direct `child.kill`. Never throws. + * Wait (bounded) for a TCP port to STOP accepting connections, i.e. for the + * listener to actually release the socket. Used after killing the frontend so a + * `tsx watch` relaunch can rebind `:3100` cleanly instead of racing the kernel's + * socket teardown and hitting `--strictPort` `EADDRINUSE`. Resolves as soon as + * the port is free, or once `timeoutMs` elapses (never rejects). */ -export function windowsTreeKill( - pid: number, - runner: (command: string, args: readonly string[]) => TreeKillResult = (command, args) => - spawnSync(command, args as string[], { stdio: 'ignore', windowsHide: true }), -): boolean { - try { - const { error } = runner('taskkill', ['/T', '/F', '/PID', String(pid)]); - return !error; - } catch { - return false; +export async function waitForPortFree(port: number, timeoutMs = 2000): Promise { + const { setTimeout: sleep } = await import('node:timers/promises'); + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + const open = await new Promise((resolve) => { + const socket = createConnection({ port, host: 'localhost' }, () => { + socket.destroy(); + resolve(true); + }); + socket.on('error', () => { socket.destroy(); resolve(false); }); + socket.setTimeout(200, () => { socket.destroy(); resolve(false); }); + }); + if (!open) return; + await sleep(100); } } /** - * Terminate a frontend dev server spawned with `shell: true`, including its - * descendants, on every platform. - * - * Under a shell the real dev server (e.g. Vite) is a **grandchild**: the direct - * child is the shell, so signalling only the shell (`child.kill`) orphans the - * grandchild, which keeps holding its port (`:3100`) and wedges the next - * restart. - * - * - **POSIX**: the frontend is spawned `detached` (its own process group, - * pgid === child.pid), so we signal the whole group with - * `process.kill(-pid, signal)` and every descendant dies, freeing the port. - * - **Windows**: there are no process groups, so we reap the tree with - * `taskkill /T /F /PID ` (see {@link windowsTreeKill}), which walks the - * child tree by PID. A bare `child.kill` would leave the Vite grandchild - * bound to `:3100`, reproducing the POSIX wedge. + * Decide whether a "frontend is listening" probe should be *credited* as a + * successful (re)spawn — and thus reset the restart budget. * - * Best-effort and never throws: a missing/invalid pid, an already-dead group - * (ESRCH), a failed group signal, or an unavailable `taskkill` all degrade to a - * direct `child.kill`. + * `waitForPort` only proves *something* is listening on `:3100`; it cannot tell + * our Vite apart from a foreign listener (a leftover Vite, or a second dev + * server). Crediting any listener would let a foreign process on `:3100` make + * every `--strictPort`-failing respawn look successful, neutralizing the + * `maxRestarts` cap and hot-looping forever. So we credit the probe only when + * **our** spawned child is still the live frontend process — same identity and + * not yet exited. A child that already exited (e.g. it lost the `--strictPort` + * bind race to the foreign listener) is no longer `current`, so it is not + * credited and its failed attempt still counts toward the budget. */ -export function killFrontendTree( - child: KillableProcess, - signal: NodeJS.Signals = 'SIGTERM', - platform: NodeJS.Platform = process.platform, - killFn: (pid: number, signal: NodeJS.Signals) => void = (p, s) => process.kill(p, s), - winTreeKill: (pid: number) => boolean = windowsTreeKill, -): void { - const { pid } = child; - // pid > 1 guards against signalling the whole current group (-0) or init (-1). - if (pid && pid > 1) { - if (platform !== 'win32') { - try { - killFn(-pid, signal); - return; - } catch { - // Group already gone or signal failed — fall through to a direct kill. - } - } else if (winTreeKill(pid)) { - // taskkill walked the PID tree and reaped the Vite grandchild. - return; - } - } - try { - child.kill(signal); - } catch { - // Process already exited; nothing to do. - } +export function shouldCreditFrontendReady( + child: { exitCode: number | null; signalCode: NodeJS.Signals | null } | null, + current: unknown, +): boolean { + return ( + !!child && + child === current && + child.exitCode === null && + child.signalCode === null + ); } export async function startDevServer(options: DevServerOptions) { @@ -393,14 +353,21 @@ export async function startDevServer(options: DevServerOptions) { let frontendRestarts: number[] = []; let respawnTimer: ReturnType | null = null; - const announceFrontendReady = async (suffix = ''): Promise => { + const announceFrontendReady = async (child: ChildProcess | null, suffix = ''): Promise => { try { await waitForPort(frontendPort); - // The respawn demonstrably succeeded — the port is bound — so reset the - // restart budget. Only *consecutive failing* restarts should count toward - // the give-up threshold; a frontend that legitimately restarts many times - // (e.g. editor-triggered Vite full reloads) thus never gets left down. - frontendRestarts = []; + // Reset the restart budget only when OUR child is the one now bound to + // `:3100`. `waitForPort` is a liveness-only probe — it cannot tell our + // Vite from a foreign listener (a leftover Vite or a second dev server), + // and crediting a foreign listener would make every `--strictPort`-failing + // respawn look successful, neutralizing the `maxRestarts` cap and + // hot-looping forever (see {@link shouldCreditFrontendReady}). Only + // *consecutive failing* restarts should count toward the give-up + // threshold, so a frontend that legitimately restarts many times (e.g. + // editor-triggered Vite full reloads) still never gets left down. + if (shouldCreditFrontendReady(child, frontendProcess)) { + frontendRestarts = []; + } console.log(`\n ➜ http://localhost:${port}/${suffix}\n`); } catch (e) { console.error(`⚠️ Frontend did not start: ${(e as Error).message}`); @@ -453,8 +420,8 @@ export async function startDevServer(options: DevServerOptions) { respawnTimer = setTimeout(() => { respawnTimer = null; if (isShuttingDown) return; - spawnFrontend(command); - void announceFrontendReady(' (frontend restarted)'); + const child = spawnFrontend(command); + void announceFrontendReady(child, ' (frontend restarted)'); }, decision.delayMs); respawnTimer.unref?.(); }); @@ -463,35 +430,24 @@ export async function startDevServer(options: DevServerOptions) { }; /** - * Gracefully terminate the frontend tree and wait (bounded) for it to die so - * the port is actually freed before this process exits. SIGTERM the group, - * then escalate to SIGKILL if it lingers. tsx-watch gives us ~5s before it - * force-kills us, so the ~2s budget here is safe. + * Gracefully terminate the frontend tree and wait (bounded) for the port to + * actually free before this process exits, so a `tsx watch` relaunch can + * rebind `:3100` cleanly. SIGTERM the group, escalate to SIGKILL if it lingers + * (via the shared {@link terminateProcessTree}), then poll until `:3100` is + * released. tsx-watch gives us ~5s before it force-kills us, so this budget is + * safe. Crucially the port-free wait runs on *both* paths — including when the + * shell has already exited — so the post-exit branch no longer drops the + * "wait for the port to free" guarantee. */ const terminateFrontend = async (child: ChildProcess | null): Promise => { if (!child) return; - if (child.exitCode !== null || child.signalCode !== null) { - // The shell already exited, but a detached grandchild may still be - // orphaned on `:3100` — and the bounded SIGTERM→SIGKILL dance below can't - // run (there is no shell left to wait on). Issue one best-effort group - // kill so the orphan can't keep the port bound, instead of returning and - // leaving it — see POST-EXIT GROUP-KILL POLICY above. - killFrontendTree(child, 'SIGKILL'); - return; - } - const exited = new Promise((res) => child.once('exit', () => res())); - killFrontendTree(child, 'SIGTERM'); - const lingered = await Promise.race([ - exited.then(() => false), - new Promise((res) => { setTimeout(() => res(true), 1500).unref?.(); }), - ]); - if (lingered) { - killFrontendTree(child, 'SIGKILL'); - await Promise.race([ - exited, - new Promise((res) => { setTimeout(res, 500).unref?.(); }), - ]); - } + // SIGTERM→SIGKILL the whole tree, reaping the detached Vite grandchild even + // when the shell has already exited (post-exit group kill — see policy). + await terminateProcessTree(child, 1500); + // Then wait (bounded) for `:3100` to be released. The old post-exit branch + // returned right after SIGKILL with no port poll, so a relaunch could race + // the kernel's socket teardown and hit `--strictPort` `EADDRINUSE`. + await waitForPortFree(frontendPort); }; // ── API Gateway proxy (sandbox mode) ─────────────────────────────────── @@ -619,8 +575,8 @@ export async function startDevServer(options: DevServerOptions) { // Spawn frontend dev server after Blocks server is ready if (frontendCommand) { - spawnFrontend(frontendCommand); - await announceFrontendReady(); + const child = spawnFrontend(frontendCommand); + await announceFrontendReady(child); } else { console.log(`\n ➜ http://localhost:${port}/\n`); } @@ -662,18 +618,19 @@ export async function startDevServer(options: DevServerOptions) { for (const sig of signals) process.on(sig, cleanup); // Last-resort safety net for paths that bypass `cleanup` (e.g. an uncaught - // exception terminating the process): synchronously reap the detached - // frontend group so a `detached` Vite is never left orphaned on `:3100`. + // exception terminating the process): synchronously reap the frontend tree so + // a `detached` Vite is never left orphaned on `:3100`. Reuses + // `killFrontendTree`, so unlike the old hand-rolled `process.kill(-pid)` it + // also reaps on Windows (via `taskkill`) instead of early-returning and + // leaking the Vite tree, and stays in lockstep with the other kill sites. Both + // the POSIX group kill and the Windows `taskkill` are synchronous, so this is + // legal in an `exit` handler; it reaps even when the shell has already exited + // (a surviving grandchild keeps the group alive) — see POST-EXIT GROUP-KILL + // POLICY above. process.once('exit', () => { const child = frontendProcess; - if (!child || !child.pid || child.pid <= 1 || !usePosixProcessGroups) return; - // Reap the group even if the shell itself has already exited — a surviving - // grandchild could still hold `:3100`, and this net is the only thing that - // runs when `cleanup` was bypassed. This agrees with the respawn path and - // `terminateFrontend`: all three group-kill post-exit. The kill is - // synchronous (this is an `exit` handler), keeping the PID-reuse window - // minimal — see POST-EXIT GROUP-KILL POLICY above for the accepted trade-off. - try { process.kill(-child.pid, 'SIGKILL'); } catch { /* already gone */ } + if (!child) return; + killFrontendTree(child, 'SIGKILL'); }); } diff --git a/packages/core/src/scripts/process-tree.ts b/packages/core/src/scripts/process-tree.ts new file mode 100644 index 00000000..fe1326d4 --- /dev/null +++ b/packages/core/src/scripts/process-tree.ts @@ -0,0 +1,154 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawnSync } from 'node:child_process'; + +// Shared process-tree teardown primitives used by every dev-tooling entrypoint +// (the dev server and the sandbox). Both spawn a long-running command with +// `shell: true`, so the real process (Vite, or the `tsx watch` dev server) is a +// grandchild of the shell. Reaping it requires killing the whole tree, not just +// the shell parent — see the per-function docs. Keeping this in one module means +// the dev server, the sandbox, and the `process.on('exit')` safety net all reap +// identically instead of hand-rolling divergent copies. + +/** Minimal child-process surface needed to terminate a frontend dev server. */ +export interface KillableProcess { + pid?: number; + kill(signal?: NodeJS.Signals | number): boolean; +} + +/** Subset of {@link import('node:child_process').SpawnSyncReturns} that {@link windowsTreeKill} inspects. */ +interface TreeKillResult { + status: number | null; + error?: Error; +} + +/** + * Force-kill an entire process tree on Windows via `taskkill /T /F /PID `. + * + * Windows has no POSIX process groups, so a bare `child.kill()` only signals the + * spawned shell and orphans the real dev server (the Vite grandchild), which + * keeps holding `:3100` — the very wedge the POSIX process-group kill fixes. + * `taskkill /T` walks the live child tree by PID and terminates every + * descendant; `/F` is required because Windows cannot deliver a graceful + * shutdown to a non-console subtree anyway (Node maps SIGTERM/SIGKILL to + * `TerminateProcess`). + * + * Returns `true` when `taskkill` actually ran — whether it reaped the tree or + * found it already gone (exit 128) — and `false` only when the command could + * not be spawned at all (e.g. not on `PATH`), signalling the caller to fall + * back to a direct `child.kill`. Never throws. + */ +export function windowsTreeKill( + pid: number, + runner: (command: string, args: readonly string[]) => TreeKillResult = (command, args) => + spawnSync(command, args as string[], { stdio: 'ignore', windowsHide: true }), +): boolean { + try { + const { error } = runner('taskkill', ['/T', '/F', '/PID', String(pid)]); + return !error; + } catch { + return false; + } +} + +/** + * Terminate a process spawned with `shell: true`, including its descendants, on + * every platform. + * + * Under a shell the real dev server (e.g. Vite) is a **grandchild**: the direct + * child is the shell, so signalling only the shell (`child.kill`) orphans the + * grandchild, which keeps holding its port (`:3100`) and wedges the next + * restart. + * + * - **POSIX**: the process is spawned `detached` (its own process group, + * pgid === child.pid), so we signal the whole group with + * `process.kill(-pid, signal)` and every descendant dies, freeing the port. + * - **Windows**: there are no process groups, so we reap the tree with + * `taskkill /T /F /PID ` (see {@link windowsTreeKill}), which walks the + * child tree by PID. A bare `child.kill` would leave the Vite grandchild + * bound to `:3100`, reproducing the POSIX wedge. + * + * Best-effort and never throws: a missing/invalid pid, an already-dead group + * (ESRCH), a failed group signal, or an unavailable `taskkill` all degrade to a + * direct `child.kill`. + */ +export function killFrontendTree( + child: KillableProcess, + signal: NodeJS.Signals = 'SIGTERM', + platform: NodeJS.Platform = process.platform, + killFn: (pid: number, signal: NodeJS.Signals) => void = (p, s) => process.kill(p, s), + winTreeKill: (pid: number) => boolean = windowsTreeKill, +): void { + const { pid } = child; + // pid > 1 guards against signalling the whole current group (-0) or init (-1). + if (pid && pid > 1) { + if (platform !== 'win32') { + try { + killFn(-pid, signal); + return; + } catch { + // Group already gone or signal failed — fall through to a direct kill. + } + } else if (winTreeKill(pid)) { + // taskkill walked the PID tree and reaped the Vite grandchild. + return; + } + } + try { + child.kill(signal); + } catch { + // Process already exited; nothing to do. + } +} + +/** Child surface {@link terminateProcessTree} needs: a tree to kill plus exit state to await. */ +export interface AwaitableChild extends KillableProcess { + exitCode: number | null; + signalCode: NodeJS.Signals | null; + once(event: 'exit', listener: () => void): unknown; +} + +const defaultSleep = (ms: number): Promise => + new Promise((res) => { + setTimeout(res, ms).unref?.(); + }); + +/** + * Terminate a child process *tree* and wait — bounded — for the child to exit, + * escalating SIGTERM → SIGKILL. Reuses {@link killFrontendTree} so every + * entrypoint reaps the same way (POSIX process-group kill / Windows `taskkill`) + * instead of hand-rolling its own group kill. + * + * Post-exit policy: if the child has *already* exited, a detached grandchild may + * still be orphaned (still holding a port), so we issue one best-effort group + * SIGKILL to reap it rather than returning blind — see the dev server's + * "POST-EXIT GROUP-KILL POLICY". Otherwise we SIGTERM the tree, wait up to + * `graceMs` for a clean exit, then SIGKILL the tree and wait a short grace. + * + * Resolves `true` once the child has exited (or was already gone), `false` if it + * was still alive when the budget elapsed. Dependencies are injected for tests. + */ +export async function terminateProcessTree( + child: AwaitableChild, + graceMs = 2000, + killTree: (c: KillableProcess, signal: NodeJS.Signals) => void = killFrontendTree, + sleep: (ms: number) => Promise = defaultSleep, +): Promise { + if (child.exitCode !== null || child.signalCode !== null) { + // Shell already exited — a detached grandchild may still hold its port, so + // reap the group best-effort instead of returning blind. + killTree(child, 'SIGKILL'); + return true; + } + const exited = new Promise((res) => child.once('exit', () => res())); + killTree(child, 'SIGTERM'); + const exitedCleanly = await Promise.race([ + exited.then(() => true), + sleep(graceMs).then(() => false), + ]); + if (exitedCleanly) return true; + killTree(child, 'SIGKILL'); + await Promise.race([exited, sleep(500)]); + return child.exitCode !== null || child.signalCode !== null; +} diff --git a/packages/core/src/scripts/sandbox.ts b/packages/core/src/scripts/sandbox.ts index cd8fceb0..e17f573c 100644 --- a/packages/core/src/scripts/sandbox.ts +++ b/packages/core/src/scripts/sandbox.ts @@ -11,6 +11,7 @@ import { trackCommand } from '../telemetry/trackCommand.js'; import { buildAndSendEvent } from '../telemetry/client.js'; import { getCdkTelemetryEnv } from './cdk-telemetry-env.js'; import { runSync, spawnCommand } from './run-command.js'; +import { terminateProcessTree } from './process-tree.js'; /** * Import the backend definition to populate the Scope BB registry. @@ -166,6 +167,12 @@ export async function startSandbox(options: SandboxOptions) { const devServer = spawnCommand(cmd, args, { stdio: "inherit", shell: true, + // Own process group on POSIX so cleanup can signal the whole dev-server + // tree (shell → tsx → node). The node dev server then runs its own SIGTERM + // handler — the ~2s terminateFrontend drain that reaps the *detached* Vite + // great-grandchild — which a bare `devServer.kill()` (the shell only) never + // triggers. Windows has no groups; terminateProcessTree reaps via taskkill. + detached: process.platform !== 'win32', env: { ...process.env, NODE_OPTIONS: '', @@ -173,12 +180,21 @@ export async function startSandbox(options: SandboxOptions) { }, }); - const cleanup = () => { + let cleaningUp = false; + const cleanup = async () => { + if (cleaningUp) return; // idempotent — a second signal must not re-enter + cleaningUp = true; console.log("\n\n🛑 Stopping local processes..."); console.log(" (AWS resources are still running)"); console.log("\n To destroy AWS resources, run: npm run sandbox:destroy\n"); cdkWatch.kill(); - devServer.kill(); + // The dev server owns a *detached* Vite tree that only its own SIGTERM + // handler reaps (a ~2s terminateFrontend drain). Killing just the shell and + // exiting immediately would orphan Vite on :3100 and wedge the next run with + // --strictPort/502, so signal the dev-server process group and AWAIT that + // drain. Bounded at 6s (> the ~2s drain): a hung dev server escalates to a + // tree SIGKILL and we exit regardless, so shutdown can never wedge. + await terminateProcessTree(devServer, 6000); process.exit(0); }; From 71a4cc5079c788c72c6212e3ea3d368615960ba5 Mon Sep 17 00:00:00 2001 From: Michael Sober Date: Fri, 26 Jun 2026 08:55:29 +0000 Subject: [PATCH 5/6] =?UTF-8?q?fix(dev-server):=20address=20PR=20review=20?= =?UTF-8?q?round=203=20=E2=80=94=20cdk-watch=20group-reap,=20respawn=20por?= =?UTF-8?q?t-free=20wait,=20KILL=5FGRACE=5FMS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - sandbox.ts: spawn `cdk watch` in its own process group on POSIX (detached) and reap it via the shared `terminateProcessTree` instead of a bare `cdkWatch.kill()` that signalled only the npx shell and could orphan the real cdk-watch node (npx → cdk → node). Both child trees are now reaped concurrently; only the dev-server child owns the :3100 port-free drain via its own SIGTERM handler. - process-tree.ts: tighten `terminateProcessTree` JSDoc — the boolean reflects only the DIRECT CHILD's exit, not whole-group teardown or port release; callers needing a freed port must follow with a port-free wait. No change to the boolean's semantics. - process-tree.ts: extract the hardcoded 500ms post-SIGKILL grace into a named `KILL_GRACE_MS` constant with a comment explaining why it is shorter than and decoupled from the injectable SIGTERM `graceMs` (SIGKILL is uncatchable). Still injectable for tests via the `sleep` seam. - dev-server.ts: the auto-respawn timer now `await waitForPortFree( frontendPort)` before relaunching (parity with the graceful `terminateFrontend` path), re-checking the `isShuttingDown` guard after the bounded wait so a relaunch never races the kernel's socket teardown into a `--strictPort` EADDRINUSE. Budget is debited once at exit time, so no double-count; the wait is bounded, so no deadlock. - tests: add a KILL_GRACE_MS unit test (recording-sleep spy asserts the SIGTERM grace uses graceMs and the post-SIGKILL grace uses KILL_GRACE_MS) and a RUN_SLOW_TESTS integration test that reaps a detached non-shell npx→node tree (the cdk-watch shape) via terminateProcessTree post-exit. - changeset: document round-3 behavior. --- .changeset/dev-server-restart-race-502.md | 32 ++++++-- .../src/scripts/dev-server-supervisor.test.ts | 82 +++++++++++++++++++ packages/core/src/scripts/dev-server.ts | 17 +++- packages/core/src/scripts/process-tree.ts | 27 +++++- packages/core/src/scripts/sandbox.ts | 31 +++++-- 5 files changed, 168 insertions(+), 21 deletions(-) diff --git a/.changeset/dev-server-restart-race-502.md b/.changeset/dev-server-restart-race-502.md index 4cfaeb2b..93950a51 100644 --- a/.changeset/dev-server-restart-race-502.md +++ b/.changeset/dev-server-restart-race-502.md @@ -28,7 +28,13 @@ Fixes: second dev server), and crediting a foreign one would make every `--strictPort`-failing respawn look successful, neutralizing the cap and hot-looping forever. A frontend that legitimately restarts many times (e.g. - editor-triggered full reloads) is still never permanently left down. + editor-triggered full reloads) is still never permanently left down. Before + each relaunch the supervisor now also waits (bounded) for `:3100` to be + released — the same port-free drain the graceful shutdown path uses — so a slow + socket teardown can't hand the relaunched `--strictPort` Vite an `EADDRINUSE` + and burn a restart-budget slot; that wait re-checks the `isShuttingDown` guard, + so a shutdown arriving mid-wait still cancels the relaunch (and the budget is + debited once, at exit time, so the wait never double-counts a restart). - **Robust shutdown** — cleanup is idempotent, wired to `SIGINT`/`SIGTERM`/ `SIGHUP`, removes its own listeners, and waits (bounded) for the group to die **and for `:3100` to actually be released** before exiting: SIGTERM→SIGKILL @@ -48,17 +54,27 @@ Fixes: minimal. The single rationale lives next to the supervisor as the "POST-EXIT GROUP-KILL POLICY" so all three sites stay in agreement. - **Sandbox entrypoint parity** — `sandbox.ts` (the sibling dev entrypoint) now - spawns the dev server in its own process group and `await`s a bounded group - teardown on `SIGINT`/`SIGTERM`, replacing the synchronous `kill()` + - `process.exit(0)` that signalled only the shell and exited immediately. The - drain gives the dev server time to run its own shutdown and reap the - *detached* Vite great-grandchild, so the next `npm run sandbox` no longer - races a survivor on `:3100`. + spawns **both** long-running children — the dev server *and* `cdk watch` — in + their own process groups and `await`s a bounded group teardown for each + (run concurrently) on `SIGINT`/`SIGTERM`, replacing the synchronous + `cdkWatch.kill()` + single dev-server `kill()` + `process.exit(0)` that + signalled only the npx/shell parents and exited immediately. A bare + `cdkWatch.kill()` could orphan the real `cdk watch` node process + (npx → cdk → node) — the same shell-only-kill leak this PR fixes for the dev + server — so it now routes through the shared `terminateProcessTree` too. Only + the dev-server drain (the longer 6s budget) owns the `:3100` port-free wait, via + its own SIGTERM handler, so the next `npm run sandbox` no longer races a + survivor on `:3100`. - **Single tree-kill primitive** — the POSIX group-kill, the Windows `taskkill` tree-kill, and the bounded SIGTERM→SIGKILL teardown now live in one shared `process-tree.ts` module used by every entrypoint (dev server, respawn handler, `exit` net, and sandbox), so the reaping behavior can no longer drift - between hand-rolled copies. + between hand-rolled copies. Its bounded teardown documents that its boolean + reflects only the **direct child's** exit (not whole-group teardown or port + release — callers needing a freed port must follow with `waitForPortFree`), and + its post-SIGKILL grace is a named `KILL_GRACE_MS` constant kept deliberately + shorter than the injectable SIGTERM grace (SIGKILL is uncatchable, so only a + brief beat is needed to observe the exit). `--strictPort` is intentionally retained: the proxy target is hardcoded to `:3100`, so the port is reliably freed rather than letting Vite drift to another diff --git a/packages/core/src/scripts/dev-server-supervisor.test.ts b/packages/core/src/scripts/dev-server-supervisor.test.ts index a606a805..5aa2dfe4 100644 --- a/packages/core/src/scripts/dev-server-supervisor.test.ts +++ b/packages/core/src/scripts/dev-server-supervisor.test.ts @@ -14,6 +14,7 @@ import { killFrontendTree, windowsTreeKill, terminateProcessTree, + KILL_GRACE_MS, type KillableProcess, type AwaitableChild, } from './process-tree.js'; @@ -294,6 +295,71 @@ describe('killFrontendTree — reaps a real detached shell tree', { skip: integr }); }); +// ── terminateProcessTree (integration, detached NON-shell tree) ───────────── +// Mirrors the sandbox `cdk watch` topology (npx → cdk → node, no shell) that +// finding #1 switched from a bare `cdkWatch.kill()` to `terminateProcessTree`. +// Opt-in + POSIX-only for the same reasons as the shell-tree test above. +describe('terminateProcessTree — reaps a detached non-shell tree (cdk-watch shape)', { skip: integrationSkip }, () => { + it('group-reaps an npx-style parent whose grandchild holds a port, even post-exit', + { timeout: 30000 }, + async () => { + const port = await getFreePort(); + // Grandchild (stands in for cdk-watch's node): binds the port and idles. + // Retry on EADDRINUSE like the shell-tree test, since getFreePort() + // releases its probe listener before this grandchild can rebind. + const inner = + `const net=require('net');` + + `const port=${port};` + + `let tries=0;` + + `(function bind(){` + + `const s=net.createServer(c=>c.destroy());` + + `s.on('error',e=>{` + + `if(e.code==='EADDRINUSE'&&tries++<50){setTimeout(bind,100);return;}` + + `process.exit(1);` + + `});` + + `s.listen(port,'127.0.0.1');` + + `})();` + + `setInterval(()=>{},1e9);`; + // Parent (stands in for `npx`): spawns the port-binding grandchild as a + // normal (NON-detached) child, then idles. No shell anywhere — the + // cdk-watch shape. Spawning the PARENT `detached` makes it a process-group + // leader (pgid === parent.pid) that the grandchild inherits, so a single + // `process.kill(-pid)` reaches both. + const parent = + `const cp=require('child_process');` + + `cp.spawn(process.execPath,['-e',${JSON.stringify(inner)}],{stdio:'ignore'});` + + `setInterval(()=>{},1e9);`; + const child = spawn(process.execPath, ['-e', parent], { detached: true, stdio: 'ignore' }); + + try { + assert.ok(await waitFor(() => isPortOpen(port), 10000), + 'grandchild should bind the port'); + + // A direct kill of ONLY the parent (what the old bare `cdkWatch.kill()` + // did) orphans the grandchild, which survives still holding the port — + // the exact shell/parent-only-kill leak finding #1 eliminates. + child.kill('SIGTERM'); + assert.ok(await staysOpen(port, 600), + 'direct parent kill must NOT free the port (demonstrates the orphan)'); + assert.ok( + await waitFor(async () => child.exitCode !== null || child.signalCode !== null, 5000), + 'parent should have exited after SIGTERM (sets up the post-exit reap)'); + + // terminateProcessTree now takes the post-exit branch (parent already + // gone) and issues a group SIGKILL via the shared killFrontendTree, + // reaping the orphaned grandchild and freeing the port — what the new + // `terminateProcessTree(cdkWatch, …)` call guarantees in sandbox.ts. + const reaped = await terminateProcessTree(child, 2000); + assert.strictEqual(reaped, true, 'post-exit terminateProcessTree resolves true'); + assert.ok(await waitFor(async () => !(await isPortOpen(port)), 10000), + 'group kill must free the port held by the grandchild (the fix)'); + } finally { + // Belt-and-suspenders: never leak the grandchild if an assert throws. + if (child.pid) { try { process.kill(-child.pid, 'SIGKILL'); } catch { /* gone */ } } + } + }); +}); + // ── shouldCreditFrontendReady (unit) ──────────────────────────────────────── // Guards the restart-budget reset: a liveness-only port probe must not credit a // foreign listener, or the maxRestarts cap is neutralized and Vite hot-loops. @@ -387,6 +453,22 @@ describe('terminateProcessTree — escalation + post-exit reap', () => { assert.deepStrictEqual(sent, ['SIGTERM', 'SIGKILL']); assert.strictEqual(ok, false); }); + + it('waits graceMs for the SIGTERM grace and the shorter KILL_GRACE_MS after SIGKILL', async () => { + const { child } = makeAwaitableChild(); + const sent: NodeJS.Signals[] = []; + const slept: number[] = []; + // Resolve every sleep immediately (so both grace races resolve) but record + // the requested durations to prove which grace each escalation step uses. + const recordingSleep = (ms: number): Promise => { slept.push(ms); return Promise.resolve(); }; + const ok = await terminateProcessTree(child, 1500, (_c, s) => { sent.push(s); }, recordingSleep); + assert.deepStrictEqual(sent, ['SIGTERM', 'SIGKILL']); + // First grace == the injectable SIGTERM graceMs; second == the fixed, + // deliberately shorter post-SIGKILL grace constant. + assert.deepStrictEqual(slept, [1500, KILL_GRACE_MS]); + assert.ok(KILL_GRACE_MS < 1500, 'the post-SIGKILL grace must be shorter than the SIGTERM grace'); + assert.strictEqual(ok, false); // child never fired exit + }); }); // ── waitForPortFree (real sockets, fast + deterministic) ──────────────────── diff --git a/packages/core/src/scripts/dev-server.ts b/packages/core/src/scripts/dev-server.ts index 7cf3b6e8..2b9b4fdc 100644 --- a/packages/core/src/scripts/dev-server.ts +++ b/packages/core/src/scripts/dev-server.ts @@ -420,8 +420,21 @@ export async function startDevServer(options: DevServerOptions) { respawnTimer = setTimeout(() => { respawnTimer = null; if (isShuttingDown) return; - const child = spawnFrontend(command); - void announceFrontendReady(child, ' (frontend restarted)'); + // Before relaunching, wait (bounded) for `:3100` to actually free — + // mirroring the graceful `terminateFrontend` path. The synchronous + // post-exit SIGKILL above only *initiates* teardown of the orphaned + // group; the kernel can still hold the listening socket for a beat, and a + // relaunched `--strictPort` Vite would then hit `EADDRINUSE` and burn a + // restart-budget slot on a race that isn't a real crash. The budget was + // already debited above, so this never double-counts a restart; re-check + // `isShuttingDown` after the await, since a shutdown signal can land while + // we wait (`waitForPortFree` is bounded, so it can't deadlock shutdown). + void (async () => { + await waitForPortFree(frontendPort); + if (isShuttingDown) return; + const next = spawnFrontend(command); + await announceFrontendReady(next, ' (frontend restarted)'); + })(); }, decision.delayMs); respawnTimer.unref?.(); }); diff --git a/packages/core/src/scripts/process-tree.ts b/packages/core/src/scripts/process-tree.ts index fe1326d4..dfda78f7 100644 --- a/packages/core/src/scripts/process-tree.ts +++ b/packages/core/src/scripts/process-tree.ts @@ -114,6 +114,17 @@ const defaultSleep = (ms: number): Promise => setTimeout(res, ms).unref?.(); }); +/** + * Grace (ms) we wait for the child's `exit` event *after* SIGKILL before giving + * up and reporting its last-known exit state. Deliberately shorter than — and + * intentionally decoupled from — the injectable SIGTERM `graceMs`: SIGKILL + * cannot be caught, blocked, or handled, so the child is already being + * force-terminated; we only need a brief beat to observe the `exit` event, not a + * full, tunable shutdown window. Fixed (not a parameter) because no caller needs + * to tune it — the injected `sleep` is the test seam. + */ +export const KILL_GRACE_MS = 500; + /** * Terminate a child process *tree* and wait — bounded — for the child to exit, * escalating SIGTERM → SIGKILL. Reuses {@link killFrontendTree} so every @@ -126,8 +137,16 @@ const defaultSleep = (ms: number): Promise => * "POST-EXIT GROUP-KILL POLICY". Otherwise we SIGTERM the tree, wait up to * `graceMs` for a clean exit, then SIGKILL the tree and wait a short grace. * - * Resolves `true` once the child has exited (or was already gone), `false` if it - * was still alive when the budget elapsed. Dependencies are injected for tests. + * Return value — IMPORTANT: the boolean reflects only the **direct child's** + * exit state (its `exitCode`/`signalCode`), NOT whole-group teardown or port + * release. On POSIX the SIGKILL is delivered to the whole group (`-pid`), but a + * surviving *detached grandchild* can outlive the awaited child and keep holding + * a port even after this resolves `true`. So `true` means only "the child we + * awaited has exited (or was already gone)" and `false` means "it was still + * alive when the budget elapsed" — neither guarantees the port is free. Callers + * that need a freed port MUST follow this with a bounded port-free wait (see + * `waitForPortFree` in dev-server.ts, which the dev-server child's own SIGTERM + * handler runs). Dependencies are injected for tests. */ export async function terminateProcessTree( child: AwaitableChild, @@ -149,6 +168,8 @@ export async function terminateProcessTree( ]); if (exitedCleanly) return true; killTree(child, 'SIGKILL'); - await Promise.race([exited, sleep(500)]); + // Shorter, fixed grace after SIGKILL (vs. the injectable SIGTERM graceMs): + // SIGKILL is uncatchable, so we only need a brief beat to observe `exit`. + await Promise.race([exited, sleep(KILL_GRACE_MS)]); return child.exitCode !== null || child.signalCode !== null; } diff --git a/packages/core/src/scripts/sandbox.ts b/packages/core/src/scripts/sandbox.ts index e17f573c..225ed74b 100644 --- a/packages/core/src/scripts/sandbox.ts +++ b/packages/core/src/scripts/sandbox.ts @@ -148,6 +148,12 @@ export async function startSandbox(options: SandboxOptions) { `--app`, `npm exec tsx -- -C cdk ${backendPath}` ], { stdio: ["ignore", "pipe", "pipe"], + // Own process group on POSIX so cleanup can reap the whole `cdk watch` tree + // (npx → cdk → node) via terminateProcessTree, not just the npx shell — a + // bare kill() would orphan the real cdk-watch node process, the same + // shell-only-kill leak this PR eliminates for the dev server. Windows has no + // groups; terminateProcessTree reaps the tree via taskkill. + detached: process.platform !== 'win32', env: { ...process.env, NODE_OPTIONS: "--conditions=cdk", ...getCdkTelemetryEnv('sandbox') }, }); @@ -187,14 +193,23 @@ export async function startSandbox(options: SandboxOptions) { console.log("\n\n🛑 Stopping local processes..."); console.log(" (AWS resources are still running)"); console.log("\n To destroy AWS resources, run: npm run sandbox:destroy\n"); - cdkWatch.kill(); - // The dev server owns a *detached* Vite tree that only its own SIGTERM - // handler reaps (a ~2s terminateFrontend drain). Killing just the shell and - // exiting immediately would orphan Vite on :3100 and wedge the next run with - // --strictPort/502, so signal the dev-server process group and AWAIT that - // drain. Bounded at 6s (> the ~2s drain): a hung dev server escalates to a - // tree SIGKILL and we exit regardless, so shutdown can never wedge. - await terminateProcessTree(devServer, 6000); + // Reap BOTH child trees the way the dev server reaps Vite — a process-group + // SIGTERM→SIGKILL via the shared terminateProcessTree — instead of a bare + // kill() that signals only the npx/shell parent and orphans the real + // grandchild (cdk-watch's node, or the dev server's detached Vite). Run them + // concurrently so the cdk-watch teardown doesn't serialize on top of the dev + // server's longer drain. + // + // Only the dev-server child owns the `:3100` port-free wait: its own SIGTERM + // handler runs terminateFrontend (a ~2s drain that reaps the detached Vite + // great-grandchild AND polls until the port frees), so we give it the longer + // 6s budget (> that ~2s drain) — a hung dev server still escalates to a tree + // SIGKILL and we exit regardless, so shutdown can never wedge. cdk watch + // holds no local port, so a bounded tree-kill is all it needs. + await Promise.all([ + terminateProcessTree(cdkWatch, 2000), + terminateProcessTree(devServer, 6000), + ]); process.exit(0); }; From ca45cbb32a3cd5bc8f41c4c8fa22b069728bed80 Mon Sep 17 00:00:00 2001 From: Michael Sober Date: Fri, 26 Jun 2026 09:54:03 +0000 Subject: [PATCH 6/6] =?UTF-8?q?fix(dev-server):=20address=20PR=20review=20?= =?UTF-8?q?round=204=20=E2=80=94=20scoped=20post-exit=20group=20kill,=20ne?= =?UTF-8?q?sted-SIGTERM=20test,=20taskkill=20status=20check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - process-tree: scope terminateProcessTree's post-exit group SIGKILL behind a group-liveness probe (new isProcessGroupAlive, POSIX signal 0) so a fully drained group can't be hit by a recycled `-pid` signal landing on an unrelated group; when the group has drained there is also nothing of ours to reap, so we skip. Cross-reference the dev-server POST-EXIT GROUP-KILL POLICY from the shared primitive (and a note back the other way). - process-tree: windowsTreeKill now treats only status 0 (reaped) or 128 (already gone) as handled and returns false for any other taskkill exit (e.g. access-denied 1) so the caller falls back to child.kill instead of silently leaking the tree; JSDoc updated. - dev-server: document that the respawn timer's `unref` is intentional (the listening server owns process lifetime; the timer must not keep us alive on shutdown). - sandbox: cross-reference the new integration test that verifies a group SIGTERM reaches the nested node dev server and runs its own SIGTERM handler. - tests: add isProcessGroupAlive unit tests; a post-exit "group drained → skip" case; a windowsTreeKill non-zero/non-128 status case; and a RUN_SLOW_TESTS, POSIX-only integration test asserting the nested node SIGTERM handler runs on a group SIGTERM. Build clean; @aws-blocks/core tests pass 516/516 (default) and 519/519 (RUN_SLOW_TESTS=1, incl. the new integration test). --- .../src/scripts/dev-server-supervisor.test.ts | 122 +++++++++++++++++- packages/core/src/scripts/dev-server.ts | 17 +++ packages/core/src/scripts/process-tree.ts | 94 ++++++++++++-- packages/core/src/scripts/sandbox.ts | 6 + 4 files changed, 225 insertions(+), 14 deletions(-) diff --git a/packages/core/src/scripts/dev-server-supervisor.test.ts b/packages/core/src/scripts/dev-server-supervisor.test.ts index 5aa2dfe4..97af2a97 100644 --- a/packages/core/src/scripts/dev-server-supervisor.test.ts +++ b/packages/core/src/scripts/dev-server-supervisor.test.ts @@ -4,6 +4,9 @@ import { describe, it } from 'node:test'; import assert from 'node:assert'; import { spawn } from 'node:child_process'; import { createConnection, createServer } from 'node:net'; +import { readFileSync, unlinkSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; import { evaluateFrontendRespawn, DEFAULT_FRONTEND_RESPAWN_POLICY, @@ -14,6 +17,7 @@ import { killFrontendTree, windowsTreeKill, terminateProcessTree, + isProcessGroupAlive, KILL_GRACE_MS, type KillableProcess, type AwaitableChild, @@ -214,12 +218,53 @@ describe('windowsTreeKill — taskkill tree-kill command', () => { assert.strictEqual(ok, false); }); + it('reports failure when taskkill runs but returns a non-zero, non-128 status', () => { + // taskkill spawned fine (no error) but FAILED to reap the tree — e.g. exit 1 + // = access denied. It must NOT be treated as handled, or the caller skips + // its child.kill fallback and silently leaks the tree. + assert.strictEqual(windowsTreeKill(4242, () => ({ status: 1 })), false); + // A taskkill killed by a signal (status null, no spawn error) is likewise + // not a successful reap. + assert.strictEqual(windowsTreeKill(4242, () => ({ status: null })), false); + }); + it('never throws even if the runner throws', () => { const ok = windowsTreeKill(4242, () => { throw new Error('boom'); }); assert.strictEqual(ok, false); }); }); +// ── isProcessGroupAlive (unit, injected kill + platform) ──────────────────── +// Scopes the post-exit group SIGKILL: a `-pid` signal is only PID-reuse-safe +// while a group member is still alive, so terminateProcessTree probes here and +// skips the reap once the group has drained. +describe('isProcessGroupAlive — group-liveness probe (signal 0)', () => { + const esrch = () => { throw Object.assign(new Error('ESRCH'), { code: 'ESRCH' }); }; + const eperm = () => { throw Object.assign(new Error('EPERM'), { code: 'EPERM' }); }; + + it('probes the GROUP with signal 0 (negative pid, no real signal) and reports alive on success', () => { + const calls: Array<[number, number]> = []; + const alive = isProcessGroupAlive(4242, 'linux', (pid, sig) => { calls.push([pid, sig]); }); + assert.strictEqual(alive, true); + assert.deepStrictEqual(calls, [[-4242, 0]]); // group probe, signal 0 = existence check only + }); + + it('reports drained (false) when the group is gone (ESRCH)', () => { + assert.strictEqual(isProcessGroupAlive(4242, 'linux', esrch), false); + }); + + it('reports alive (true) when the group exists but is not ours to signal (EPERM)', () => { + assert.strictEqual(isProcessGroupAlive(4242, 'linux', eperm), true); + }); + + it('always reports alive on Windows (no process groups; taskkill is PID-tree-scoped)', () => { + let probed = false; + const alive = isProcessGroupAlive(4242, 'win32', () => { probed = true; }); + assert.strictEqual(alive, true); + assert.strictEqual(probed, false); // never probes on Windows + }); +}); + // ── killFrontendTree (integration, real shell + grandchild) ───────────────── // Opt-in: spawns real OS processes and relies on wall-clock timing, so it is // gated behind RUN_SLOW_TESTS to keep the default `npm test` deterministic. @@ -360,6 +405,68 @@ describe('terminateProcessTree — reaps a detached non-shell tree (cdk-watch sh }); }); +// ── group SIGTERM → nested node handler (sandbox dev-server teardown) ──────── +// Verifies the load-bearing claim in sandbox.ts: a *group* SIGTERM (what +// killFrontendTree issues on POSIX, via terminateProcessTree) actually reaches +// the nested node dev server so its OWN SIGTERM handler (the :3100 drain) runs — +// not merely that the tree gets reaped. Opt-in + POSIX-only like the reaping +// tests above. +describe('group SIGTERM reaches a nested node child and runs its SIGTERM handler', { skip: integrationSkip }, () => { + it('delivers a group SIGTERM to a detached shell→node child whose node handler observes it', + { timeout: 30000 }, + async () => { + const port = await getFreePort(); + const marker = join(tmpdir(), `blocks-sigterm-${process.pid}-${Date.now()}.flag`); + // node grandchild: install a SIGTERM handler that RECORDS it observed the + // signal (writes the marker) before exiting, THEN bind the port to + // announce readiness. Installing the handler before binding guarantees the + // test never sends SIGTERM before the handler exists (which would + // default-terminate node and flake the assertion). Retry on EADDRINUSE + // like the reaping tests, since getFreePort() releases its probe listener. + const inner = + `const net=require('net'),fs=require('fs');` + + `process.on('SIGTERM',()=>{try{fs.writeFileSync(${JSON.stringify(marker)},'sigterm');}catch{}process.exit(0);});` + + `let tries=0;` + + `(function bind(){` + + `const s=net.createServer(c=>c.destroy());` + + `s.on('error',e=>{` + + `if(e.code==='EADDRINUSE'&&tries++<50){setTimeout(bind,100);return;}` + + `process.exit(1);` + + `});` + + `s.listen(${port},'127.0.0.1');` + + `})();` + + `setInterval(()=>{},1e9);`; + // shell → node grandchild, backgrounded with `& wait` so the shell stays + // the live group leader — the exact `shell: true` + detached topology the + // dev server uses, where a group signal must fan out to the nested node. + const cmd = `${JSON.stringify(process.execPath)} -e ${JSON.stringify(inner)} & wait`; + const child = spawn(cmd, { shell: true, detached: true, stdio: 'ignore' }); + + try { + assert.ok(await waitFor(() => isPortOpen(port), 10000), + 'nested node should install its SIGTERM handler then bind the port (ready)'); + assert.ok(child.pid && child.pid > 1, 'child must have a real pid to group-signal'); + + // The group SIGTERM under test: exactly what killFrontendTree does on + // POSIX (`process.kill(-pid, 'SIGTERM')`) and what the sandbox dev-server + // teardown relies on to trigger the node's own terminateFrontend handler. + // The `if (child.pid)` guard (asserted above) narrows the type without a + // non-null assertion — matching the finally-block pattern used in this file. + if (child.pid) process.kill(-child.pid, 'SIGTERM'); + + // Assert the node handler OBSERVED the signal (wrote the marker) — i.e. + // the group SIGTERM reached the *nested* node, not just the shell. + const observed = await waitFor(async () => { + try { return readFileSync(marker, 'utf8') === 'sigterm'; } catch { return false; } + }, 10000); + assert.ok(observed, 'the nested node SIGTERM handler must run on a group SIGTERM'); + } finally { + if (child.pid) { try { process.kill(-child.pid, 'SIGKILL'); } catch { /* gone */ } } + try { unlinkSync(marker); } catch { /* never created */ } + } + }); +}); + // ── shouldCreditFrontendReady (unit) ──────────────────────────────────────── // Guards the restart-budget reset: a liveness-only port probe must not credit a // foreign listener, or the maxRestarts cap is neutralized and Vite hot-loops. @@ -416,14 +523,25 @@ describe('terminateProcessTree — escalation + post-exit reap', () => { return { child, fireExit }; } - it('issues a single best-effort group SIGKILL when the child already exited', async () => { + it('issues a single best-effort group SIGKILL when the child already exited and its group is still alive', async () => { const { child } = makeAwaitableChild({ exitCode: 0 }); const sent: NodeJS.Signals[] = []; - const ok = await terminateProcessTree(child, 1500, (_c, s) => { sent.push(s); }, never); + // Group still has a live member (an orphaned grandchild) → reap it. + const ok = await terminateProcessTree(child, 1500, (_c, s) => { sent.push(s); }, never, () => true); assert.strictEqual(ok, true); assert.deepStrictEqual(sent, ['SIGKILL']); // post-exit reap: no SIGTERM, no await }); + it('skips the post-exit group SIGKILL when the whole group has already drained', async () => { + const { child } = makeAwaitableChild({ exitCode: 0 }); + const sent: NodeJS.Signals[] = []; + // The group has fully drained: a `-pid` SIGKILL risks signalling a recycled, + // unrelated group, and there is nothing of ours left to reap — so skip it. + const ok = await terminateProcessTree(child, 1500, (_c, s) => { sent.push(s); }, never, () => false); + assert.strictEqual(ok, true); // still resolves true (the child has exited) + assert.deepStrictEqual(sent, []); // no signal sent into the drained group + }); + it('SIGTERMs the tree and resolves without escalating when it exits in time', async () => { const { child, fireExit } = makeAwaitableChild(); const sent: NodeJS.Signals[] = []; diff --git a/packages/core/src/scripts/dev-server.ts b/packages/core/src/scripts/dev-server.ts index 2b9b4fdc..37744a83 100644 --- a/packages/core/src/scripts/dev-server.ts +++ b/packages/core/src/scripts/dev-server.ts @@ -348,6 +348,15 @@ export async function startDevServer(options: DevServerOptions) { // unrelated group. This is an accepted best-effort trade-off — there is then // nothing of ours left to reap, whereas skipping the kill would otherwise // leave `:3100` wedged, which is the failure this PR exists to prevent. + // + // Where each post-exit kill site lands on this trade-off: the two sites *in + // this file* — the respawn reap (in the child's `exit` handler) and the + // `process.on('exit')` net — fire synchronously the instant we observe the + // exit, so they lean on point (2) above and stay unconditional. The third + // path, `terminateFrontend` → `terminateProcessTree` (process-tree.ts), can + // run outside that minimal synchronous window, so it additionally PROBES group + // liveness (POSIX signal 0) and skips the reap once the group has fully + // drained — see its "POST-EXIT GROUP-KILL (scoped)" comment. const usePosixProcessGroups = process.platform !== 'win32'; let isShuttingDown = false; let frontendRestarts: number[] = []; @@ -436,6 +445,14 @@ export async function startDevServer(options: DevServerOptions) { await announceFrontendReady(next, ' (frontend restarted)'); })(); }, decision.delayMs); + // INTENTIONAL unref: the listening HTTP `server` (created below) owns this + // process's lifetime — the backoff timer must NOT, by itself, keep the + // event loop alive. Without unref a pending respawn timer would hold the + // process up during shutdown (or after the server has closed), delaying or + // blocking a clean exit. This never drops a legitimately-needed respawn: + // `cleanup` explicitly clears this timer, and both the timer body and the + // awaited relaunch re-check `isShuttingDown`. Do NOT remove the unref to + // "fix" a perceived missed restart — it would reintroduce that shutdown hang. respawnTimer.unref?.(); }); diff --git a/packages/core/src/scripts/process-tree.ts b/packages/core/src/scripts/process-tree.ts index dfda78f7..8d4f1878 100644 --- a/packages/core/src/scripts/process-tree.ts +++ b/packages/core/src/scripts/process-tree.ts @@ -34,10 +34,15 @@ interface TreeKillResult { * shutdown to a non-console subtree anyway (Node maps SIGTERM/SIGKILL to * `TerminateProcess`). * - * Returns `true` when `taskkill` actually ran — whether it reaped the tree or - * found it already gone (exit 128) — and `false` only when the command could - * not be spawned at all (e.g. not on `PATH`), signalling the caller to fall - * back to a direct `child.kill`. Never throws. + * Returns `true` only when `taskkill` ran AND reported the tree handled — exit + * `0` (reaped the tree) or `128` (`"process not found"`, i.e. already gone). + * Returns `false` when the command could not be spawned at all (e.g. not on + * `PATH`) OR when it ran but returned any other status (e.g. `1` = access + * denied): such a run did NOT reap the tree, so the caller must fall back to a + * direct `child.kill` rather than treat the leak as handled. (`child.kill` + * cannot reap the orphaned grandchild either, but the fallback is cheap and + * strictly correct — we never silently swallow a failed tree-kill.) Never + * throws. */ export function windowsTreeKill( pid: number, @@ -45,8 +50,14 @@ export function windowsTreeKill( spawnSync(command, args as string[], { stdio: 'ignore', windowsHide: true }), ): boolean { try { - const { error } = runner('taskkill', ['/T', '/F', '/PID', String(pid)]); - return !error; + const { status, error } = runner('taskkill', ['/T', '/F', '/PID', String(pid)]); + // Couldn't even spawn taskkill (e.g. not on PATH) → not handled; fall back. + if (error) return false; + // taskkill ran: only exit 0 (reaped the tree) or 128 ("process not found", + // already gone) mean the tree is handled. Any other non-null status (e.g. + // 1 = access denied) means taskkill ran but did NOT reap the tree, so report + // not-handled and let the caller fall back to a direct child.kill. + return status === 0 || status === 128; } catch { return false; } @@ -125,6 +136,43 @@ const defaultSleep = (ms: number): Promise => */ export const KILL_GRACE_MS = 500; +/** + * Probe whether a detached process *group* still has at least one live member, + * **without signalling it**. Used to scope the post-exit group SIGKILL in + * {@link terminateProcessTree} to the only window where the `-pid` group signal + * is PID-reuse-safe. + * + * The hazard: {@link killFrontendTree}'s POSIX reap is `process.kill(-pid, …)`, + * which targets the process group whose gid is `pid`. That is safe only while a + * group member is still alive — a survivor keeps the kernel from recycling + * `pid` as a brand-new (unrelated) group leader. Once the whole group has + * drained, `pid` is eligible for reuse and a blind `-pid` kill could land on an + * unrelated group. So before a *post-exit* reap we probe here and skip when the + * group has already drained (there is then nothing of ours left to reap). + * + * - **POSIX**: `kill(-pid, 0)` sends no signal — it only checks the group + * exists and is signallable. Success or `EPERM` (exists but owned by another + * user) ⇒ alive. `ESRCH` (or anything else) ⇒ treat as drained. + * - **Windows**: there are no process groups and the reap path + * (`taskkill /T /F /PID`) walks the live PID tree, so there is no `-pid` + * recycle hazard — always allow the reap (`true`). + * + * Never throws. `platform`/`kill` are injected for tests. + */ +export function isProcessGroupAlive( + pid: number, + platform: NodeJS.Platform = process.platform, + kill: (pid: number, signal: number) => void = (p, s) => process.kill(p, s), +): boolean { + if (platform === 'win32') return true; + try { + kill(-pid, 0); + return true; + } catch (e) { + return (e as NodeJS.ErrnoException).code === 'EPERM'; + } +} + /** * Terminate a child process *tree* and wait — bounded — for the child to exit, * escalating SIGTERM → SIGKILL. Reuses {@link killFrontendTree} so every @@ -133,9 +181,15 @@ export const KILL_GRACE_MS = 500; * * Post-exit policy: if the child has *already* exited, a detached grandchild may * still be orphaned (still holding a port), so we issue one best-effort group - * SIGKILL to reap it rather than returning blind — see the dev server's - * "POST-EXIT GROUP-KILL POLICY". Otherwise we SIGTERM the tree, wait up to - * `graceMs` for a clean exit, then SIGKILL the tree and wait a short grace. + * SIGKILL to reap it — but ONLY when the group still has a live member + * ({@link isProcessGroupAlive}). When the whole group has already drained (the + * common healthy shutdown — Vite was already gone), `pid` is eligible for + * recycling and a blind `-pid` signal could hit an unrelated, newly created + * group; since there is also nothing of ours left to reap, we skip the kill. + * See the dev server's "POST-EXIT GROUP-KILL POLICY" for the full rationale and + * the accepted residual (the synchronous probe→kill window). Otherwise we + * SIGTERM the tree, wait up to `graceMs` for a clean exit, then SIGKILL the tree + * and wait a short grace. * * Return value — IMPORTANT: the boolean reflects only the **direct child's** * exit state (its `exitCode`/`signalCode`), NOT whole-group teardown or port @@ -153,11 +207,27 @@ export async function terminateProcessTree( graceMs = 2000, killTree: (c: KillableProcess, signal: NodeJS.Signals) => void = killFrontendTree, sleep: (ms: number) => Promise = defaultSleep, + isGroupAlive: (pid: number) => boolean = isProcessGroupAlive, ): Promise { if (child.exitCode !== null || child.signalCode !== null) { - // Shell already exited — a detached grandchild may still hold its port, so - // reap the group best-effort instead of returning blind. - killTree(child, 'SIGKILL'); + // ── POST-EXIT GROUP-KILL (scoped) ────────────────────────────────────── + // The direct child has already exited, but a detached *grandchild* (e.g. an + // orphaned Vite) may still be alive in its process group, still holding a + // port — reap it with one best-effort group SIGKILL. + // + // SCOPING: only reap when the group still has a live member. killFrontendTree's + // `-pid` group signal is PID-reuse-safe ONLY while a member keeps `pid` + // reserved as the group id; once the whole group has drained `pid` can be + // recycled and a blind `process.kill(-pid)` could hit an unrelated group. So + // we probe first (isProcessGroupAlive; POSIX signal 0) and skip when already + // drained — there is then nothing of ours to reap. The residual synchronous + // probe→kill window is the accepted trade-off documented in dev-server.ts + // "POST-EXIT GROUP-KILL POLICY", cross-referenced here so the risk is + // discoverable at this shared primitive. + const { pid } = child; + if (pid && pid > 1 && isGroupAlive(pid)) { + killTree(child, 'SIGKILL'); + } return true; } const exited = new Promise((res) => child.once('exit', () => res())); diff --git a/packages/core/src/scripts/sandbox.ts b/packages/core/src/scripts/sandbox.ts index 225ed74b..74c449a7 100644 --- a/packages/core/src/scripts/sandbox.ts +++ b/packages/core/src/scripts/sandbox.ts @@ -206,6 +206,12 @@ export async function startSandbox(options: SandboxOptions) { // 6s budget (> that ~2s drain) — a hung dev server still escalates to a tree // SIGKILL and we exit regardless, so shutdown can never wedge. cdk watch // holds no local port, so a bounded tree-kill is all it needs. + // + // That a group SIGTERM (terminateProcessTree → killFrontendTree's + // `process.kill(-pid, 'SIGTERM')`) actually reaches the *nested* node dev + // server and runs its own SIGTERM handler — the load-bearing assumption of + // the 6s budget above — is verified by the "group SIGTERM reaches a nested + // node child" integration test in dev-server-supervisor.test.ts. await Promise.all([ terminateProcessTree(cdkWatch, 2000), terminateProcessTree(devServer, 6000),