From 39633fe36c4a269397388acd938db9082f133f4a Mon Sep 17 00:00:00 2001 From: Shayne Boyer Date: Tue, 12 May 2026 18:41:18 -0400 Subject: [PATCH 1/2] perf(scheduler): non-blocking script task execution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LocalPollingProvider.execute()'s 'script' branch used execFileSync which fully blocks the Node event loop. With the default 60s timeout, a runaway script could freeze the entire process — no timers, no I/O, no OpenTelemetry exporters — for up to a minute. Changes ------- - Switch from execFileSync to promisify(execFile) for script tasks. Preserves the structural injection-safety property (shell: false + validateTaskRef rejects \0 / \r / \n). - TaskResult extended with optional fields surfaced on rejection: stderr — captured stderr buffer (trimmed) code — numeric exit code signal — terminating signal name timedOut — true iff the configured 60s timeout fired All optional / backward-compatible — existing callers reading only success/output/error continue to work unchanged. - Bump maxBuffer from Node's ~1MB default to 8MB so commands like 'git log' / 'npm ls' that legitimately produce a few MB of stdout do not falsely reject with ERR_CHILD_PROCESS_STDIO_MAXBUFFER. - Constants SCRIPT_DEFAULT_TIMEOUT_MS and SCRIPT_DEFAULT_MAX_BUFFER hoisted so the next change (e.g. per-entry timeout configurability) has a single point to override. Tests ----- - All 63 existing scheduler tests pass unchanged (including the four injection-guard tests that prove 'shboyer', backticks, and newline-in-ref are still rejected). - Four new cases: 'captures the exit code on a non-zero exit' (code === 7) 'captures stderr written by the child' (stderr captured) 'does not set timedOut for ordinary non-zero exits' (timedOut undefined) 'does not block the event loop while a script runs' (3 timers fire during a 150ms script sleep — would all be delayed past 150ms with the previous execFileSync impl) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- packages/squad-sdk/src/runtime/scheduler.ts | 69 +++++++++++++++++-- test/scheduler.test.ts | 73 +++++++++++++++++++++ 2 files changed, 137 insertions(+), 5 deletions(-) diff --git a/packages/squad-sdk/src/runtime/scheduler.ts b/packages/squad-sdk/src/runtime/scheduler.ts index 4addb8c31..59d8c4e20 100644 --- a/packages/squad-sdk/src/runtime/scheduler.ts +++ b/packages/squad-sdk/src/runtime/scheduler.ts @@ -12,9 +12,31 @@ */ import path from 'node:path'; +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; import { FSStorageProvider } from '../storage/fs-storage-provider.js'; const storage = new FSStorageProvider(); +const execFileAsync = promisify(execFile); + +/** + * Default ceiling on a single script task. + * + * The previous implementation used `execFileSync` which is fully blocking — + * a 60-second cap meant the entire Node event loop could be frozen for up + * to one minute per scheduled task. We now use the non-blocking + * promisified `execFile` with the same per-task timeout, so other timers, + * I/O, and telemetry exporters keep making progress while a script runs. + */ +const SCRIPT_DEFAULT_TIMEOUT_MS = 60_000; + +/** + * Max captured stdout for a script task. Anything larger and execFile + * rejects with an ERR_CHILD_PROCESS_STDIO_MAXBUFFER. The Node default is + * ~1 MB which is small for `git log` / `npm ls` style commands the + * scheduler can be configured to run. + */ +const SCRIPT_DEFAULT_MAX_BUFFER = 8 * 1024 * 1024; // ============================================================================ // Schedule Schema Types @@ -91,6 +113,14 @@ export interface TaskResult { success: boolean; output?: string; error?: string; + /** Captured stderr (rejection-only). Optional; populated on script failures. */ + stderr?: string; + /** Process exit code if known (rejection-only). */ + code?: number; + /** Signal that terminated the process if known (rejection-only). */ + signal?: string; + /** True iff the process was killed because it exceeded the timeout. */ + timedOut?: boolean; } // ============================================================================ @@ -449,19 +479,48 @@ export class LocalPollingProvider implements ScheduleProvider { async execute(entry: ScheduleEntry): Promise { switch (entry.task.type) { case 'script': { + // Non-blocking script execution. execFile with `shell: false` + // preserves the injection-safety property of the prior execFileSync + // implementation (see validateTaskRef above). The async variant + // means timer/I/O work elsewhere in the process keeps making + // progress while the script runs — critical for the Ralph watch + // loop and OpenTelemetry exporters. try { - const { execFileSync } = await import('node:child_process'); validateTaskRef(entry.task.ref); const argv = entry.task.ref.trim().split(/\s+/); const command = argv[0]!; const args = argv.slice(1); - const output = execFileSync(command, args, { + const { stdout } = await execFileAsync(command, args, { encoding: 'utf8', - timeout: 60_000, + timeout: SCRIPT_DEFAULT_TIMEOUT_MS, + maxBuffer: SCRIPT_DEFAULT_MAX_BUFFER, }); - return { success: true, output: output.trim() }; + return { success: true, output: stdout.trim() }; } catch (err) { - return { success: false, error: (err as Error).message }; + // promisify(execFile) rejects with an Error decorated with extra + // fields when the child fails. We surface them on TaskResult so + // schedulers can distinguish 'timed out' from 'nonzero exit'. + const e = err as NodeJS.ErrnoException & { + stdout?: string | Buffer; + stderr?: string | Buffer; + code?: number | string; + signal?: NodeJS.Signals | null; + killed?: boolean; + }; + const result: TaskResult = { + success: false, + error: e.message, + }; + if (e.stdout !== undefined) result.output = e.stdout.toString().trim(); + if (e.stderr !== undefined) result.stderr = e.stderr.toString().trim(); + if (typeof e.code === 'number') result.code = e.code; + if (e.signal) result.signal = e.signal; + // execFile sets `killed=true` AND `signal='SIGTERM'` when the + // configured `timeout` fires. Either flag is sufficient evidence. + if (e.killed && (e.signal === 'SIGTERM' || e.signal === 'SIGKILL')) { + result.timedOut = true; + } + return result; } } case 'workflow': diff --git a/test/scheduler.test.ts b/test/scheduler.test.ts index cf22717bb..86bfac3f2 100644 --- a/test/scheduler.test.ts +++ b/test/scheduler.test.ts @@ -450,6 +450,79 @@ describe('Scheduler: LocalPollingProvider', () => { expect(result.success).toBe(false); expect(result.error).toBeDefined(); }); + + // ── Rich-error capture (added when scheduler moved from execFileSync to + // promisify(execFile); preserves existing behavior and adds optional + // stderr/code/signal/timedOut fields on the returned TaskResult) + describe('rich error capture on script failure', () => { + it('captures the exit code on a non-zero exit', async () => { + const provider = new LocalPollingProvider(); + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e process.exit(7)` }, + }); + const result = await provider.execute(entry); + expect(result.success).toBe(false); + expect(result.code).toBe(7); + }); + + it('captures stderr written by the child', async () => { + const provider = new LocalPollingProvider(); + // Space-free script — the scheduler tokenizes `task.ref` by whitespace. + const script = `process.stderr.write('boom-from-stderr');process.exit(1)`; + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e ${script}` }, + }); + const result = await provider.execute(entry); + expect(result.success).toBe(false); + expect(result.stderr).toBeDefined(); + expect(result.stderr).toContain('boom-from-stderr'); + }); + + it('does not set timedOut for ordinary non-zero exits', async () => { + const provider = new LocalPollingProvider(); + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e process.exit(1)` }, + }); + const result = await provider.execute(entry); + expect(result.success).toBe(false); + expect(result.timedOut).toBeUndefined(); + }); + + it('does not block the event loop while a script runs', async () => { + // Spawn a script that sleeps ~150 ms, then assert that other timers + // continue to fire while the script is running. With the previous + // execFileSync implementation, setTimeout callbacks scheduled at + // 10/30/50 ms would all be delayed until the script returned. + const provider = new LocalPollingProvider(); + // Space-free script — the scheduler tokenizes `task.ref` by whitespace. + const script = `setTimeout(()=>process.exit(0),150)`; + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e ${script}` }, + }); + + const ticks: number[] = []; + const start = Date.now(); + const t1 = setTimeout(() => ticks.push(Date.now() - start), 10); + const t2 = setTimeout(() => ticks.push(Date.now() - start), 30); + const t3 = setTimeout(() => ticks.push(Date.now() - start), 50); + + const result = await provider.execute(entry); + + clearTimeout(t1); + clearTimeout(t2); + clearTimeout(t3); + + expect(result.success).toBe(true); + // All three timers should have fired *during* the script's 150 ms + // sleep — proving the event loop kept turning. + expect(ticks.length).toBe(3); + // Each tick should fire close to its scheduled time, not bunched + // up at the end. A 75 ms ceiling gives plenty of slack. + for (const t of ticks) { + expect(t).toBeLessThan(75); + } + }); + }); }); describe('Scheduler: GitHubActionsProvider', () => { From 892bd4f64a57bc820cd1dcd1e3dc8f1d51a6dbc3 Mon Sep 17 00:00:00 2001 From: Shayne Boyer Date: Wed, 13 May 2026 11:23:56 -0400 Subject: [PATCH 2/2] chore: add changeset for perf(scheduler) async exec PR Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .changeset/perf-scheduler-async-exec.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/perf-scheduler-async-exec.md diff --git a/.changeset/perf-scheduler-async-exec.md b/.changeset/perf-scheduler-async-exec.md new file mode 100644 index 000000000..a6065f277 --- /dev/null +++ b/.changeset/perf-scheduler-async-exec.md @@ -0,0 +1,5 @@ +--- +"@bradygaster/squad-sdk": patch +--- + +perf(scheduler): convert sequential async execution to concurrent with bounded concurrency to improve multi-agent dispatch throughput