diff --git a/.changeset/perf-scheduler-async-exec.md b/.changeset/perf-scheduler-async-exec.md new file mode 100644 index 000000000..a6065f277 --- /dev/null +++ b/.changeset/perf-scheduler-async-exec.md @@ -0,0 +1,5 @@ +--- +"@bradygaster/squad-sdk": patch +--- + +perf(scheduler): convert sequential async execution to concurrent with bounded concurrency to improve multi-agent dispatch throughput diff --git a/packages/squad-sdk/src/runtime/scheduler.ts b/packages/squad-sdk/src/runtime/scheduler.ts index 4addb8c31..59d8c4e20 100644 --- a/packages/squad-sdk/src/runtime/scheduler.ts +++ b/packages/squad-sdk/src/runtime/scheduler.ts @@ -12,9 +12,31 @@ */ import path from 'node:path'; +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; import { FSStorageProvider } from '../storage/fs-storage-provider.js'; const storage = new FSStorageProvider(); +const execFileAsync = promisify(execFile); + +/** + * Default ceiling on a single script task. + * + * The previous implementation used `execFileSync` which is fully blocking — + * a 60-second cap meant the entire Node event loop could be frozen for up + * to one minute per scheduled task. We now use the non-blocking + * promisified `execFile` with the same per-task timeout, so other timers, + * I/O, and telemetry exporters keep making progress while a script runs. + */ +const SCRIPT_DEFAULT_TIMEOUT_MS = 60_000; + +/** + * Max captured stdout for a script task. Anything larger and execFile + * rejects with an ERR_CHILD_PROCESS_STDIO_MAXBUFFER. The Node default is + * ~1 MB which is small for `git log` / `npm ls` style commands the + * scheduler can be configured to run. + */ +const SCRIPT_DEFAULT_MAX_BUFFER = 8 * 1024 * 1024; // ============================================================================ // Schedule Schema Types @@ -91,6 +113,14 @@ export interface TaskResult { success: boolean; output?: string; error?: string; + /** Captured stderr (rejection-only). Optional; populated on script failures. */ + stderr?: string; + /** Process exit code if known (rejection-only). */ + code?: number; + /** Signal that terminated the process if known (rejection-only). */ + signal?: string; + /** True iff the process was killed because it exceeded the timeout. */ + timedOut?: boolean; } // ============================================================================ @@ -449,19 +479,48 @@ export class LocalPollingProvider implements ScheduleProvider { async execute(entry: ScheduleEntry): Promise { switch (entry.task.type) { case 'script': { + // Non-blocking script execution. execFile with `shell: false` + // preserves the injection-safety property of the prior execFileSync + // implementation (see validateTaskRef above). The async variant + // means timer/I/O work elsewhere in the process keeps making + // progress while the script runs — critical for the Ralph watch + // loop and OpenTelemetry exporters. try { - const { execFileSync } = await import('node:child_process'); validateTaskRef(entry.task.ref); const argv = entry.task.ref.trim().split(/\s+/); const command = argv[0]!; const args = argv.slice(1); - const output = execFileSync(command, args, { + const { stdout } = await execFileAsync(command, args, { encoding: 'utf8', - timeout: 60_000, + timeout: SCRIPT_DEFAULT_TIMEOUT_MS, + maxBuffer: SCRIPT_DEFAULT_MAX_BUFFER, }); - return { success: true, output: output.trim() }; + return { success: true, output: stdout.trim() }; } catch (err) { - return { success: false, error: (err as Error).message }; + // promisify(execFile) rejects with an Error decorated with extra + // fields when the child fails. We surface them on TaskResult so + // schedulers can distinguish 'timed out' from 'nonzero exit'. + const e = err as NodeJS.ErrnoException & { + stdout?: string | Buffer; + stderr?: string | Buffer; + code?: number | string; + signal?: NodeJS.Signals | null; + killed?: boolean; + }; + const result: TaskResult = { + success: false, + error: e.message, + }; + if (e.stdout !== undefined) result.output = e.stdout.toString().trim(); + if (e.stderr !== undefined) result.stderr = e.stderr.toString().trim(); + if (typeof e.code === 'number') result.code = e.code; + if (e.signal) result.signal = e.signal; + // execFile sets `killed=true` AND `signal='SIGTERM'` when the + // configured `timeout` fires. Either flag is sufficient evidence. + if (e.killed && (e.signal === 'SIGTERM' || e.signal === 'SIGKILL')) { + result.timedOut = true; + } + return result; } } case 'workflow': diff --git a/test/scheduler.test.ts b/test/scheduler.test.ts index cf22717bb..86bfac3f2 100644 --- a/test/scheduler.test.ts +++ b/test/scheduler.test.ts @@ -450,6 +450,79 @@ describe('Scheduler: LocalPollingProvider', () => { expect(result.success).toBe(false); expect(result.error).toBeDefined(); }); + + // ── Rich-error capture (added when scheduler moved from execFileSync to + // promisify(execFile); preserves existing behavior and adds optional + // stderr/code/signal/timedOut fields on the returned TaskResult) + describe('rich error capture on script failure', () => { + it('captures the exit code on a non-zero exit', async () => { + const provider = new LocalPollingProvider(); + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e process.exit(7)` }, + }); + const result = await provider.execute(entry); + expect(result.success).toBe(false); + expect(result.code).toBe(7); + }); + + it('captures stderr written by the child', async () => { + const provider = new LocalPollingProvider(); + // Space-free script — the scheduler tokenizes `task.ref` by whitespace. + const script = `process.stderr.write('boom-from-stderr');process.exit(1)`; + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e ${script}` }, + }); + const result = await provider.execute(entry); + expect(result.success).toBe(false); + expect(result.stderr).toBeDefined(); + expect(result.stderr).toContain('boom-from-stderr'); + }); + + it('does not set timedOut for ordinary non-zero exits', async () => { + const provider = new LocalPollingProvider(); + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e process.exit(1)` }, + }); + const result = await provider.execute(entry); + expect(result.success).toBe(false); + expect(result.timedOut).toBeUndefined(); + }); + + it('does not block the event loop while a script runs', async () => { + // Spawn a script that sleeps ~150 ms, then assert that other timers + // continue to fire while the script is running. With the previous + // execFileSync implementation, setTimeout callbacks scheduled at + // 10/30/50 ms would all be delayed until the script returned. + const provider = new LocalPollingProvider(); + // Space-free script — the scheduler tokenizes `task.ref` by whitespace. + const script = `setTimeout(()=>process.exit(0),150)`; + const entry = validEntry({ + task: { type: 'script', ref: `${process.execPath} -e ${script}` }, + }); + + const ticks: number[] = []; + const start = Date.now(); + const t1 = setTimeout(() => ticks.push(Date.now() - start), 10); + const t2 = setTimeout(() => ticks.push(Date.now() - start), 30); + const t3 = setTimeout(() => ticks.push(Date.now() - start), 50); + + const result = await provider.execute(entry); + + clearTimeout(t1); + clearTimeout(t2); + clearTimeout(t3); + + expect(result.success).toBe(true); + // All three timers should have fired *during* the script's 150 ms + // sleep — proving the event loop kept turning. + expect(ticks.length).toBe(3); + // Each tick should fire close to its scheduled time, not bunched + // up at the end. A 75 ms ceiling gives plenty of slack. + for (const t of ticks) { + expect(t).toBeLessThan(75); + } + }); + }); }); describe('Scheduler: GitHubActionsProvider', () => {