Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/perf-charter-discovery-parallel.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@bradygaster/squad-sdk": patch
---

perf(agents): parallelize charter discovery with concurrency limit to reduce multi-agent load time
67 changes: 41 additions & 26 deletions packages/squad-sdk/src/agents/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,21 @@ import { parseCharterMarkdown } from './charter-compiler.js';
import { EventBus } from '../client/event-bus.js';
import { trace, SpanStatusCode } from '../runtime/otel-api.js';
import { recordAgentSpawn, recordAgentDuration, recordAgentError, recordAgentDestroy } from '../runtime/otel-metrics.js';
import { mapWithLimitSettled } from '../utils/map-with-limit.js';

const tracer = trace.getTracer('squad-sdk');

/**
* Concurrency limit for {@link CharterCompiler.compileAll}.
*
* Each compile reads a charter.md (FS or SquadState-backed) and parses it.
* Filesystem reads are cheap individually but unbounded fan-out can
* exhaust file descriptors on large teams (the typical default soft
* ulimit is ~256 on macOS / 1024 on Linux). 8 in flight saturates 5-10
* agent teams while leaving headroom for other SDK code.
*/
const COMPILE_ALL_CONCURRENCY = 8;

// --- M1-8 Charter Compilation + M2-9 Config-driven ---
export {
compileCharter,
Expand Down Expand Up @@ -200,18 +212,22 @@ export class CharterCompiler {
// Use SquadState agents collection when available
if (this.state) {
const names = await this.state.agents.list();
const charters: AgentCharter[] = [];

for (const name of names) {
if (name === 'scribe' || name.startsWith('_')) continue;
try {
charters.push(await this.compileByName(name));
} catch {
// Skip agents without a valid charter.md
}
}

return charters;
const candidates = names.filter(
(name) => name !== 'scribe' && !name.startsWith('_'),
);

// Parallelise the per-charter compile. Order is preserved so
// downstream consumers that index by position (or rely on stable
// ordering for display) continue to see the same sequence.
const results = await mapWithLimitSettled(
candidates,
COMPILE_ALL_CONCURRENCY,
(name) => this.compileByName(name),
);

return results
.filter((r): r is PromiseFulfilledResult<AgentCharter> => r.status === 'fulfilled')
.map((r) => r.value);
}

// Fallback: raw StorageProvider scan
Expand All @@ -220,20 +236,19 @@ export class CharterCompiler {
throw new Error(`Agents directory not found: ${agentsDir}`);
}
const entries = await this.storage.list(agentsDir);
const charters: AgentCharter[] = [];

for (const name of entries) {
if (name === 'scribe' || name.startsWith('_')) continue;

const charterPath = join(agentsDir, name, 'charter.md');
try {
charters.push(await this.compile(charterPath));
} catch {
// Skip agents without a valid charter.md
}
}

return charters;
const candidates = entries.filter(
(name) => name !== 'scribe' && !name.startsWith('_'),
);

const results = await mapWithLimitSettled(
candidates,
COMPILE_ALL_CONCURRENCY,
(name) => this.compile(join(agentsDir, name, 'charter.md')),
);

return results
.filter((r): r is PromiseFulfilledResult<AgentCharter> => r.status === 'fulfilled')
.map((r) => r.value);
}
}

Expand Down
97 changes: 64 additions & 33 deletions packages/squad-sdk/src/config/agent-source.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import * as path from 'path';
import type { StorageProvider } from '../storage/index.js';
import { FSStorageProvider } from '../storage/index.js';
import type { SquadState } from '../state/squad-state.js';
import { mapWithLimit, mapWithLimitSettled } from '../utils/map-with-limit.js';

export interface AgentSource {
readonly name: string;
Expand Down Expand Up @@ -34,6 +35,19 @@ export interface AgentDefinition extends AgentManifest {
/** Directories to scan for agents, in priority order. */
const AGENT_DIRS = ['.squad/agents', '.ai-team/agents'] as const;

/**
* Bounded concurrency for parallel charter discovery.
*
* - LOCAL_LIST_CONCURRENCY: filesystem reads are cheap individually but the
* default ulimit on file descriptors is finite. 8 leaves headroom for the
* rest of the SDK while still saturating typical 5-10 agent teams.
* - GITHUB_LIST_CONCURRENCY: GitHub REST API has secondary rate limits on
* burst concurrency; 5 stays well clear of those thresholds for typical
* team sizes while removing the serial-await bottleneck.
*/
const LOCAL_LIST_CONCURRENCY = 8;
const GITHUB_LIST_CONCURRENCY = 5;

Comment thread
spboyer marked this conversation as resolved.
/**
* Parse charter.md content to extract agent metadata.
*/
Expand Down Expand Up @@ -107,23 +121,23 @@ export class LocalAgentSource implements AgentSource {
if (this.state) {
try {
const names = await this.state.agents.list();
const manifests: AgentManifest[] = [];

for (const entryName of names) {
try {
const content = await this.state.agents.get(entryName).charter();
const meta = parseCharterMetadata(content);
manifests.push({
name: meta.name || entryName,
role: meta.role || 'agent',
source: 'local',
});
} catch {
continue;
}
}

return manifests;

// Parallelise the per-agent charter reads. Each call hits the
// storage backend (FS or otherwise). Order is preserved so the
// resulting manifest list matches the prior sequential behavior.
const results = await mapWithLimitSettled(names, LOCAL_LIST_CONCURRENCY, async (entryName) => {
const content = await this.state!.agents.get(entryName).charter();
const meta = parseCharterMetadata(content);
return {
name: meta.name || entryName,
role: meta.role || 'agent',
source: 'local',
} satisfies AgentManifest;
});

return results
.filter((r): r is PromiseFulfilledResult<AgentManifest> => r.status === 'fulfilled')
.map((r) => r.value);
} catch {
// Fall through to raw StorageProvider path
}
Expand All @@ -133,29 +147,37 @@ export class LocalAgentSource implements AgentSource {
const agentsDir = await this.resolveAgentsDir();
if (!agentsDir) return [];

const manifests: AgentManifest[] = [];
let entries: string[];
try {
entries = await this.storage.list(agentsDir);
} catch {
return [];
}

for (const entryName of entries) {
// For the raw storage fallback, the prior implementation did NOT
// catch per-item errors from isDirectory()/read() — they propagated
// to the caller. Use mapWithLimit (fast-fail) here so a permission
// error or backend outage surfaces as a thrown error instead of
// silently producing a partial list.
const dirFlags = await mapWithLimit(entries, LOCAL_LIST_CONCURRENCY, async (entryName) => {
const entryPath = path.join(agentsDir, entryName);
if (!(await this.storage.isDirectory(entryPath))) continue;
const charterPath = path.join(entryPath, 'charter.md');
return (await this.storage.isDirectory(entryPath)) ? entryName : null;
});
const candidates = dirFlags.filter((name): name is string => name !== null);

const manifests = await mapWithLimit(candidates, LOCAL_LIST_CONCURRENCY, async (entryName) => {
const charterPath = path.join(agentsDir, entryName, 'charter.md');
const content = await this.storage.read(charterPath);
if (!content) continue;
if (!content) return null;
const meta = parseCharterMetadata(content);
manifests.push({
return {
name: meta.name || entryName,
role: meta.role || 'agent',
source: 'local',
});
}
} satisfies AgentManifest;
});

return manifests;
return manifests.filter((m): m is AgentManifest => m !== null);
}

async getAgent(name: string): Promise<AgentDefinition | null> {
Expand Down Expand Up @@ -273,23 +295,32 @@ export class GitHubAgentSource implements AgentSource {
this.owner, this.repoName, this.pathPrefix, this.branch,
);
const dirs = entries.filter(e => e.type === 'dir');
const manifests: AgentManifest[] = [];

for (const dir of dirs) {
// Bounded concurrency to avoid GitHub secondary rate limits on burst
// requests. Five in flight is a conservative ceiling that comfortably
// covers typical 5-10 agent teams without serial latency, while still
// far below GitHub's secondary-limit thresholds.
//
// Use mapWithLimit (fast-fail) — not mapWithLimitSettled — so 403
// / rate-limit / auth / network errors surface to the caller instead
// of being silently dropped. The original `for...await` loop also
// propagated these errors. Only "charter content is empty" is
// expected/skipped via the `return null` path.
const manifests = await mapWithLimit(dirs, GITHUB_LIST_CONCURRENCY, async (dir) => {
const charterPath = `${this.pathPrefix}/${dir.name}/charter.md`;
const content = await this.fetcher.getFileContent(
this.owner, this.repoName, charterPath, this.branch,
);
if (!content) continue;
if (!content) return null;
const meta = parseCharterMetadata(content);
manifests.push({
return {
name: meta.name || dir.name,
role: meta.role || 'agent',
source: 'github',
});
}
} satisfies AgentManifest;
});

return manifests;
return manifests.filter((m): m is AgentManifest => m !== null);
}

async getAgent(name: string): Promise<AgentDefinition | null> {
Expand Down
91 changes: 91 additions & 0 deletions packages/squad-sdk/src/utils/map-with-limit.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/**
* Bounded-concurrency helper for fan-out async work.
*
* @module utils/map-with-limit
*/

/**
* Run `fn` against each item with at most `limit` operations in flight.
*
* Results are returned in **input order**, regardless of the order in which
* individual promises settle. This matches the semantics callers usually
* want when migrating from a sequential `for (const x of xs) { result.push(await fn(x)); }`
* pattern: ordering is preserved, but throughput is bounded.
*
* Errors propagate via the returned Promise. Use `mapWithLimitSettled()`
* when individual failures should not abort the batch.
*
* @example
* // 8 charters fetched 5-at-a-time over HTTP:
* const manifests = await mapWithLimit(dirs, 5, (dir) => fetchCharter(dir));
*
* @param items Inputs to map over.
* @param limit Maximum concurrent calls (must be ≥ 1).
* @param fn Async mapper.
* @returns Array of results in the same order as `items`.
*/
export async function mapWithLimit<T, R>(
items: readonly T[],
limit: number,
fn: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
if (limit < 1 || !Number.isFinite(limit)) {
throw new Error(`mapWithLimit: limit must be a positive integer, got ${limit}`);
}
if (items.length === 0) return [];

const results = new Array<R>(items.length);
let nextIndex = 0;
const workerCount = Math.min(limit, items.length);

async function worker(): Promise<void> {
while (true) {
const idx = nextIndex++;
if (idx >= items.length) return;
results[idx] = await fn(items[idx]!, idx);
}
}

await Promise.all(Array.from({ length: workerCount }, () => worker()));
return results;
}

/**
* Variant of {@link mapWithLimit} that captures individual failures rather
* than aborting on the first rejection. Returns an array of
* `{ status: 'fulfilled', value }` / `{ status: 'rejected', reason }` in
* input order — identical shape to `Promise.allSettled`.
*
* Use this when one bad input (e.g. a corrupt charter.md) should not stop
* the whole batch.
*/
export async function mapWithLimitSettled<T, R>(
items: readonly T[],
limit: number,
fn: (item: T, index: number) => Promise<R>,
): Promise<Array<PromiseSettledResult<R>>> {
if (limit < 1 || !Number.isFinite(limit)) {
throw new Error(`mapWithLimitSettled: limit must be a positive integer, got ${limit}`);
}
if (items.length === 0) return [];

const results = new Array<PromiseSettledResult<R>>(items.length);
let nextIndex = 0;
const workerCount = Math.min(limit, items.length);

async function worker(): Promise<void> {
while (true) {
const idx = nextIndex++;
if (idx >= items.length) return;
try {
const value = await fn(items[idx]!, idx);
results[idx] = { status: 'fulfilled', value };
} catch (reason) {
results[idx] = { status: 'rejected', reason };
}
}
}

await Promise.all(Array.from({ length: workerCount }, () => worker()));
return results;
}
39 changes: 39 additions & 0 deletions test/agent-source-github.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,45 @@ describe('GitHubAgentSource', () => {
await source.listAgents();
expect(fetcher.listDirectory).toHaveBeenCalledWith('acme', 'repo', '.squad/agents', 'develop');
});

// Regression test: when parallel charter fetching was introduced, the
// initial pass swallowed all per-item rejections (mapWithLimitSettled).
// That hid 403/auth/rate-limit/network failures behind silently smaller
// result sets. listAgents must continue to PROPAGATE thrown fetcher
// errors, matching the prior sequential `for...await` behavior.
it('propagates a thrown fetcher error instead of silently dropping the agent', async () => {
const fetcher: GitHubFetcher = {
listDirectory: vi.fn(async () => [
{ name: 'agent1', type: 'dir' },
{ name: 'agent2', type: 'dir' },
]),
getFileContent: vi.fn(async (_o, _r, path) => {
if (path.includes('agent2')) {
// Simulate a transient GitHub failure (e.g. secondary rate limit)
const err = new Error('GitHub API: 403 secondary rate limit');
(err as Error & { status?: number }).status = 403;
throw err;
}
return CHARTER_AGENT1;
}),
};
const source = new GitHubAgentSource('acme/repo', { fetcher });
await expect(source.listAgents()).rejects.toThrow(/secondary rate limit/);
});

it('still skips agents whose charter.md returns null (missing file)', async () => {
const fetcher = makeFetcher(
[
{ name: 'agent1', type: 'dir' },
{ name: 'agent-no-charter', type: 'dir' },
],
{ '.squad/agents/agent1/charter.md': CHARTER_AGENT1 },
);
const source = new GitHubAgentSource('acme/repo', { fetcher });
const agents = await source.listAgents();
expect(agents).toHaveLength(1);
expect(agents[0]!.name).toBe('Agent1');
});
});

describe('getAgent', () => {
Expand Down
Loading
Loading