diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts index 11a8a6a0e3..8bc45f6019 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts @@ -63,6 +63,7 @@ describe('configToFormState', () => { expect(state.classifierMaxP95LatencyMs).toBe('1000'); expect(state.classifierModels).toBe(''); expect(state.deciderModels).toEqual([]); + expect(state.maxConcurrency).toBe(100); }); }); diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx index 9bdfac18ba..312e44e602 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx @@ -126,7 +126,7 @@ export function configToFormState(config: BenchmarkConfig | null): { deciderModels: [], minAccuracy: 0.7, switchCostFactor: 3, - maxConcurrency: 4, + maxConcurrency: 100, benchmarkUserId: '', classifierRepetitions: 1, deciderRepetitions: 1, @@ -407,13 +407,13 @@ function BenchmarkConfigEditor({
@@ -539,17 +539,13 @@ function BenchmarkConfigEditor({ // Run summaries expandable table // --------------------------------------------------------------------------- -const TIER_ORDER = { low: 0, medium: 1, high: 2, '*': 3 } as const; - function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) { const isDecider = run.kind === 'decider'; const sortedSummaries: BenchmarkModelSummary[] = isDecider ? [...run.summaries].sort((a, b) => { - const tierDiff = - (TIER_ORDER[a.tier as keyof typeof TIER_ORDER] ?? 3) - - (TIER_ORDER[b.tier as keyof typeof TIER_ORDER] ?? 3); - if (tierDiff !== 0) return tierDiff; + const routeDiff = a.routeKey.localeCompare(b.routeKey); + if (routeDiff !== 0) return routeDiff; return b.accuracy - a.accuracy; }) : run.summaries; @@ -571,7 +567,7 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) { Model - {isDecider ? Tier : null} + {isDecider ? Route : null} Accuracy Avg cost Avg latency @@ -584,10 +580,10 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) { {sortedSummaries.map((s, i) => ( - + {s.model} {isDecider ? ( - {s.tier} + {s.routeKey} ) : null} {formatAccuracy(s.accuracy)} @@ -717,11 +713,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) { } const { table } = data; - const tierEntries = [ - { tier: 'low', candidates: table.tiers.low }, - { tier: 'medium', candidates: table.tiers.medium }, - { tier: 'high', candidates: table.tiers.high }, - ] as const; + const routeEntries = Object.entries(table.routes).sort(([a], [b]) => a.localeCompare(b)); return (
@@ -736,9 +728,9 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
- {tierEntries.map(({ tier, candidates }) => ( -
-

{tier} tier

+ {routeEntries.map(([routeKey, candidates]) => ( +
+

{routeKey}

@@ -751,7 +743,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) { {candidates.map((c, i) => ( - + {c.model} {formatAccuracy(c.accuracy)} diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts index bb7b22ded3..a82a0e8cfe 100644 --- a/apps/web/src/app/api/openrouter/[...path]/route.test.ts +++ b/apps/web/src/app/api/openrouter/[...path]/route.test.ts @@ -447,7 +447,8 @@ describe('kilo-auto/efficient classifier billing', () => { mockedFetchEfficientAutoDecision.mockResolvedValue({ decision: { model: 'anthropic/claude-haiku-4', - tier: 'low', + taskType: 'implementation', + subtaskType: 'feature_development', source: 'benchmark', tableVersion: 'v1', sticky: false, @@ -481,7 +482,8 @@ describe('kilo-auto/efficient classifier billing', () => { mockedFetchEfficientAutoDecision.mockResolvedValue({ decision: { model: 'anthropic/claude-haiku-4', - tier: 'low', + taskType: 'implementation', + subtaskType: 'feature_development', source: 'benchmark' as const, tableVersion: 'v1', sticky: false, @@ -510,7 +512,8 @@ describe('kilo-auto/efficient classifier billing', () => { mockedFetchEfficientAutoDecision.mockResolvedValue({ decision: { model: 'anthropic/claude-haiku-4', - tier: 'low', + taskType: 'implementation', + subtaskType: 'feature_development', source: 'benchmark', tableVersion: 'v1', sticky: false, @@ -560,7 +563,8 @@ describe('kilo-auto/efficient classifier billing', () => { mockedFetchEfficientAutoDecision.mockResolvedValue({ decision: { model: 'anthropic/claude-haiku-4', - tier: 'low', + taskType: 'implementation', + subtaskType: 'feature_development', source: 'benchmark', tableVersion: 'v1', sticky: false, diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts index f241c5f222..15235c3730 100644 --- a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts +++ 
b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts @@ -25,7 +25,8 @@ const zeroBalancePromise = Promise.resolve(0); const sampleDecision: AutoRoutingDecision = { model: 'anthropic/claude-haiku-4', - tier: 'low', + taskType: 'implementation', + subtaskType: 'feature_development', source: 'benchmark', tableVersion: 'v1', sticky: false, diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts index 70d8e7e0c6..52daf63cc8 100644 --- a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts +++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts @@ -47,7 +47,8 @@ const options = { const validDecision = { model: 'anthropic/claude-haiku-4', - tier: 'low' as const, + taskType: 'implementation' as const, + subtaskType: 'feature_development' as const, source: 'benchmark' as const, tableVersion: 'v1', sticky: false, diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts index 8409b7f743..a696ac3063 100644 --- a/packages/auto-routing-contracts/src/benchmark.ts +++ b/packages/auto-routing-contracts/src/benchmark.ts @@ -1,9 +1,10 @@ import * as z from 'zod'; import { RoutingTableSchema } from './routing-table'; -import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers'; +import { ReasoningEffortSchema } from './reasoning'; +import { TaxonomyRouteKeySchema } from './taxonomy'; -export { ReasoningEffortSchema } from './tiers'; -export type { ReasoningEffort } from './tiers'; +export { ReasoningEffortSchema } from './reasoning'; +export type { ReasoningEffort } from './reasoning'; export const BenchmarkKindSchema = z.enum(['classifier', 'decider']); export type BenchmarkKind = z.infer; @@ -39,15 +40,16 @@ export const BenchmarkConfigSchema = z .object({ classifierModels: z.array(z.string().trim().min(1)).min(1), deciderModels: z.array(BenchmarkDeciderModelSchema).min(1), - // Accuracy threshold for "gets the job done" 
(per tier). + // Accuracy threshold for "gets the job done" (per taxonomy route). minAccuracy: z.number().min(0).max(1), - // Parallel OpenRouter calls per queue message. - maxConcurrency: z.number().int().min(1).max(16), + // Benchmark-wide parallelism budget. Decider runs use it as a live + // container budget; classifier runs use it for parallel OpenRouter calls. + maxConcurrency: z.number().int().min(1).max(100), // The Kilo user whose identity/billing the decider CLI runs execute under. // Null until an admin configures it; decider runs fail fast while null. benchmarkUserId: z.string().trim().min(1).nullable(), // Session stickiness knob carried into published routing tables: a session - // stays on its incumbent model while it meets the tier's accuracy + // stays on its incumbent model while it meets the route's accuracy // threshold, unless the fresh pick is cheaper by more than this factor. // Model switches discard provider prompt caches (cache reads are far // cheaper than fresh input tokens), so switching only pays off when the @@ -79,8 +81,8 @@ export type BenchmarkRunStatus = z.infer; export const BenchmarkModelSummarySchema = z.object({ model: z.string(), - // '*' for classifier runs (no tiering), otherwise the difficulty tier. - tier: z.union([DifficultyTierSchema, z.literal('*')]), + // '*' for classifier runs, otherwise "/". 
+ routeKey: z.union([TaxonomyRouteKeySchema, z.literal('*')]), accuracy: z.number(), avgCostUsd: z.number().nullable(), avgLatencyMs: z.number(), diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts index 0c826251dc..963875f812 100644 --- a/packages/auto-routing-contracts/src/contracts.test.ts +++ b/packages/auto-routing-contracts/src/contracts.test.ts @@ -147,6 +147,20 @@ describe('BenchmarkConfigSchema defaults', () => { expect(result.deciderRepetitions).toBe(1); expect(result.classifierMaxP95LatencyMs).toBe(1000); }); + + it('accepts the benchmark maximum concurrency cap of 100', () => { + const result = BenchmarkConfigSchema.safeParse({ + classifierModels: ['model/a'], + deciderModels: [{ id: 'model/b' }], + minAccuracy: 0.8, + maxConcurrency: 100, + benchmarkUserId: null, + switchCostFactor: 2, + updatedAt: null, + updatedBy: null, + }); + expect(result.success).toBe(true); + }); }); describe('BenchmarkConfigSchema duplicate model ids', () => { diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts index 31915439ec..aeb55bf7b2 100644 --- a/packages/auto-routing-contracts/src/index.ts +++ b/packages/auto-routing-contracts/src/index.ts @@ -1,6 +1,12 @@ import * as z from 'zod'; import { NormalizedClassifierInputSchema } from './input'; -import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers'; +import { ReasoningEffortSchema } from './reasoning'; +import { + ClassifierSubtaskTypeSchema, + ClassifierTaskTypeSchema, + SUBTYPES_BY_TASK_TYPE, + type ClassifierSubtaskType, +} from './taxonomy'; export { NormalizedClassifierInputSchema, @@ -29,47 +35,6 @@ export const MirrorPayloadSchema = z.object({ }); export type MirrorPayload = z.infer; -export const ClassifierTaskTypeSchema = z.enum([ - 'implementation', - 'debugging', - 'refactoring', - 'planning_design', - 'investigation', - 'agentic_execution', -]); -export type 
ClassifierTaskType = z.infer; - -export const ClassifierSubtaskTypeSchema = z.enum([ - 'feature_development', - 'code_generation', - 'test_creation', - 'bug_fixing', - 'test_repair', - 'root_cause_analysis', - 'code_cleanup', - 'architecture_improvement', - 'migration', - 'architecture_design', - 'technical_planning', - 'system_design', - 'repo_exploration', - 'codebase_understanding', - 'external_research', - 'tool_usage', - 'terminal_operations', - 'multi_step_execution', -]); -export type ClassifierSubtaskType = z.infer; - -const subtypesByTaskType: Record = { - implementation: ['feature_development', 'code_generation', 'test_creation'], - debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'], - refactoring: ['code_cleanup', 'architecture_improvement', 'migration'], - planning_design: ['architecture_design', 'technical_planning', 'system_design'], - investigation: ['repo_exploration', 'codebase_understanding', 'external_research'], - agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'], -}; - export const ClassifierOutputSchema = z .strictObject({ taskType: ClassifierTaskTypeSchema, @@ -87,7 +52,10 @@ export const ClassifierOutputSchema = z confidence: z.number().min(0).max(1), }) .superRefine((output, ctx) => { - if (!subtypesByTaskType[output.taskType].includes(output.subtaskType)) { + const allowedSubtypes = SUBTYPES_BY_TASK_TYPE[ + output.taskType + ] as readonly ClassifierSubtaskType[]; + if (!allowedSubtypes.includes(output.subtaskType)) { ctx.addIssue({ code: 'custom', path: ['subtaskType'], @@ -99,7 +67,8 @@ export type ClassifierOutput = z.infer; export const AutoRoutingDecisionSchema = z.object({ model: z.string(), - tier: DifficultyTierSchema, + taskType: ClassifierTaskTypeSchema, + subtaskType: ClassifierSubtaskTypeSchema, source: z.enum(['benchmark']), tableVersion: z.string(), // Mirrors the effort the chosen model was benchmarked with, when set. 
@@ -180,6 +149,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer< export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize'; -export * from './tiers'; +export * from './reasoning'; +export * from './taxonomy'; export * from './routing-table'; export * from './benchmark'; diff --git a/packages/auto-routing-contracts/src/reasoning.ts b/packages/auto-routing-contracts/src/reasoning.ts new file mode 100644 index 0000000000..a989853d1c --- /dev/null +++ b/packages/auto-routing-contracts/src/reasoning.ts @@ -0,0 +1,4 @@ +import * as z from 'zod'; + +export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']); +export type ReasoningEffort = z.infer; diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts index edcd573b44..a4830ce117 100644 --- a/packages/auto-routing-contracts/src/routing-table.test.ts +++ b/packages/auto-routing-contracts/src/routing-table.test.ts @@ -9,12 +9,16 @@ const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({ }); describe('rankCandidates', () => { - it('puts the cheapest above-threshold candidate first', () => { + it('puts the lowest cost-per-accuracy above-threshold candidate first', () => { const ranked = rankCandidates( - [candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)], + [ + candidate('lower-raw-cost', 0.7, 0.007), + candidate('better-value', 0.9, 0.008), + candidate('weak', 0.5, 0.001), + ], 0.7 ); - expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']); + expect(ranked.map(c => c.model)).toEqual(['better-value', 'lower-raw-cost', 'weak']); expect(ranked[0].meetsThreshold).toBe(true); expect(ranked[2].meetsThreshold).toBe(false); }); @@ -29,15 +33,35 @@ describe('rankCandidates', () => { }); describe('RoutingTableSchema', () => { - it('requires at least one candidate per tier', () => { + it('requires at 
least one candidate per taxonomy route', () => { expect( RoutingTableSchema.safeParse({ version: 'v', generatedAt: new Date(0).toISOString(), minAccuracy: 0.7, + switchCostFactor: 3, source: 'benchmark', - tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] }, + routes: { + 'implementation/code_generation': [], + 'debugging/bug_fixing': [candidate('m', 1, 1)], + }, }).success ).toBe(false); }); + + it('accepts a table routed by classifier taxonomy pair', () => { + const parsed = RoutingTableSchema.parse({ + version: 'v', + generatedAt: new Date(0).toISOString(), + minAccuracy: 0.7, + switchCostFactor: 3, + source: 'benchmark', + routes: { + 'implementation/code_generation': [candidate('impl', 0.9, 1)], + 'debugging/bug_fixing': [candidate('debug', 0.9, 1)], + }, + }); + + expect(parsed.routes['implementation/code_generation']?.[0]?.model).toBe('impl'); + }); }); diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts index ff49e81578..0a1db0c0a5 100644 --- a/packages/auto-routing-contracts/src/routing-table.ts +++ b/packages/auto-routing-contracts/src/routing-table.ts @@ -1,9 +1,10 @@ import * as z from 'zod'; -import { ReasoningEffortSchema } from './tiers'; +import { ReasoningEffortSchema } from './reasoning'; +import { TaxonomyRouteKeySchema } from './taxonomy'; export const RankedCandidateSchema = z.object({ model: z.string().trim().min(1), - // Benchmark accuracy in [0, 1] for this tier. + // Benchmark accuracy in [0, 1] for this taxonomy route. accuracy: z.number().min(0).max(1), // Average observed OpenRouter cost per benchmark case, in USD credits. avgCostUsd: z.number().nonnegative(), @@ -23,19 +24,25 @@ export const RoutingTableSchema = z.object({ // more than this factor (see BenchmarkConfigSchema.switchCostFactor). 
switchCostFactor: z.number().min(1), source: z.enum(['benchmark']), - tiers: z.object({ - low: z.array(RankedCandidateSchema).min(1), - medium: z.array(RankedCandidateSchema).min(1), - high: z.array(RankedCandidateSchema).min(1), + routes: z.record(z.string(), z.array(RankedCandidateSchema).min(1)).superRefine((routes, ctx) => { + for (const key of Object.keys(routes)) { + if (!TaxonomyRouteKeySchema.safeParse(key).success) { + ctx.addIssue({ + code: 'custom', + path: [key], + message: `Unknown taxonomy route ${key}`, + }); + } + } }), }); export type RoutingTable = z.infer; export const ROUTING_TABLE_KV_KEY = 'routing_table_v1'; -// "Best bang for buck": candidates meeting the accuracy threshold come -// first, cheapest first (accuracy breaks ties); below-threshold candidates -// follow ordered by accuracy so a degenerate table still routes sensibly. +// "Best bang for buck": candidates meeting the accuracy threshold come first, +// lowest cost per unit of accuracy first; below-threshold candidates follow +// ordered by accuracy so a degenerate table still routes sensibly. export function rankCandidates( candidates: ReadonlyArray & { meetsThreshold?: boolean }>, minAccuracy: number @@ -44,7 +51,7 @@ export function rankCandidates( return flagged.toSorted((a, b) => { if (a.meetsThreshold !== b.meetsThreshold) return a.meetsThreshold ? 
-1 : 1; if (a.meetsThreshold) { - return a.avgCostUsd - b.avgCostUsd || b.accuracy - a.accuracy; + return a.avgCostUsd / a.accuracy - b.avgCostUsd / b.accuracy || b.accuracy - a.accuracy; } return b.accuracy - a.accuracy || a.avgCostUsd - b.avgCostUsd; }); diff --git a/packages/auto-routing-contracts/src/taxonomy.ts b/packages/auto-routing-contracts/src/taxonomy.ts new file mode 100644 index 0000000000..bb5fa70c62 --- /dev/null +++ b/packages/auto-routing-contracts/src/taxonomy.ts @@ -0,0 +1,77 @@ +import * as z from 'zod'; + +export const CLASSIFIER_TASK_TYPES = [ + 'implementation', + 'debugging', + 'refactoring', + 'planning_design', + 'investigation', + 'agentic_execution', +] as const; + +export const CLASSIFIER_SUBTASK_TYPES = [ + 'feature_development', + 'code_generation', + 'test_creation', + 'bug_fixing', + 'test_repair', + 'root_cause_analysis', + 'code_cleanup', + 'architecture_improvement', + 'migration', + 'architecture_design', + 'technical_planning', + 'system_design', + 'repo_exploration', + 'codebase_understanding', + 'external_research', + 'tool_usage', + 'terminal_operations', + 'multi_step_execution', +] as const; + +export const SUBTYPES_BY_TASK_TYPE = { + implementation: ['feature_development', 'code_generation', 'test_creation'], + debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'], + refactoring: ['code_cleanup', 'architecture_improvement', 'migration'], + planning_design: ['architecture_design', 'technical_planning', 'system_design'], + investigation: ['repo_exploration', 'codebase_understanding', 'external_research'], + agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'], +} as const; + +export const TAXONOMY_ROUTE_KEYS = [ + 'implementation/feature_development', + 'implementation/code_generation', + 'implementation/test_creation', + 'debugging/bug_fixing', + 'debugging/test_repair', + 'debugging/root_cause_analysis', + 'refactoring/code_cleanup', + 'refactoring/architecture_improvement', + 
'refactoring/migration', + 'planning_design/architecture_design', + 'planning_design/technical_planning', + 'planning_design/system_design', + 'investigation/repo_exploration', + 'investigation/codebase_understanding', + 'investigation/external_research', + 'agentic_execution/tool_usage', + 'agentic_execution/terminal_operations', + 'agentic_execution/multi_step_execution', +] as const; + +export const ClassifierTaskTypeSchema = z.enum(CLASSIFIER_TASK_TYPES); +export type ClassifierTaskType = z.infer; + +export const ClassifierSubtaskTypeSchema = z.enum(CLASSIFIER_SUBTASK_TYPES); +export type ClassifierSubtaskType = z.infer; + +export const TaxonomyRouteKeySchema = z.enum(TAXONOMY_ROUTE_KEYS); +export type TaxonomyRouteKey = z.infer; + +export function taxonomyRouteKey(params: { + taskType: ClassifierTaskType; + subtaskType: ClassifierSubtaskType; +}): TaxonomyRouteKey { + return `${params.taskType}/${params.subtaskType}` as TaxonomyRouteKey; +} diff --git a/packages/auto-routing-contracts/src/tiers.test.ts b/packages/auto-routing-contracts/src/tiers.test.ts deleted file mode 100644 index 5d62f7259f..0000000000 --- a/packages/auto-routing-contracts/src/tiers.test.ts +++ /dev/null @@ -1,79 +0,0 @@ -import { describe, expect, it } from 'vitest'; -import { deriveDifficultyTier } from './tiers'; -import type { ClassifierOutput } from './index'; - -function classification(overrides: Partial): ClassifierOutput { - return { - taskType: 'implementation', - subtaskType: 'code_generation', - contextComplexity: 'small', - reasoningComplexity: 'low', - riskLevel: 'low', - executionMode: 'answer_only', - requiresTools: false, - confidence: 0.9, - ...overrides, - }; -} - -describe('deriveDifficultyTier', () => { - it('classifies trivial answer-only requests as low', () => { - expect(deriveDifficultyTier(classification({}))).toBe('low'); - }); - it('classifies mid-size code changes as medium', () => { - expect( - deriveDifficultyTier( - classification({ - contextComplexity: 
'medium', - reasoningComplexity: 'medium', - executionMode: 'code_change', - }) - ) - ).toBe('medium'); - }); - it('classifies high-reasoning multi-step work as high', () => { - expect( - deriveDifficultyTier( - classification({ - contextComplexity: 'large', - reasoningComplexity: 'high', - executionMode: 'multi_step_project', - riskLevel: 'high', - }) - ) - ).toBe('high'); - }); - it('high risk tips an otherwise-low request to medium', () => { - expect( - deriveDifficultyTier( - classification({ executionMode: 'multi_step_project', riskLevel: 'high' }) - ) - ).toBe('medium'); - }); - it('high risk tips an otherwise-medium request to high', () => { - expect( - deriveDifficultyTier( - classification({ - reasoningComplexity: 'medium', - contextComplexity: 'large', - executionMode: 'code_change', - riskLevel: 'high', - }) - ) - ).toBe('high'); - }); - it('is monotonic: bumping reasoning complexity never lowers the tier', () => { - const tiers = ['low', 'medium', 'high'] as const; - for (const ctx of ['small', 'medium', 'large'] as const) { - let prev = 0; - for (const reasoning of ['low', 'medium', 'high'] as const) { - const tier = deriveDifficultyTier( - classification({ contextComplexity: ctx, reasoningComplexity: reasoning }) - ); - const idx = tiers.indexOf(tier); - expect(idx).toBeGreaterThanOrEqual(prev); - prev = idx; - } - } - }); -}); diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts deleted file mode 100644 index 8358c5e3bf..0000000000 --- a/packages/auto-routing-contracts/src/tiers.ts +++ /dev/null @@ -1,43 +0,0 @@ -import * as z from 'zod'; - -export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']); - -export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']); -export type ReasoningEffort = z.infer; -export type DifficultyTier = z.infer; - -export const DIFFICULTY_TIERS: readonly DifficultyTier[] = ['low', 'medium', 'high']; - -const REASONING_POINTS = { low: 0, 
medium: 2, high: 4 } as const; -const CONTEXT_POINTS = { small: 0, medium: 1, large: 2 } as const; -const EXECUTION_POINTS = { - answer_only: 0, - code_change: 1, - command_execution: 1, - multi_step_project: 2, -} as const; -const RISK_POINTS = { low: 0, medium: 0, high: 1 } as const; - -// Deterministic mapping from the classifier taxonomy to a difficulty tier. -// Reasoning complexity dominates (weight 2x) because it is the strongest -// signal for whether a cheap model can complete the task; context size, -// execution mode and blast radius nudge borderline cases up. -// Structural subset of ClassifierOutput: importing the full type from -// ./index would create a module cycle (index re-exports this file). -export type DifficultyTierSignal = { - reasoningComplexity: 'low' | 'medium' | 'high'; - contextComplexity: 'small' | 'medium' | 'large'; - executionMode: 'answer_only' | 'code_change' | 'command_execution' | 'multi_step_project'; - riskLevel: 'low' | 'medium' | 'high'; -}; - -export function deriveDifficultyTier(classification: DifficultyTierSignal): DifficultyTier { - const score = - REASONING_POINTS[classification.reasoningComplexity] + - CONTEXT_POINTS[classification.contextComplexity] + - EXECUTION_POINTS[classification.executionMode] + - RISK_POINTS[classification.riskLevel]; - if (score <= 2) return 'low'; - if (score <= 5) return 'medium'; - return 'high'; -} diff --git a/services/auto-routing-benchmark/README.md b/services/auto-routing-benchmark/README.md index cd5a226bf6..6573a38ce4 100644 --- a/services/auto-routing-benchmark/README.md +++ b/services/auto-routing-benchmark/README.md @@ -12,9 +12,9 @@ design, invariants, and rollout/rollback. OpenRouter using the exact production classifier code (`@kilocode/auto-routing-contracts/classifier`), grades per-field, and derives the cheapest above-threshold model as the classifier winner. 
-- **Decider benchmark** — runs 76 golden tasks per candidate through the real +- **Decider benchmark** — runs 180 golden tasks per candidate through the real `kilo` CLI inside a Cloudflare Container, grades mechanically, and publishes a - per-difficulty-tier routing table. + per-taxonomy-route routing table. - Normalized results live in D1 (`BENCH_DB`); published artifacts are cached in the shared `AUTO_ROUTING_CONFIG` KV namespace (publish = delete the keys so the next read repopulates from D1). @@ -92,10 +92,12 @@ sqlite3 /tmp/.sqlite 'select id, kind, status from benchmark_runs;' ## Debugging container (decider) failures -- Each (model, 10-case chunk) gets its own container instance - (`runId:model:chunk`); CLI runs are serialized per instance (its sqlite state - is not safe under concurrent first runs). A `/warmup` call absorbs the one-time - sqlite migration before the case loop. +- Each decider run seeds bounded shard lanes across the configured models and + repetitions. A lane uses one stable container instance + (`runId:model:rep:shard`) and processes chunk `N`, then `N+shardCount`, and + so on. CLI runs are serialized per instance because its sqlite state is not + safe under concurrent first runs. A `/warmup` call absorbs the one-time sqlite + migration before the case loop. - `case_results` rows carry diagnostics: CLI exit code, output prefix, and an event tail — start there for a failing case. - `POST /admin/debug-cli {model, prompt}` runs one prompt through the container @@ -109,16 +111,16 @@ sqlite3 /tmp/.sqlite 'select id, kind, status from benchmark_runs;' ## Debugging the DLQ Failed queue messages land in `auto-routing-benchmark-dlq` after `max_retries` -(2) on `auto-routing-benchmark-jobs`. A message is one (model, chunk) job, so a -DLQ'd message means that chunk never produced results; its model's summaries for -the affected tier(s) will be missing or incomplete and `finalizeRunIfComplete` -will mark the run accordingly. 
+(6) on `auto-routing-benchmark-jobs`. A decider message is one +(model, repetition, shard, chunk) job, so a DLQ'd message means that chunk never +produced results; its model's summaries for the affected route(s) will be +missing or incomplete and `finalizeRunIfComplete` will mark the run accordingly. To inspect / handle: - **Prod**: read the DLQ from the Cloudflare dashboard (Workers → Queues → `auto-routing-benchmark-dlq`) or `wrangler queues` tooling; the message body is - the JSON job (`runId`, `model`, `chunk`, case ids). + the JSON job (`runId`, `model`, `rep`, `shard`, `shardCount`, `chunk`, case ids). - **Replay**: re-run the affected model with the admin `force` toggle once the underlying cause (OpenRouter outage, container image, bad case) is fixed — carried summaries mean only the re-triggered model is re-benchmarked. diff --git a/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql index 3db1df3b2b..d2d038e3f1 100644 --- a/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql +++ b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql @@ -32,7 +32,7 @@ CREATE TABLE `case_results` ( `run_id` text NOT NULL, `model` text NOT NULL, `case_id` text NOT NULL, - `tier` text, + `route_key` text, `score` real NOT NULL, `latency_ms` integer NOT NULL, `cost_usd` real, @@ -60,7 +60,7 @@ CREATE TABLE `config_decider_models` ( CREATE TABLE `model_summaries` ( `run_id` text NOT NULL, `model` text NOT NULL, - `tier` text NOT NULL, + `route_key` text NOT NULL, `accuracy` real NOT NULL, `avg_cost_usd` real, `avg_latency_ms` real NOT NULL, @@ -70,19 +70,19 @@ CREATE TABLE `model_summaries` ( `p95_latency_ms` real, `timeouts` integer DEFAULT 0 NOT NULL, `carried` integer DEFAULT false NOT NULL, - PRIMARY KEY(`run_id`, `model`, `tier`) + PRIMARY KEY(`run_id`, `model`, `route_key`) ); --> statement-breakpoint CREATE TABLE `routing_table_candidates` ( `run_id` text NOT 
NULL, - `tier` text NOT NULL, + `route_key` text NOT NULL, `rank` integer NOT NULL, `model` text NOT NULL, `accuracy` real NOT NULL, `avg_cost_usd` real NOT NULL, `meets_threshold` integer NOT NULL, `reasoning_effort` text, - PRIMARY KEY(`run_id`, `tier`, `rank`) + PRIMARY KEY(`run_id`, `route_key`, `rank`) ); --> statement-breakpoint CREATE TABLE `routing_tables` ( diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json index 35ce39e53e..b5614567dc 100644 --- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json +++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json @@ -1,7 +1,7 @@ { "version": "6", "dialect": "sqlite", - "id": "ba559fc8-fdd3-4c96-b116-53573fb79c74", + "id": "fa33fcda-13d6-4952-84d7-0ad12cd02fea", "prevId": "00000000-0000-0000-0000-000000000000", "tables": { "benchmark_config": { @@ -222,8 +222,8 @@ "notNull": true, "autoincrement": false }, - "tier": { - "name": "tier", + "route_key": { + "name": "route_key", "type": "text", "primaryKey": false, "notNull": false, @@ -390,8 +390,8 @@ "notNull": true, "autoincrement": false }, - "tier": { - "name": "tier", + "route_key": { + "name": "route_key", "type": "text", "primaryKey": false, "notNull": true, @@ -466,13 +466,13 @@ "indexes": {}, "foreignKeys": {}, "compositePrimaryKeys": { - "model_summaries_run_id_model_tier_pk": { + "model_summaries_run_id_model_route_key_pk": { "columns": [ "run_id", "model", - "tier" + "route_key" ], - "name": "model_summaries_run_id_model_tier_pk" + "name": "model_summaries_run_id_model_route_key_pk" } }, "uniqueConstraints": {}, @@ -488,8 +488,8 @@ "notNull": true, "autoincrement": false }, - "tier": { - "name": "tier", + "route_key": { + "name": "route_key", "type": "text", "primaryKey": false, "notNull": true, @@ -541,13 +541,13 @@ "indexes": {}, "foreignKeys": {}, "compositePrimaryKeys": { - "routing_table_candidates_run_id_tier_rank_pk": { + 
"routing_table_candidates_run_id_route_key_rank_pk": { "columns": [ "run_id", - "tier", + "route_key", "rank" ], - "name": "routing_table_candidates_run_id_tier_rank_pk" + "name": "routing_table_candidates_run_id_route_key_rank_pk" } }, "uniqueConstraints": {}, diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json index 7ee67d2c06..aa20472e95 100644 --- a/services/auto-routing-benchmark/migrations/meta/_journal.json +++ b/services/auto-routing-benchmark/migrations/meta/_journal.json @@ -5,7 +5,7 @@ { "idx": 0, "version": "6", - "when": 1781523205381, + "when": 1781688875647, "tag": "0000_absent_wallow", "breakpoints": true } diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts index 77391db7b2..8bd7a4ba15 100644 --- a/services/auto-routing-benchmark/src/admin.test.ts +++ b/services/auto-routing-benchmark/src/admin.test.ts @@ -12,7 +12,7 @@ import { CLASSIFIER_CASES } from './datasets/classifier-cases'; function makeSummary(model: string): BenchmarkModelSummary { return { model, - tier: 'low', + routeKey: 'implementation/code_generation', accuracy: 0.9, avgCostUsd: 0.001, avgLatencyMs: 100, @@ -32,7 +32,7 @@ const TEST_CONFIG: BenchmarkConfig = { ], minAccuracy: 0.7, switchCostFactor: 3, - maxConcurrency: 4, + maxConcurrency: 100, benchmarkUserId: null, classifierRepetitions: 1, deciderRepetitions: 1, @@ -471,9 +471,10 @@ describe('POST /admin/runs', () => { expect(body.enqueuedModels).toBe(1); }); - it('slices a >100-message decider fan-out into sendBatch-sized batches', async () => { - // 7 decider models × 1 rep × ceil(76/5)=16 chunks = 112 messages, which - // exceeds Cloudflare Queues' 100-per-sendBatch cap and must be sliced. + it('seeds sharded decider lanes bounded by the container cap', async () => { + // Later chunks are chained by processJob within each shard lane. 
Start + // seeds as many lanes as fit under the 100-container cap so the benchmark + // runs much faster without creating one live container per chunk. const manyModels = Array.from({ length: 7 }, (_, i) => ({ id: `vendor/model-${i}`, reasoningEffort: null, @@ -487,11 +488,74 @@ describe('POST /admin/runs', () => { const res = await authedPost('/admin/runs', { kind: 'decider' }); expect(res.status).toBe(200); - // 112 messages → two batches (100 + 12), neither over the limit. - expect(queueSendBatch).toHaveBeenCalledTimes(2); + expect(queueSendBatch).toHaveBeenCalledTimes(1); const batchSizes = queueSendBatch.mock.calls.map(([batch]) => (batch as unknown[]).length); - expect(batchSizes).toEqual([100, 12]); + expect(batchSizes).toEqual([98]); for (const size of batchSizes) expect(size).toBeLessThanOrEqual(100); + const queuedMessages = queueSendBatch.mock.calls.flatMap(([batch]) => batch as unknown[]); + for (const message of queuedMessages) { + expect(message).toMatchObject({ + body: { + kind: 'decider', + shardCount: 14, + }, + }); + } + }); + + it('keeps 10 decider models with 3 repetitions under the 100-container cap', async () => { + const manyModels = Array.from({ length: 10 }, (_, i) => ({ + id: `vendor/model-${i}`, + reasoningEffort: null, + })); + vi.mocked(getConfigRows).mockResolvedValue({ + ...TEST_CONFIG_ROWS, + config: { + ...TEST_CONFIG_ROWS.config, + benchmark_user_id: 'user-123', + decider_repetitions: 3, + }, + deciderModels: manyModels.map(m => ({ model: m.id, reasoning_effort: null })), + }); + + const res = await authedPost('/admin/runs', { kind: 'decider' }); + expect(res.status).toBe(200); + + expect(queueSendBatch).toHaveBeenCalledTimes(1); + const queuedMessages = queueSendBatch.mock.calls.flatMap(([batch]) => batch as unknown[]); + expect(queuedMessages).toHaveLength(90); + for (const message of queuedMessages) { + expect(message).toMatchObject({ + body: { + kind: 'decider', + shardCount: 3, + }, + }); + } + }); + + it('rejects decider 
starts when model repetitions alone exceed the container cap', async () => { + const tooManyModels = Array.from({ length: 21 }, (_, i) => ({ + id: `vendor/model-${i}`, + reasoningEffort: null, + })); + vi.mocked(getConfigRows).mockResolvedValue({ + ...TEST_CONFIG_ROWS, + config: { + ...TEST_CONFIG_ROWS.config, + benchmark_user_id: 'user-123', + decider_repetitions: 5, + }, + deciderModels: tooManyModels.map(m => ({ model: m.id, reasoning_effort: null })), + }); + + const res = await authedPost('/admin/runs', { kind: 'decider' }); + expect(res.status).toBe(400); + await expect(res.json()).resolves.toMatchObject({ + error: expect.stringContaining('requires at least one live container lane'), + }); + expect(insertRun).not.toHaveBeenCalled(); + expect(queueSendBatch).not.toHaveBeenCalled(); }); }); @@ -519,7 +583,7 @@ describe('GET /admin/routing-table', () => { minAccuracy: 0.7, switchCostFactor: 3, source: 'benchmark', - tiers: { low: [candidate], medium: [candidate], high: [candidate] }, + routes: { 'implementation/code_generation': [candidate] }, }; vi.mocked(getLatestRoutingTable).mockResolvedValueOnce({ table: tableData as RoutingTable, diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts index 0b95cd3a94..e266eea567 100644 --- a/services/auto-routing-benchmark/src/admin.ts +++ b/services/auto-routing-benchmark/src/admin.ts @@ -8,7 +8,13 @@ import { zodJsonValidator } from '@kilocode/worker-utils'; import type { Hono } from 'hono'; import { getBenchmarkConfig, saveBenchmarkConfig } from './config'; import { debugRunCli } from './cli-runner'; -import { fetchBenchmarkUserToken, RunAlreadyActiveError, startRun, sweepStaleRuns } from './run'; +import { + BenchmarkRunConfigError, + fetchBenchmarkUserToken, + RunAlreadyActiveError, + startRun, + sweepStaleRuns, +} from './run'; import { getClassifierWinner, getLatestRoutingTable, listRuns } from './db'; import type { HonoEnv } from './hono-env'; @@ -59,6 +65,9 @@ 
export function registerAdminRoutes(app: Hono): void { if (error instanceof RunAlreadyActiveError) { return c.json({ error: error.message }, 409); } + if (error instanceof BenchmarkRunConfigError) { + return c.json({ error: error.message }, 400); + } throw error; } } diff --git a/services/auto-routing-benchmark/src/bench-runner-container.ts b/services/auto-routing-benchmark/src/bench-runner-container.ts index a3c712c4c7..105e36ce52 100644 --- a/services/auto-routing-benchmark/src/bench-runner-container.ts +++ b/services/auto-routing-benchmark/src/bench-runner-container.ts @@ -3,7 +3,8 @@ import { Container } from '@cloudflare/containers'; // Cloudflare Container that runs the stable `kilo` CLI for decider benchmark // cases. The worker proxies POST /run to the container's HTTP server (see // container/server.mjs) via this DO. One instance is keyed per -// (runId, model, chunk) so concurrent chunks/models don't share state. +// (runId, model, rep) so chunks for the same repetition reuse CLI state without +// creating one live container per chunk. export class BenchRunnerContainer extends Container { defaultPort = 3000; sleepAfter = '2m'; @@ -11,4 +12,13 @@ export class BenchRunnerContainer extends Container { // points at the real gateway; local dev overrides it via .dev.vars so the // benchmark runs against the local apps/web instance. 
envVars = { KILO_API_URL: this.env.KILO_CLI_API_URL }; + + override async fetch(request: Request): Promise { + const url = new URL(request.url); + if (request.method === 'POST' && url.pathname === '/admin/destroy') { + await this.destroy(); + return new Response('destroyed'); + } + return super.fetch(request); + } } diff --git a/services/auto-routing-benchmark/src/cli-runner.test.ts b/services/auto-routing-benchmark/src/cli-runner.test.ts new file mode 100644 index 0000000000..c8966203e2 --- /dev/null +++ b/services/auto-routing-benchmark/src/cli-runner.test.ts @@ -0,0 +1,36 @@ +import { describe, expect, it, vi } from 'vitest'; +import { destroyDeciderCliContainer } from './cli-runner'; + +describe('destroyDeciderCliContainer', () => { + it('calls the container admin destroy endpoint for the instance name', async () => { + const fetch = vi.fn(async () => new Response('destroyed', { status: 200 })); + const idFromName = vi.fn((name: string) => `id:${name}`); + const get = vi.fn(() => ({ fetch })); + const env = { BENCH_RUNNER: { idFromName, get } } as unknown as Env; + + await destroyDeciderCliContainer(env, { instanceName: 'run:model:2' }); + + expect(idFromName).toHaveBeenCalledWith('run:model:2'); + expect(get).toHaveBeenCalledWith('id:run:model:2'); + expect(fetch).toHaveBeenCalledWith( + expect.objectContaining({ + method: 'POST', + url: 'http://container/admin/destroy', + }) + ); + }); + + it('throws when the container destroy endpoint fails', async () => { + const fetch = vi.fn(async () => new Response('nope', { status: 500 })); + const env = { + BENCH_RUNNER: { + idFromName: (name: string) => `id:${name}`, + get: () => ({ fetch }), + }, + } as unknown as Env; + + await expect(destroyDeciderCliContainer(env, { instanceName: 'run:model:2' })).rejects.toThrow( + 'container /admin/destroy failed: HTTP 500 nope' + ); + }); +}); diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts index 
9f22cb3695..de826b3a97 100644 --- a/services/auto-routing-benchmark/src/cli-runner.ts +++ b/services/auto-routing-benchmark/src/cli-runner.ts @@ -20,6 +20,19 @@ const DECIDER_CLI_TIMEOUT_MS = 180_000; const FINAL_ANSWER_SUFFIX = '\n\nIMPORTANT: Your final message must contain ONLY the answer in the exact requested format - no explanations, no preamble, no extra words.'; +export function isRetryableContainerAvailabilityError(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + const normalized = message.toLowerCase(); + return ( + normalized.includes('container /run failed: http 503') || + normalized.includes('container /warmup failed: http 503') || + normalized.includes('no container instance available') || + normalized.includes('no container instance that can be provided') || + normalized.includes('max concurrent instance count') || + normalized.includes('maximum number of running container instances exceeded') + ); +} + type ContainerRunResponse = { exitCode: number; durationMs: number; @@ -31,10 +44,10 @@ type ContainerRunResponse = { /** * Run one decider case through the `kilo` CLI inside a Cloudflare Container. * - * `instanceName` is the precomputed DO instance name (e.g. - * `${runId}:${model}:${chunk}`); the caller owns the keying so chunks/models - * map to stable instances. The CLI has no system-prompt flag, so we fold the - * system prompt into the user prompt. + * `instanceName` is the precomputed DO instance name; the caller owns the + * keying so chunks for the same model/repetition share a stable instance. The + * CLI has no system-prompt flag, so we fold the system prompt into the user + * prompt. 
*/ export async function runDeciderCaseViaCli( env: Env, @@ -141,6 +154,23 @@ export async function warmUpCliContainer( }) ); if (!response.ok) { - throw new Error(`container /warmup failed: HTTP ${response.status}`); + const detail = (await response.text().catch(() => '')).slice(0, 500); + throw new Error(`container /warmup failed: HTTP ${response.status} ${detail}`); + } +} + +export async function destroyDeciderCliContainer( + env: Env, + params: { instanceName: string } +): Promise { + const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(params.instanceName)); + const response = await stub.fetch( + new Request('http://container/admin/destroy', { + method: 'POST', + }) + ); + if (!response.ok) { + const detail = (await response.text().catch(() => '')).slice(0, 500); + throw new Error(`container /admin/destroy failed: HTTP ${response.status} ${detail}`); } } diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts index 1fb02e8de4..10e8aade79 100644 --- a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts +++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts @@ -18,18 +18,18 @@ describe('DECIDER_CASES', () => { expect(TAXONOMY_PAIRS.length).toBe(18); }); - it('has exactly 76 cases with unique ids', () => { - expect(DECIDER_CASES.length).toBe(76); + it('has exactly 180 cases with unique ids', () => { + expect(DECIDER_CASES.length).toBe(180); const ids = new Set(DECIDER_CASES.map(c => c.id)); expect(ids.size).toBe(DECIDER_CASES.length); }); - it('has at least 4 cases per (taskType, subtaskType) pair', () => { + it('has at least 10 cases per (taskType, subtaskType) pair', () => { for (const pair of TAXONOMY_PAIRS) { const count = DECIDER_CASES.filter( c => c.taskType === pair.taskType && c.subtaskType === pair.subtaskType ).length; - expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(4); + expect(count, 
`${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(10); } }); @@ -44,19 +44,6 @@ describe('DECIDER_CASES', () => { } }); - it('has at least 20 cases per tier', () => { - for (const tier of ['low', 'medium', 'high'] as const) { - expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBeGreaterThanOrEqual(20); - } - }); - - it('covers at least 4 distinct task types per tier', () => { - for (const tier of ['low', 'medium', 'high'] as const) { - const taskTypes = new Set(DECIDER_CASES.filter(c => c.tier === tier).map(c => c.taskType)); - expect(taskTypes.size, tier).toBeGreaterThanOrEqual(4); - } - }); - it('has compilable regex patterns', () => { for (const c of DECIDER_CASES) { const check = c.check; diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts index fcb82a223f..3760bc1624 100644 --- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts +++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts @@ -1,13 +1,8 @@ -import type { - ClassifierSubtaskType, - ClassifierTaskType, - DifficultyTier, -} from '@kilocode/auto-routing-contracts'; +import type { ClassifierSubtaskType, ClassifierTaskType } from '@kilocode/auto-routing-contracts'; import type { DeciderCheck } from '../grading'; export type DeciderCase = { id: string; // stable slug, e.g. 'impl-gen-squares-array' (--) - tier: DifficultyTier; taskType: ClassifierTaskType; subtaskType: ClassifierSubtaskType; systemPrompt: string; @@ -28,19 +23,15 @@ const AGENT_SYS = // noise (fences/case/whitespace) but never wrong values. For json_equal cases // the prompt pins the exact key set in the same order as the expected value // (the comparison is JSON.stringify-based and order-sensitive). 
Each case -// carries exactly one difficulty tier: low = mechanical lookups / trivial -// evaluation, medium = multi-step reasoning / off-by-one traps / spec -// application, high = deep tracing / multi-constraint puzzles / subtle -// semantics. agentic_execution cases are self-contained tasks performed with -// file/terminal tools inside the benchmark container (node:22-slim, no repo, -// no network) and every command involved is deterministic there. +// belongs to exactly one (taskType, subtaskType) pair. agentic_execution cases +// are self-contained tasks performed with file/terminal tools inside the benchmark +// container (node:22-slim, no repo, no network) and every command involved is deterministic there. export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- // implementation / feature_development // --------------------------------------------------------------------------- { id: 'impl-feat-ternary-parity', - tier: 'low', taskType: 'implementation', subtaskType: 'feature_development', systemPrompt: CODE_SYS, @@ -50,7 +41,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-feat-array-pipeline', - tier: 'low', taskType: 'implementation', subtaskType: 'feature_development', systemPrompt: CODE_SYS, @@ -60,7 +50,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-feat-closure-counter', - tier: 'medium', taskType: 'implementation', subtaskType: 'feature_development', systemPrompt: CODE_SYS, @@ -70,7 +59,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-feat-recursion-fib', - tier: 'medium', taskType: 'implementation', subtaskType: 'feature_development', systemPrompt: CODE_SYS, @@ -80,7 +68,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-feat-this-binding', - tier: 'high', taskType: 'implementation', subtaskType: 'feature_development', systemPrompt: CODE_SYS, @@ -94,7 +81,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // 
--------------------------------------------------------------------------- { id: 'impl-gen-package-manifest', - tier: 'low', taskType: 'implementation', subtaskType: 'code_generation', systemPrompt: CODE_SYS, @@ -104,7 +90,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-gen-squares-array', - tier: 'low', taskType: 'implementation', subtaskType: 'code_generation', systemPrompt: CODE_SYS, @@ -114,7 +99,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-gen-no-consecutive-ones', - tier: 'medium', taskType: 'implementation', subtaskType: 'code_generation', systemPrompt: CODE_SYS, @@ -124,7 +108,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-gen-two-ones-strings', - tier: 'high', taskType: 'implementation', subtaskType: 'code_generation', systemPrompt: CODE_SYS, @@ -141,7 +124,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'impl-test-sort-expectation', - tier: 'low', taskType: 'implementation', subtaskType: 'test_creation', systemPrompt: CODE_SYS, @@ -151,7 +133,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-test-upper-expectation', - tier: 'low', taskType: 'implementation', subtaskType: 'test_creation', systemPrompt: CODE_SYS, @@ -161,7 +142,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-test-mock-call-count', - tier: 'medium', taskType: 'implementation', subtaskType: 'test_creation', systemPrompt: CODE_SYS, @@ -171,7 +151,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'impl-test-trailing-zeros', - tier: 'high', taskType: 'implementation', subtaskType: 'test_creation', systemPrompt: CODE_SYS, @@ -185,7 +164,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'debug-fix-parseint-suffix', - tier: 'low', taskType: 'debugging', 
subtaskType: 'bug_fixing', systemPrompt: CODE_SYS, @@ -195,7 +173,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-fix-binary-search', - tier: 'medium', taskType: 'debugging', subtaskType: 'bug_fixing', systemPrompt: CODE_SYS, @@ -207,7 +184,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // 'pages' rather than 'pagination' so the id never collides with the // classifier dataset's debug-fix-pagination-slice in shared telemetry. id: 'debug-fix-pages-slice', - tier: 'medium', taskType: 'debugging', subtaskType: 'bug_fixing', systemPrompt: CODE_SYS, @@ -217,7 +193,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-fix-regex-lastindex', - tier: 'high', taskType: 'debugging', subtaskType: 'bug_fixing', systemPrompt: CODE_SYS, @@ -231,7 +206,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'debug-repair-compound-assign', - tier: 'low', taskType: 'debugging', subtaskType: 'test_repair', systemPrompt: CODE_SYS, @@ -241,7 +215,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-repair-date-format', - tier: 'medium', taskType: 'debugging', subtaskType: 'test_repair', systemPrompt: CODE_SYS, @@ -251,7 +224,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-repair-entries-shape', - tier: 'medium', taskType: 'debugging', subtaskType: 'test_repair', systemPrompt: CODE_SYS, @@ -267,7 +239,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-repair-float-sum', - tier: 'high', taskType: 'debugging', subtaskType: 'test_repair', systemPrompt: CODE_SYS, @@ -281,7 +252,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'debug-rca-async-order', - tier: 'medium', taskType: 'debugging', subtaskType: 'root_cause_analysis', systemPrompt: CODE_SYS, @@ -291,7 +261,6 @@ 
export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-rca-shared-ref', - tier: 'medium', taskType: 'debugging', subtaskType: 'root_cause_analysis', systemPrompt: CODE_SYS, @@ -301,7 +270,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-rca-closure-loop-var', - tier: 'high', taskType: 'debugging', subtaskType: 'root_cause_analysis', systemPrompt: CODE_SYS, @@ -311,7 +279,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'debug-rca-float-equality', - tier: 'high', taskType: 'debugging', subtaskType: 'root_cause_analysis', systemPrompt: CODE_SYS, @@ -325,7 +292,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'refactor-cleanup-loop-to-reduce', - tier: 'low', taskType: 'refactoring', subtaskType: 'code_cleanup', systemPrompt: CODE_SYS, @@ -335,7 +301,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-cleanup-extract-helper', - tier: 'low', taskType: 'refactoring', subtaskType: 'code_cleanup', systemPrompt: CODE_SYS, @@ -345,7 +310,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-cleanup-map-equivalent', - tier: 'medium', taskType: 'refactoring', subtaskType: 'code_cleanup', systemPrompt: CODE_SYS, @@ -355,7 +319,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-cleanup-short-circuit', - tier: 'high', taskType: 'refactoring', subtaskType: 'code_cleanup', systemPrompt: CODE_SYS, @@ -369,7 +332,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'refactor-arch-import-updates', - tier: 'low', taskType: 'refactoring', subtaskType: 'architecture_improvement', systemPrompt: CODE_SYS, @@ -379,7 +341,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-arch-layer-depth', - tier: 'medium', taskType: 'refactoring', 
subtaskType: 'architecture_improvement', systemPrompt: CODE_SYS, @@ -389,7 +350,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-arch-interface-edges', - tier: 'medium', taskType: 'refactoring', subtaskType: 'architecture_improvement', systemPrompt: CODE_SYS, @@ -399,7 +359,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-arch-cycle-cut', - tier: 'high', taskType: 'refactoring', subtaskType: 'architecture_improvement', systemPrompt: CODE_SYS, @@ -413,7 +372,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'refactor-migrate-substr-slice', - tier: 'low', taskType: 'refactoring', subtaskType: 'migration', systemPrompt: CODE_SYS, @@ -423,7 +381,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-migrate-promise-chain', - tier: 'medium', taskType: 'refactoring', subtaskType: 'migration', systemPrompt: CODE_SYS, @@ -433,7 +390,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-migrate-strict-equality', - tier: 'medium', taskType: 'refactoring', subtaskType: 'migration', systemPrompt: CODE_SYS, @@ -443,7 +399,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'refactor-migrate-var-to-let', - tier: 'high', taskType: 'refactoring', subtaskType: 'migration', systemPrompt: CODE_SYS, @@ -456,18 +411,16 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // planning_design / architecture_design // --------------------------------------------------------------------------- { - id: 'plan-arch-three-tier', - tier: 'low', + id: 'plan-arch-three-layer', taskType: 'planning_design', subtaskType: 'architecture_design', systemPrompt: SYS_SYS, userPrompt: - 'In a classic three-tier architecture with presentation, business, and data tiers, which tier should contain the SQL queries? 
Answer with only one word: presentation, business, or data.', + 'In a classic three-layer architecture with presentation, business, and data layers, which layer should contain the SQL queries? Answer with only one word: presentation, business, or data.', check: { kind: 'exact', value: 'data' }, }, { id: 'plan-arch-call-chain', - tier: 'medium', taskType: 'planning_design', subtaskType: 'architecture_design', systemPrompt: SYS_SYS, @@ -477,7 +430,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-arch-dependency-rules', - tier: 'medium', taskType: 'planning_design', subtaskType: 'architecture_design', systemPrompt: SYS_SYS, @@ -487,7 +439,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-arch-latency-budget', - tier: 'high', taskType: 'planning_design', subtaskType: 'architecture_design', systemPrompt: SYS_SYS, @@ -501,7 +452,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'plan-steps-rollout-order', - tier: 'low', taskType: 'planning_design', subtaskType: 'technical_planning', systemPrompt: SYS_SYS, @@ -511,7 +461,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-steps-batch-count', - tier: 'medium', taskType: 'planning_design', subtaskType: 'technical_planning', systemPrompt: SYS_SYS, @@ -521,7 +470,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-steps-deploy-waves', - tier: 'medium', taskType: 'planning_design', subtaskType: 'technical_planning', systemPrompt: SYS_SYS, @@ -531,7 +479,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-steps-critical-path', - tier: 'high', taskType: 'planning_design', subtaskType: 'technical_planning', systemPrompt: SYS_SYS, @@ -545,7 +492,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'plan-system-write-quorum', - 
tier: 'low', taskType: 'planning_design', subtaskType: 'system_design', systemPrompt: SYS_SYS, @@ -555,7 +501,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-system-rate-limit-window', - tier: 'medium', taskType: 'planning_design', subtaskType: 'system_design', systemPrompt: SYS_SYS, @@ -565,7 +510,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-system-replica-availability', - tier: 'medium', taskType: 'planning_design', subtaskType: 'system_design', systemPrompt: SYS_SYS, @@ -575,7 +519,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-system-cache-staleness', - tier: 'high', taskType: 'planning_design', subtaskType: 'system_design', systemPrompt: SYS_SYS, @@ -585,7 +528,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-system-queue-trace', - tier: 'high', taskType: 'planning_design', subtaskType: 'system_design', systemPrompt: SYS_SYS, @@ -595,7 +537,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-system-deadlock-order', - tier: 'high', taskType: 'planning_design', subtaskType: 'system_design', systemPrompt: SYS_SYS, @@ -605,7 +546,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'plan-system-txn-isolation', - tier: 'high', taskType: 'planning_design', subtaskType: 'system_design', systemPrompt: SYS_SYS, @@ -619,7 +559,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'invest-repo-test-file-count', - tier: 'low', taskType: 'investigation', subtaskType: 'repo_exploration', systemPrompt: CODE_SYS, @@ -629,7 +568,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-repo-glob-match', - tier: 'medium', taskType: 'investigation', subtaskType: 'repo_exploration', systemPrompt: CODE_SYS, @@ -639,7 +577,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-repo-grep-case', - tier: 
'medium', taskType: 'investigation', subtaskType: 'repo_exploration', systemPrompt: CODE_SYS, @@ -649,7 +586,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-repo-gitignore', - tier: 'high', taskType: 'investigation', subtaskType: 'repo_exploration', systemPrompt: CODE_SYS, @@ -663,7 +599,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'invest-code-char-count', - tier: 'low', taskType: 'investigation', subtaskType: 'codebase_understanding', systemPrompt: CODE_SYS, @@ -673,7 +608,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-code-object-keys', - tier: 'low', taskType: 'investigation', subtaskType: 'codebase_understanding', systemPrompt: CODE_SYS, @@ -683,7 +617,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-code-regex-groups', - tier: 'medium', taskType: 'investigation', subtaskType: 'codebase_understanding', systemPrompt: CODE_SYS, @@ -693,7 +626,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-code-collatz-depth', - tier: 'high', taskType: 'investigation', subtaskType: 'codebase_understanding', systemPrompt: CODE_SYS, @@ -707,7 +639,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'invest-ext-http-created', - tier: 'low', taskType: 'investigation', subtaskType: 'external_research', systemPrompt: @@ -718,7 +649,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-ext-utf8-euro', - tier: 'medium', taskType: 'investigation', subtaskType: 'external_research', systemPrompt: SYS_SYS, @@ -728,7 +658,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'invest-ext-semver-caret', - tier: 'medium', taskType: 'investigation', subtaskType: 'external_research', systemPrompt: CODE_SYS, @@ -738,7 +667,6 @@ export const DECIDER_CASES: 
readonly DeciderCase[] = [ }, { id: 'invest-ext-json-spec', - tier: 'high', taskType: 'investigation', subtaskType: 'external_research', systemPrompt: CODE_SYS, @@ -752,7 +680,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'agentic-tool-json-read', - tier: 'low', taskType: 'agentic_execution', subtaskType: 'tool_usage', systemPrompt: AGENT_SYS, @@ -762,7 +689,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-tool-notes-count', - tier: 'low', taskType: 'agentic_execution', subtaskType: 'tool_usage', systemPrompt: AGENT_SYS, @@ -772,7 +698,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-tool-log-grep', - tier: 'medium', taskType: 'agentic_execution', subtaskType: 'tool_usage', systemPrompt: AGENT_SYS, @@ -782,7 +707,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-tool-csv-filter-sum', - tier: 'high', taskType: 'agentic_execution', subtaskType: 'tool_usage', systemPrompt: AGENT_SYS, @@ -796,7 +720,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'agentic-term-node-major', - tier: 'low', taskType: 'agentic_execution', subtaskType: 'terminal_operations', systemPrompt: AGENT_SYS, @@ -806,7 +729,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-term-wc-lines', - tier: 'low', taskType: 'agentic_execution', subtaskType: 'terminal_operations', systemPrompt: AGENT_SYS, @@ -816,7 +738,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-term-sort-pipeline', - tier: 'medium', taskType: 'agentic_execution', subtaskType: 'terminal_operations', systemPrompt: AGENT_SYS, @@ -826,7 +747,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-term-sha256-prefix', - tier: 'high', taskType: 'agentic_execution', subtaskType: 
'terminal_operations', systemPrompt: AGENT_SYS, @@ -840,7 +760,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ // --------------------------------------------------------------------------- { id: 'agentic-multi-seq-sum', - tier: 'medium', taskType: 'agentic_execution', subtaskType: 'multi_step_execution', systemPrompt: AGENT_SYS, @@ -850,7 +769,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-multi-node-script', - tier: 'medium', taskType: 'agentic_execution', subtaskType: 'multi_step_execution', systemPrompt: AGENT_SYS, @@ -860,7 +778,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-multi-find-count', - tier: 'medium', taskType: 'agentic_execution', subtaskType: 'multi_step_execution', systemPrompt: AGENT_SYS, @@ -870,7 +787,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ }, { id: 'agentic-multi-json-transform', - tier: 'high', taskType: 'agentic_execution', subtaskType: 'multi_step_execution', systemPrompt: AGENT_SYS, @@ -878,4 +794,943 @@ export const DECIDER_CASES: readonly DeciderCase[] = [ 'Create a file /tmp/bench-in.json containing exactly this JSON array: [3, 1, 4, 1, 5, 9, 2, 6, 5, 3]. Then write and run a Node.js script that reads the file, computes the sum of the distinct values in the array, and prints it. Answer with only the number.', check: { kind: 'exact', value: '30' }, }, + // --------------------------------------------------------------------------- + // Supplemental taxonomy-route coverage + // --------------------------------------------------------------------------- + { + id: 'supp-impl-feat-clamp', + taskType: 'implementation', + subtaskType: 'feature_development', + systemPrompt: CODE_SYS, + userPrompt: + 'Implement mentally: clamp(14, 3, 9) returns min when low, max when high, otherwise value. 
Answer with only the returned number.', + check: { kind: 'exact', value: '9' }, + }, + { + id: 'supp-impl-feat-join-slugs', + taskType: 'implementation', + subtaskType: 'feature_development', + systemPrompt: CODE_SYS, + userPrompt: + 'What should slug(["Kilo", "Code", "Cloud"]) return if it lowercases words and joins them with hyphens? Answer only the return value.', + check: { kind: 'exact', value: 'kilo-code-cloud' }, + }, + { + id: 'supp-impl-code-nullish', + taskType: 'implementation', + subtaskType: 'code_generation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this print? Answer with only the output.\n\nconst x = null ?? "fallback";\nconsole.log(x);', + check: { kind: 'exact', value: 'fallback' }, + }, + { + id: 'supp-impl-code-set-size', + taskType: 'implementation', + subtaskType: 'code_generation', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer only the number.\n\nconst s = new Set(["a", "b", "a", "c"]);\nconsole.log(s.size);', + check: { kind: 'exact', value: '3' }, + }, + { + id: 'supp-impl-test-boundary-count', + taskType: 'implementation', + subtaskType: 'test_creation', + systemPrompt: CODE_SYS, + userPrompt: + 'A clamp(value, min, max) function needs tests for below min, at min, inside range, at max, and above max. How many cases is that? Answer only the number.', + check: { kind: 'exact', value: '5' }, + }, + { + id: 'supp-impl-test-error-case', + taskType: 'implementation', + subtaskType: 'test_creation', + systemPrompt: CODE_SYS, + userPrompt: + 'For parsePort(input), which invalid input should a test include: "3000", "0", or "abc"? Answer only the invalid value.', + check: { kind: 'exact', value: 'abc' }, + }, + { + id: 'supp-debug-bug-off-by-one', + taskType: 'debugging', + subtaskType: 'bug_fixing', + systemPrompt: CODE_SYS, + userPrompt: + 'A loop uses i <= items.length and reads items[i]. What operator should replace <= to avoid reading past the end? 
Answer only the operator.', + check: { kind: 'exact', value: '<' }, + }, + { + id: 'supp-debug-bug-json-parse', + taskType: 'debugging', + subtaskType: 'bug_fixing', + systemPrompt: CODE_SYS, + userPrompt: + 'JSON.parse("{bad}") throws. Should the fix catch SyntaxError or TypeError? Answer only the error class.', + check: { kind: 'exact', value: 'SyntaxError' }, + }, + { + id: 'supp-debug-test-expected', + taskType: 'debugging', + subtaskType: 'test_repair', + systemPrompt: CODE_SYS, + userPrompt: + 'A function returns ["a", "b"]. The failing test expects ["b", "a"] but order is part of the contract. Which expected array is correct? Answer JSON only.', + check: { kind: 'json_equal', value: ['a', 'b'] }, + }, + { + id: 'supp-debug-test-timeout', + taskType: 'debugging', + subtaskType: 'test_repair', + systemPrompt: CODE_SYS, + userPrompt: + 'A test waits for text that appears after clicking Save, but it never clicks Save. What single action is missing? Answer only the verb.', + check: { kind: 'exact', value: 'click' }, + }, + { + id: 'supp-debug-root-cause-cache', + taskType: 'debugging', + subtaskType: 'root_cause_analysis', + systemPrompt: SYS_SYS, + userPrompt: + 'A value updates in the database but the page shows the old value until cache expiry. Which layer is the likely root cause: database, cache, or compiler? Answer one word.', + check: { kind: 'exact', value: 'cache' }, + }, + { + id: 'supp-debug-root-cause-env', + taskType: 'debugging', + subtaskType: 'root_cause_analysis', + systemPrompt: SYS_SYS, + userPrompt: + 'Local requests hit port 8810 but the worker config says the target service runs on 8814. What kind of mismatch is this? Answer one word.', + check: { kind: 'exact', value: 'port' }, + }, + { + id: 'supp-refactor-cleanup-dead-branch', + taskType: 'refactoring', + subtaskType: 'code_cleanup', + systemPrompt: CODE_SYS, + userPrompt: + 'A condition checks if status === "done" inside a branch where status is already known to be "pending". 
What should happen to that inner branch? Answer one word.', + check: { kind: 'exact', value: 'remove' }, + }, + { + id: 'supp-refactor-cleanup-name', + taskType: 'refactoring', + subtaskType: 'code_cleanup', + systemPrompt: CODE_SYS, + userPrompt: + 'Which name is clearer for a boolean: data, flag, or hasErrors? Answer only the best name.', + check: { kind: 'exact', value: 'hasErrors' }, + }, + { + id: 'supp-refactor-arch-shared-helper', + taskType: 'refactoring', + subtaskType: 'architecture_improvement', + systemPrompt: SYS_SYS, + userPrompt: + 'Three modules duplicate the same pure validation logic. Should the shared code be a pure helper, global mutable state, or copied again? Answer two words.', + check: { kind: 'exact', value: 'pure helper' }, + }, + { + id: 'supp-refactor-arch-boundary', + taskType: 'refactoring', + subtaskType: 'architecture_improvement', + systemPrompt: SYS_SYS, + userPrompt: + 'A UI component directly opens database connections. Which boundary should own the database call: UI, server, or CSS? Answer one word.', + check: { kind: 'exact', value: 'server' }, + }, + { + id: 'supp-refactor-migration-column', + taskType: 'refactoring', + subtaskType: 'migration', + systemPrompt: SYS_SYS, + userPrompt: + 'A migration renames user_name to display_name without changing values. What SQL operation is this: INSERT, RENAME COLUMN, or DROP TABLE? Answer only the operation.', + check: { kind: 'exact', value: 'RENAME COLUMN' }, + }, + { + id: 'supp-refactor-migration-backfill', + taskType: 'refactoring', + subtaskType: 'migration', + systemPrompt: SYS_SYS, + userPrompt: + 'After adding a non-null slug column to existing rows, what data operation fills slug for old rows? 
Answer one word.', + check: { kind: 'exact', value: 'backfill' }, + }, + { + id: 'supp-plan-arch-cache-layer', + taskType: 'planning_design', + subtaskType: 'architecture_design', + systemPrompt: SYS_SYS, + userPrompt: + 'For read-heavy config that changes rarely, should the hot path read every request from origin storage or use a short cache? Answer two words.', + check: { kind: 'exact', value: 'short cache' }, + }, + { + id: 'supp-plan-arch-queue', + taskType: 'planning_design', + subtaskType: 'architecture_design', + systemPrompt: SYS_SYS, + userPrompt: + 'A long-running benchmark exceeds request time limits. Which primitive should carry the work asynchronously: queue, cookie, or CSS? Answer one word.', + check: { kind: 'exact', value: 'queue' }, + }, + { + id: 'supp-plan-technical-rollout', + taskType: 'planning_design', + subtaskType: 'technical_planning', + systemPrompt: SYS_SYS, + userPrompt: + 'Order these rollout steps: deploy code, run migration, monitor logs. Which step should be last? Answer two words.', + check: { kind: 'exact', value: 'monitor logs' }, + }, + { + id: 'supp-plan-technical-risk', + taskType: 'planning_design', + subtaskType: 'technical_planning', + systemPrompt: SYS_SYS, + userPrompt: + 'A plan changes a shared API contract. Should verification focus on one file only or all direct consumers? Answer three words.', + check: { kind: 'exact', value: 'all direct consumers' }, + }, + { + id: 'supp-plan-system-slo', + taskType: 'planning_design', + subtaskType: 'system_design', + systemPrompt: SYS_SYS, + userPrompt: + 'A service retries failed jobs and eventually sends hopeless jobs to a separate queue. What is that queue commonly called? 
Answer only the abbreviation.', + check: { kind: 'exact', value: 'DLQ' }, + }, + { + id: 'supp-plan-system-idempotency', + taskType: 'planning_design', + subtaskType: 'system_design', + systemPrompt: SYS_SYS, + userPrompt: + 'If the same queue message may be delivered twice, should writes be idempotent or random? Answer one word.', + check: { kind: 'exact', value: 'idempotent' }, + }, + { + id: 'supp-invest-repo-rg', + taskType: 'investigation', + subtaskType: 'repo_exploration', + systemPrompt: SYS_SYS, + userPrompt: + 'Which command is the fastest common choice to search a repository for the string saveRoutingTable: rg, cat, or date? Answer one word.', + check: { kind: 'exact', value: 'rg' }, + }, + { + id: 'supp-invest-repo-package', + taskType: 'investigation', + subtaskType: 'repo_exploration', + systemPrompt: SYS_SYS, + userPrompt: + 'In a pnpm monorepo, which file usually names a package and its scripts: package.json or README.md? Answer only the file name.', + check: { kind: 'exact', value: 'package.json' }, + }, + { + id: 'supp-invest-code-flow', + taskType: 'investigation', + subtaskType: 'codebase_understanding', + systemPrompt: SYS_SYS, + userPrompt: + 'A handler calls validateInput, then saveRow, then enqueueJob. Which function creates the async follow-up? Answer only the function name.', + check: { kind: 'exact', value: 'enqueueJob' }, + }, + { + id: 'supp-invest-code-owner', + taskType: 'investigation', + subtaskType: 'codebase_understanding', + systemPrompt: SYS_SYS, + userPrompt: + 'If a type is imported from @kilocode/auto-routing-contracts, which package owns that type? Answer only the package name.', + check: { kind: 'exact', value: '@kilocode/auto-routing-contracts' }, + }, + { + id: 'supp-invest-research-source', + taskType: 'investigation', + subtaskType: 'external_research', + systemPrompt: SYS_SYS, + userPrompt: + 'For a question about current Cloudflare Workers limits, should you prefer official docs or an old blog post? 
Answer two words.', + check: { kind: 'exact', value: 'official docs' }, + }, + { + id: 'supp-invest-research-date', + taskType: 'investigation', + subtaskType: 'external_research', + systemPrompt: SYS_SYS, + userPrompt: + 'When comparing two search results for current pricing, which field matters most: publish date, font size, or title length? Answer two words.', + check: { kind: 'exact', value: 'publish date' }, + }, + { + id: 'supp-agent-tool-json-file', + taskType: 'agentic_execution', + subtaskType: 'tool_usage', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-tool.json containing exactly {"a":2,"b":5}. Then read it and answer with only the sum of a and b.', + check: { kind: 'exact', value: '7' }, + }, + { + id: 'supp-agent-tool-grep-count', + taskType: 'agentic_execution', + subtaskType: 'tool_usage', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-tool.txt with lines alpha, beta, alphabet, gamma. Count lines containing alpha and answer only the number.', + check: { kind: 'exact', value: '2' }, + }, + { + id: 'supp-agent-term-node-eval', + taskType: 'agentic_execution', + subtaskType: 'terminal_operations', + systemPrompt: AGENT_SYS, + userPrompt: 'Run node -e "console.log(6*7)" in the terminal and answer with only the output.', + check: { kind: 'exact', value: '42' }, + }, + { + id: 'supp-agent-term-pwd-base', + taskType: 'agentic_execution', + subtaskType: 'terminal_operations', + systemPrompt: AGENT_SYS, + userPrompt: + 'Run pwd in the terminal. If it ends with /app, answer app; otherwise answer other. Answer one word.', + check: { kind: 'regex', pattern: '^(app|other)$' }, + }, + { + id: 'supp-agent-multi-script', + taskType: 'agentic_execution', + subtaskType: 'multi_step_execution', + systemPrompt: AGENT_SYS, + userPrompt: + 'Write /tmp/bench-multi.js that prints ["k","i","l","o"].join(""). 
Run it with node and answer with only what it prints.', + check: { kind: 'exact', value: 'kilo' }, + }, + { + id: 'supp-agent-multi-files', + taskType: 'agentic_execution', + subtaskType: 'multi_step_execution', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-a.txt containing 11 and /tmp/bench-b.txt containing 31. Read both files, add the numbers, and answer only the sum.', + check: { kind: 'exact', value: '42' }, + }, + + // --------------------------------------------------------------------------- + // Additional taxonomy-route coverage to keep every pair at 10+ cases + // --------------------------------------------------------------------------- + { + id: 'supp2-impl-feat-nullish-total', + taskType: 'implementation', + subtaskType: 'feature_development', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with only the number.\n\nconst input = { count: null };\nconst total = (input.count ?? 4) + 6;\nconsole.log(total);', + check: { kind: 'exact', value: '10' }, + }, + { + id: 'supp2-impl-feat-spread-merge', + taskType: 'implementation', + subtaskType: 'feature_development', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with the exact output line only.\n\nconst base = { a: 1, b: 2 };\nconst next = { ...base, b: 5, c: 8 };\nconsole.log(Object.keys(next).join(","));', + check: { kind: 'exact', value: 'a,b,c' }, + }, + { + id: 'supp2-impl-feat-set-size', + taskType: 'implementation', + subtaskType: 'feature_development', + systemPrompt: CODE_SYS, + userPrompt: + 'What does this JavaScript print? Answer with only the number.\n\nconst tags = new Set(["api", "web", "api", "cli"]);\nconsole.log(tags.size);', + check: { kind: 'exact', value: '3' }, + }, + { + id: 'supp2-impl-gen-config-object', + taskType: 'implementation', + subtaskType: 'code_generation', + systemPrompt: CODE_SYS, + userPrompt: + 'Generate a config fixture. 
Reply with only a JSON object with exactly the keys "enabled" and "retries" in that order, where enabled is true and retries is 3.', + check: { kind: 'json_equal', value: { enabled: true, retries: 3 } }, + }, + { + id: 'supp2-impl-gen-primes-array', + taskType: 'implementation', + subtaskType: 'code_generation', + systemPrompt: CODE_SYS, + userPrompt: + 'Generate a test fixture: a JSON array containing the prime numbers less than 12, in increasing order. Reply with only the JSON array.', + check: { kind: 'json_equal', value: [2, 3, 5, 7, 11] }, + }, + { + id: 'supp2-impl-gen-user-slug', + taskType: 'implementation', + subtaskType: 'code_generation', + systemPrompt: CODE_SYS, + userPrompt: + 'Generate a slug for the title "Ship Fast, Stay Safe!". Reply with only the lowercase slug.', + check: { kind: 'exact', value: 'ship-fast-stay-safe' }, + }, + { + id: 'supp2-impl-gen-initials-object', + taskType: 'implementation', + subtaskType: 'code_generation', + systemPrompt: CODE_SYS, + userPrompt: + 'Generate a fixture. Reply with only a JSON object with exactly the keys "name" and "initials" in that order, where name is "Ada Lovelace" and initials is "AL".', + check: { kind: 'json_equal', value: { name: 'Ada Lovelace', initials: 'AL' } }, + }, + { + id: 'supp2-impl-test-array-length', + taskType: 'implementation', + subtaskType: 'test_creation', + systemPrompt: CODE_SYS, + userPrompt: + 'You are writing a unit test. What number makes this assertion pass? Answer with only the number.\n\nexpect(["red", "blue", "green"].length).toBe(?)', + check: { kind: 'exact', value: '3' }, + }, + { + id: 'supp2-impl-test-trim-expectation', + taskType: 'implementation', + subtaskType: 'test_creation', + systemPrompt: CODE_SYS, + userPrompt: + 'You are writing a unit test. What exact string makes this assertion pass? 
Answer with only the string.\n\nexpect(" done\\n".trim()).toBe(?)', + check: { kind: 'exact', value: 'done' }, + }, + { + id: 'supp2-impl-test-map-output', + taskType: 'implementation', + subtaskType: 'test_creation', + systemPrompt: CODE_SYS, + userPrompt: + 'You are writing a unit test. What JSON array should be expected?\n\n[2, 4, 6].map(n => n / 2)', + check: { kind: 'json_equal', value: [1, 2, 3] }, + }, + { + id: 'supp2-impl-test-url-search-param', + taskType: 'implementation', + subtaskType: 'test_creation', + systemPrompt: CODE_SYS, + userPrompt: + 'You are writing a unit test. What value should this assertion expect? Answer with the exact string only.\n\nnew URL("https://example.test/path?mode=fast").searchParams.get("mode")', + check: { kind: 'exact', value: 'fast' }, + }, + { + id: 'supp2-debug-bug-loop-bound', + taskType: 'debugging', + subtaskType: 'bug_fixing', + systemPrompt: CODE_SYS, + userPrompt: + 'A loop should visit indexes 0, 1, and 2 of a 3-item array. Which comparison operator should the loop use with i and length: < or <=? Answer only the operator.', + check: { kind: 'exact', value: '<' }, + }, + { + id: 'supp2-debug-bug-negated-guard', + taskType: 'debugging', + subtaskType: 'bug_fixing', + systemPrompt: CODE_SYS, + userPrompt: + 'A guard should return early when user is missing. Complete the condition: if (___user) return "anonymous"; Answer with only the missing operator.', + check: { kind: 'exact', value: '!' }, + }, + { + id: 'supp2-debug-bug-assignment-condition', + taskType: 'debugging', + subtaskType: 'bug_fixing', + systemPrompt: CODE_SYS, + userPrompt: + 'A condition accidentally uses = instead of comparing status to "ready". Which operator should replace = for strict comparison? 
Answer only the operator.', + check: { kind: 'exact', value: '===' }, + }, + { + id: 'supp2-debug-bug-missing-await', + taskType: 'debugging', + subtaskType: 'bug_fixing', + systemPrompt: CODE_SYS, + userPrompt: + 'An async function returns Promise { <pending> } where the resolved value was expected. What keyword is missing before the promise call? Answer one word.', + check: { kind: 'exact', value: 'await' }, + }, + { + id: 'supp2-debug-test-boolean-expect', + taskType: 'debugging', + subtaskType: 'test_repair', + systemPrompt: CODE_SYS, + userPrompt: + 'A test expected isAdmin("owner") to be false, but the fixed function correctly returns true. What boolean should the test expect? Answer one word.', + check: { kind: 'exact', value: 'true' }, + }, + { + id: 'supp2-debug-test-error-message', + taskType: 'debugging', + subtaskType: 'test_repair', + systemPrompt: CODE_SYS, + userPrompt: + 'A validation test expected "bad input"; the implementation now intentionally throws "missing email". What exact message should the repaired test expect?', + check: { kind: 'exact', value: 'missing email' }, + }, + { + id: 'supp2-debug-test-json-shape', + taskType: 'debugging', + subtaskType: 'test_repair', + systemPrompt: CODE_SYS, + userPrompt: + 'A response fixture changed from {ok:true} to {status:"ok"}. Reply with only the new expected JSON object.', + check: { kind: 'json_equal', value: { status: 'ok' } }, + }, + { + id: 'supp2-debug-test-async-resolve', + taskType: 'debugging', + subtaskType: 'test_repair', + systemPrompt: CODE_SYS, + userPrompt: + 'A test should assert that fetchName() resolves to "Kilo". Which matcher should be used before toBe("Kilo"): resolves or rejects? Answer one word.', + check: { kind: 'exact', value: 'resolves' }, + }, + { + id: 'supp2-debug-rca-unset-secret', + taskType: 'debugging', + subtaskType: 'root_cause_analysis', + systemPrompt: SYS_SYS, + userPrompt: + 'A deploy works locally but production calls fail with "missing OPENROUTER_API_KEY". 
Which category is the root cause: secret, schema, or css? Answer one word.', + check: { kind: 'exact', value: 'secret' }, + }, + { + id: 'supp2-debug-rca-race-condition', + taskType: 'debugging', + subtaskType: 'root_cause_analysis', + systemPrompt: SYS_SYS, + userPrompt: + 'Two workers update the same counter concurrently and one increment disappears. What kind of bug is this? Answer two words.', + check: { kind: 'exact', value: 'race condition' }, + }, + { + id: 'supp2-debug-rca-cache-key', + taskType: 'debugging', + subtaskType: 'root_cause_analysis', + systemPrompt: SYS_SYS, + userPrompt: + 'Two users see each other cached results because the cache key omits userId. Which part is wrong: cache key, database type, or font? Answer two words.', + check: { kind: 'exact', value: 'cache key' }, + }, + { + id: 'supp2-debug-rca-timeout', + taskType: 'debugging', + subtaskType: 'root_cause_analysis', + systemPrompt: SYS_SYS, + userPrompt: + 'A request always fails after exactly 30 seconds while the downstream job completes at 45 seconds. What limit is most likely being hit? Answer one word.', + check: { kind: 'exact', value: 'timeout' }, + }, + { + id: 'supp2-refactor-cleanup-unused-import', + taskType: 'refactoring', + subtaskType: 'code_cleanup', + systemPrompt: CODE_SYS, + userPrompt: + 'A file imports formatDate but never uses it. What should happen to that import? Answer one word.', + check: { kind: 'exact', value: 'remove' }, + }, + { + id: 'supp2-refactor-cleanup-nested-if', + taskType: 'refactoring', + subtaskType: 'code_cleanup', + systemPrompt: CODE_SYS, + userPrompt: + 'Replacing nested if statements with early returns primarily reduces what? Answer one word.', + check: { kind: 'exact', value: 'nesting' }, + }, + { + id: 'supp2-refactor-cleanup-magic-number', + taskType: 'refactoring', + subtaskType: 'code_cleanup', + systemPrompt: CODE_SYS, + userPrompt: + 'The number 86400000 appears repeatedly to mean milliseconds per day. 
What should it become: named constant, random value, or inline comment only? Answer two words.', + check: { kind: 'exact', value: 'named constant' }, + }, + { + id: 'supp2-refactor-cleanup-duplicate-branch', + taskType: 'refactoring', + subtaskType: 'code_cleanup', + systemPrompt: CODE_SYS, + userPrompt: + 'Two switch cases have identical bodies. What refactor can combine them: fallthrough, mutation, or sleep? Answer one word.', + check: { kind: 'exact', value: 'fallthrough' }, + }, + { + id: 'supp2-refactor-arch-adapter', + taskType: 'refactoring', + subtaskType: 'architecture_improvement', + systemPrompt: SYS_SYS, + userPrompt: + 'To isolate provider-specific API calls behind a common interface, what pattern is commonly used? Answer one word.', + check: { kind: 'exact', value: 'adapter' }, + }, + { + id: 'supp2-refactor-arch-pure-core', + taskType: 'refactoring', + subtaskType: 'architecture_improvement', + systemPrompt: SYS_SYS, + userPrompt: + 'Moving business rules out of HTTP handlers into pure functions mainly improves what? Answer one word.', + check: { kind: 'exact', value: 'testability' }, + }, + { + id: 'supp2-refactor-arch-layering', + taskType: 'refactoring', + subtaskType: 'architecture_improvement', + systemPrompt: SYS_SYS, + userPrompt: + 'A router imports a React component to reuse validation logic. Should validation move to shared domain code or stay in the component? Answer three words.', + check: { kind: 'exact', value: 'shared domain code' }, + }, + { + id: 'supp2-refactor-arch-contract-package', + taskType: 'refactoring', + subtaskType: 'architecture_improvement', + systemPrompt: SYS_SYS, + userPrompt: + 'Two services duplicate the same Zod request schema. Where should that schema live: shared contracts package, CSS file, or log line? 
Answer three words.', + check: { kind: 'exact', value: 'shared contracts package' }, + }, + { + id: 'supp2-refactor-migration-add-index', + taskType: 'refactoring', + subtaskType: 'migration', + systemPrompt: SYS_SYS, + userPrompt: + 'A frequent lookup filters by run_id and model. Which database object usually speeds that lookup? Answer one word.', + check: { kind: 'exact', value: 'index' }, + }, + { + id: 'supp2-refactor-migration-nullable-first', + taskType: 'refactoring', + subtaskType: 'migration', + systemPrompt: SYS_SYS, + userPrompt: + 'For a large table, adding a new column before backfilling is usually safer if it starts nullable or non-null with no default? Answer one word.', + check: { kind: 'exact', value: 'nullable' }, + }, + { + id: 'supp2-refactor-migration-drop-column', + taskType: 'refactoring', + subtaskType: 'migration', + systemPrompt: SYS_SYS, + userPrompt: + 'Removing an obsolete database column is which SQL operation: DROP COLUMN, SELECT, or COMMIT? Answer only the operation.', + check: { kind: 'exact', value: 'DROP COLUMN' }, + }, + { + id: 'supp2-refactor-migration-rename-table', + taskType: 'refactoring', + subtaskType: 'migration', + systemPrompt: SYS_SYS, + userPrompt: + 'A migration changes table name old_events to events while preserving rows. What operation is this? Answer two words.', + check: { kind: 'exact', value: 'rename table' }, + }, + { + id: 'supp2-plan-arch-separate-writer', + taskType: 'planning_design', + subtaskType: 'architecture_design', + systemPrompt: SYS_SYS, + userPrompt: + 'If one service should own writes to a shared routing table and others only read, what role does that service have? Answer two words.', + check: { kind: 'exact', value: 'sole writer' }, + }, + { + id: 'supp2-plan-arch-event-queue', + taskType: 'planning_design', + subtaskType: 'architecture_design', + systemPrompt: SYS_SYS, + userPrompt: + 'A user request should return quickly while heavy work continues later. 
Which architecture primitive usually decouples the work? Answer one word.', + check: { kind: 'exact', value: 'queue' }, + }, + { + id: 'supp2-plan-arch-cache-invalidation', + taskType: 'planning_design', + subtaskType: 'architecture_design', + systemPrompt: SYS_SYS, + userPrompt: + 'After publishing a new config, should readers keep the old KV cache forever or invalidate it? Answer two words.', + check: { kind: 'exact', value: 'invalidate it' }, + }, + { + id: 'supp2-plan-arch-idempotent-writes', + taskType: 'planning_design', + subtaskType: 'architecture_design', + systemPrompt: SYS_SYS, + userPrompt: + 'If a queue retries messages, should database writes be idempotent or time-randomized? Answer one word.', + check: { kind: 'exact', value: 'idempotent' }, + }, + { + id: 'supp2-plan-technical-order', + taskType: 'planning_design', + subtaskType: 'technical_planning', + systemPrompt: SYS_SYS, + userPrompt: + 'For a schema-breaking rollout, which should be planned before deploy: migration or celebration? Answer one word.', + check: { kind: 'exact', value: 'migration' }, + }, + { + id: 'supp2-plan-technical-rollback', + taskType: 'planning_design', + subtaskType: 'technical_planning', + systemPrompt: SYS_SYS, + userPrompt: + 'A rollout plan should include how to return to the previous version. What is that called? Answer one word.', + check: { kind: 'exact', value: 'rollback' }, + }, + { + id: 'supp2-plan-technical-verification', + taskType: 'planning_design', + subtaskType: 'technical_planning', + systemPrompt: SYS_SYS, + userPrompt: + 'A plan touches a worker and a web consumer. Should verification include both surfaces or only the worker? 
Answer two words.', + check: { kind: 'exact', value: 'both surfaces' }, + }, + { + id: 'supp2-plan-technical-owner', + taskType: 'planning_design', + subtaskType: 'technical_planning', + systemPrompt: SYS_SYS, + userPrompt: + 'When a launch depends on CI deploy finishing, what should the plan wait for before starting a new benchmark? Answer two words.', + check: { kind: 'exact', value: 'deploy completion' }, + }, + { + id: 'supp2-plan-system-backpressure', + taskType: 'planning_design', + subtaskType: 'system_design', + systemPrompt: SYS_SYS, + userPrompt: + 'Limiting how many jobs run at once to protect downstream capacity is called what? Answer one word.', + check: { kind: 'exact', value: 'backpressure' }, + }, + { + id: 'supp2-invest-repo-find-schema', + taskType: 'investigation', + subtaskType: 'repo_exploration', + systemPrompt: SYS_SYS, + userPrompt: + 'To find where benchmark_runs is defined in a repo, which command should you use first: rg, sleep, or curl? Answer one word.', + check: { kind: 'exact', value: 'rg' }, + }, + { + id: 'supp2-invest-repo-list-files', + taskType: 'investigation', + subtaskType: 'repo_exploration', + systemPrompt: SYS_SYS, + userPrompt: + 'Which command lists tracked and untracked file changes in a git worktree: git status or npm version? Answer two words.', + check: { kind: 'exact', value: 'git status' }, + }, + { + id: 'supp2-invest-repo-find-tests', + taskType: 'investigation', + subtaskType: 'repo_exploration', + systemPrompt: SYS_SYS, + userPrompt: 'Files ending in .test.ts usually contain what? Answer one word.', + check: { kind: 'exact', value: 'tests' }, + }, + { + id: 'supp2-invest-repo-read-config', + taskType: 'investigation', + subtaskType: 'repo_exploration', + systemPrompt: SYS_SYS, + userPrompt: + 'In a Cloudflare Worker service, which config file commonly defines bindings: wrangler.jsonc or tsconfig.tsbuildinfo? 
Answer only the file name.', + check: { kind: 'exact', value: 'wrangler.jsonc' }, + }, + { + id: 'supp2-invest-code-call-chain', + taskType: 'investigation', + subtaskType: 'codebase_understanding', + systemPrompt: SYS_SYS, + userPrompt: + 'Given the call chain handleRequest -> classify -> computeDecision, which function chooses the model? Answer only the function name.', + check: { kind: 'exact', value: 'computeDecision' }, + }, + { + id: 'supp2-invest-code-schema-owner', + taskType: 'investigation', + subtaskType: 'codebase_understanding', + systemPrompt: SYS_SYS, + userPrompt: + 'If RoutingTableSchema parses published artifacts, is it a runtime schema or CSS class? Answer two words.', + check: { kind: 'exact', value: 'runtime schema' }, + }, + { + id: 'supp2-invest-code-field-rename', + taskType: 'investigation', + subtaskType: 'codebase_understanding', + systemPrompt: SYS_SYS, + userPrompt: + 'A database row field route_key maps to API field routeKey. What naming conversion is this: snake to camel, camel to snake, or uppercase? Answer three words.', + check: { kind: 'exact', value: 'snake to camel' }, + }, + { + id: 'supp2-invest-code-consumer', + taskType: 'investigation', + subtaskType: 'codebase_understanding', + systemPrompt: SYS_SYS, + userPrompt: + 'A type change in @kilocode/auto-routing-contracts breaks services/auto-routing and apps/web. What are those packages called relative to the type? Answer one word.', + check: { kind: 'exact', value: 'consumers' }, + }, + { + id: 'supp2-invest-research-primary-source', + taskType: 'investigation', + subtaskType: 'external_research', + systemPrompt: SYS_SYS, + userPrompt: + 'For library API behavior, should you prefer official docs or a random forum answer? 
Answer two words.', + check: { kind: 'exact', value: 'official docs' }, + }, + { + id: 'supp2-invest-research-cross-check', + taskType: 'investigation', + subtaskType: 'external_research', + systemPrompt: SYS_SYS, + userPrompt: + 'If two current sources disagree, should you cross-check or guess? Answer one word.', + check: { kind: 'exact', value: 'cross-check' }, + }, + { + id: 'supp2-invest-research-version', + taskType: 'investigation', + subtaskType: 'external_research', + systemPrompt: SYS_SYS, + userPrompt: + 'When reading framework docs, which detail matters for compatibility: version or logo color? Answer one word.', + check: { kind: 'exact', value: 'version' }, + }, + { + id: 'supp2-invest-research-quote-limit', + taskType: 'investigation', + subtaskType: 'external_research', + systemPrompt: SYS_SYS, + userPrompt: + 'When using a source, should long copyrighted passages be quoted in full or summarized? Answer one word.', + check: { kind: 'exact', value: 'summarized' }, + }, + { + id: 'supp2-agent-tool-sort-file', + taskType: 'agentic_execution', + subtaskType: 'tool_usage', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-sort.txt with lines delta, alpha, charlie. Sort the lines alphabetically and answer with the first line only.', + check: { kind: 'exact', value: 'alpha' }, + }, + { + id: 'supp2-agent-tool-json-length', + taskType: 'agentic_execution', + subtaskType: 'tool_usage', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-items.json containing ["a","b","c","d"]. Read it and answer only the array length.', + check: { kind: 'exact', value: '4' }, + }, + { + id: 'supp2-agent-tool-word-count', + taskType: 'agentic_execution', + subtaskType: 'tool_usage', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-words.txt containing exactly "one two three". 
Count the words and answer only the number.', + check: { kind: 'exact', value: '3' }, + }, + { + id: 'supp2-agent-tool-file-exists', + taskType: 'agentic_execution', + subtaskType: 'tool_usage', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-exists.txt containing ok. Then check that the file exists and answer only yes or no.', + check: { kind: 'exact', value: 'yes' }, + }, + { + id: 'supp2-agent-term-node-json', + taskType: 'agentic_execution', + subtaskType: 'terminal_operations', + systemPrompt: AGENT_SYS, + userPrompt: + 'Run node -e "console.log(JSON.stringify([1,2,3].reduce((a,b)=>a+b,0)))" in the terminal and answer with only the output.', + check: { kind: 'exact', value: '6' }, + }, + { + id: 'supp2-agent-term-printf', + taskType: 'agentic_execution', + subtaskType: 'terminal_operations', + systemPrompt: AGENT_SYS, + userPrompt: 'Run printf kilo in the terminal and answer with only the output.', + check: { kind: 'exact', value: 'kilo' }, + }, + { + id: 'supp2-agent-term-sort', + taskType: 'agentic_execution', + subtaskType: 'terminal_operations', + systemPrompt: AGENT_SYS, + userPrompt: + 'Run a shell command that sorts the words "zeta alpha" alphabetically one per line. Answer with only the first sorted word.', + check: { kind: 'exact', value: 'alpha' }, + }, + { + id: 'supp2-agent-term-expr', + taskType: 'agentic_execution', + subtaskType: 'terminal_operations', + systemPrompt: AGENT_SYS, + userPrompt: 'Run a terminal calculation for 9 + 8 + 7 and answer with only the result.', + check: { kind: 'exact', value: '24' }, + }, + { + id: 'supp2-agent-multi-generate-run', + taskType: 'agentic_execution', + subtaskType: 'multi_step_execution', + systemPrompt: AGENT_SYS, + userPrompt: + 'Write /tmp/bench-sum.js that prints 14 + 28. 
Run it with node and answer with only what it prints.', + check: { kind: 'exact', value: '42' }, + }, + { + id: 'supp2-agent-multi-read-transform', + taskType: 'agentic_execution', + subtaskType: 'multi_step_execution', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-name.txt containing kilo. Read it, uppercase it, and answer only the uppercase text.', + check: { kind: 'exact', value: 'KILO' }, + }, + { + id: 'supp2-agent-multi-two-files-join', + taskType: 'agentic_execution', + subtaskType: 'multi_step_execution', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-left.txt containing auto and /tmp/bench-right.txt containing route. Read both and answer with the two words joined by a hyphen.', + check: { kind: 'exact', value: 'auto-route' }, + }, + { + id: 'supp2-agent-multi-json-sum', + taskType: 'agentic_execution', + subtaskType: 'multi_step_execution', + systemPrompt: AGENT_SYS, + userPrompt: + 'Create /tmp/bench-numbers.json containing [5,10,15]. Read it, sum the numbers, and answer only the sum.', + check: { kind: 'exact', value: '30' }, + }, ]; diff --git a/services/auto-routing-benchmark/src/db-replace-summaries.test.ts b/services/auto-routing-benchmark/src/db-replace-summaries.test.ts index 16b81c8212..d77974387a 100644 --- a/services/auto-routing-benchmark/src/db-replace-summaries.test.ts +++ b/services/auto-routing-benchmark/src/db-replace-summaries.test.ts @@ -25,7 +25,7 @@ import { replaceModelSummaries } from './db'; function makeSummary(model: string): BenchmarkModelSummary { return { model, - tier: '*', + routeKey: '*', accuracy: 0.9, avgCostUsd: 0.001, avgLatencyMs: 100, diff --git a/services/auto-routing-benchmark/src/db-save-routing-table.test.ts b/services/auto-routing-benchmark/src/db-save-routing-table.test.ts new file mode 100644 index 0000000000..7cbc1048d4 --- /dev/null +++ b/services/auto-routing-benchmark/src/db-save-routing-table.test.ts @@ -0,0 +1,66 @@ +import { describe, expect, it, vi } from 'vitest'; +import 
type { RankedCandidate, RoutingTable } from '@kilocode/auto-routing-contracts'; + +const mockState = vi.hoisted(() => ({ + batchCalls: [] as Array>, +})); + +vi.mock('drizzle-orm/d1', () => ({ + drizzle: vi.fn(() => ({ + delete: vi.fn(() => ({ + where: vi.fn(() => ({ kind: 'delete' })), + })), + insert: vi.fn(() => ({ + values: vi.fn((values: unknown) => ({ + kind: 'insert', + values, + onConflictDoUpdate: vi.fn(() => ({ kind: 'upsert', values })), + })), + })), + batch: vi.fn(async (stmts: Array<{ kind: string; values?: unknown }>) => { + mockState.batchCalls.push(stmts); + }), + })), +})); + +const candidate = (model: string): RankedCandidate => ({ + model, + accuracy: 0.9, + avgCostUsd: 0.001, + meetsThreshold: true, + reasoningEffort: null, +}); + +describe('saveRoutingTable', () => { + it('chunks routing candidate inserts to stay under D1 variable limits', async () => { + const { saveRoutingTable } = await import('./db'); + + const table: RoutingTable = { + version: 'run-large-routing-table', + generatedAt: '2026-06-16T18:00:00.000Z', + minAccuracy: 0.7, + switchCostFactor: 3, + source: 'benchmark', + routes: { + 'implementation/code_generation': Array.from({ length: 23 }, (_, index) => + candidate(`impl-model-${index}`) + ), + 'debugging/bug_fixing': [candidate('debug-model')], + 'planning_design/system_design': [candidate('plan-model')], + }, + }; + + await saveRoutingTable({} as D1Database, table, '2026-06-16T18:01:00.000Z'); + + const [batch] = mockState.batchCalls; + expect(batch).toBeDefined(); + const candidateInsertSizes = batch + .filter(stmt => stmt.kind === 'insert') + .map(stmt => { + expect(Array.isArray(stmt.values)).toBe(true); + return (stmt.values as unknown[]).length; + }); + + expect(candidateInsertSizes).toEqual([10, 10, 5]); + }); +}); diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts index 2a4c88035c..c241939a89 100644 --- a/services/auto-routing-benchmark/src/db-schema.ts +++ 
b/services/auto-routing-benchmark/src/db-schema.ts @@ -77,7 +77,7 @@ export const modelSummaries = sqliteTable( { run_id: text('run_id').notNull(), model: text('model').notNull(), - tier: text('tier').notNull(), + route_key: text('route_key').notNull(), accuracy: real('accuracy').notNull(), avg_cost_usd: real('avg_cost_usd'), avg_latency_ms: real('avg_latency_ms').notNull(), @@ -89,7 +89,7 @@ export const modelSummaries = sqliteTable( // carried=true rows are prior-run summaries copied in at startRun for skipped models. carried: integer('carried', { mode: 'boolean' }).notNull().default(false), }, - table => [primaryKey({ columns: [table.run_id, table.model, table.tier] })] + table => [primaryKey({ columns: [table.run_id, table.model, table.route_key] })] ); export const caseResults = sqliteTable( @@ -98,7 +98,7 @@ export const caseResults = sqliteTable( run_id: text('run_id').notNull(), model: text('model').notNull(), case_id: text('case_id').notNull(), - tier: text('tier'), + route_key: text('route_key'), score: real('score').notNull(), latency_ms: integer('latency_ms').notNull(), cost_usd: real('cost_usd'), @@ -134,7 +134,7 @@ export const routingTableCandidates = sqliteTable( 'routing_table_candidates', { run_id: text('run_id').notNull(), - tier: text('tier').notNull(), + route_key: text('route_key').notNull(), rank: integer('rank').notNull(), model: text('model').notNull(), accuracy: real('accuracy').notNull(), @@ -145,5 +145,5 @@ export const routingTableCandidates = sqliteTable( meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(), reasoning_effort: text('reasoning_effort'), }, - table => [primaryKey({ columns: [table.run_id, table.tier, table.rank] })] + table => [primaryKey({ columns: [table.run_id, table.route_key, table.rank] })] ); diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts index 103482e00d..5ba9b0b853 100644 --- a/services/auto-routing-benchmark/src/db.test.ts +++ 
b/services/auto-routing-benchmark/src/db.test.ts @@ -13,7 +13,7 @@ describe('mapSummaryRow', () => { const row = { run_id: 'run-1', model: 'openai/gpt-4o', - tier: 'high', + route_key: 'implementation/code_generation', accuracy: 0.92, avg_cost_usd: 0.0015, avg_latency_ms: 320.5, @@ -27,7 +27,7 @@ describe('mapSummaryRow', () => { const result = mapSummaryRow(row); expect(result).toEqual({ model: 'openai/gpt-4o', - tier: 'high', + routeKey: 'implementation/code_generation', accuracy: 0.92, avgCostUsd: 0.0015, avgLatencyMs: 320.5, @@ -43,7 +43,7 @@ describe('mapSummaryRow', () => { const row = { run_id: 'run-2', model: 'anthropic/claude-3-haiku', - tier: '*', + route_key: '*', accuracy: 0.85, avg_cost_usd: null, avg_latency_ms: 150.0, @@ -58,7 +58,7 @@ describe('mapSummaryRow', () => { expect(result.avgCostUsd).toBeNull(); expect(result.p50LatencyMs).toBeNull(); expect(result.p95LatencyMs).toBeNull(); - expect(result.tier).toBe('*'); + expect(result.routeKey).toBe('*'); expect(result.errors).toBe(0); expect(result.timeouts).toBe(0); }); @@ -88,7 +88,7 @@ describe('mapRunRow', () => { const summaries: BenchmarkModelSummary[] = [ { model: 'openai/gpt-4o-mini', - tier: '*', + routeKey: '*', accuracy: 0.78, avgCostUsd: 0.0002, avgLatencyMs: 120, @@ -150,10 +150,9 @@ const sampleTable: RoutingTable = { minAccuracy: 0.7, switchCostFactor: 3, source: 'benchmark', - tiers: { - low: [candidate('model-a'), candidate('model-b')], - medium: [candidate('model-c')], - high: [candidate('model-a')], + routes: { + 'implementation/code_generation': [candidate('model-a'), candidate('model-b')], + 'debugging/bug_fixing': [candidate('model-c')], }, }; @@ -168,14 +167,16 @@ describe('routingTableToRows', () => { expect(tableRow.source).toBe('benchmark'); }); - it('assigns rank 0,1 for the two low-tier candidates', () => { + it('assigns rank 0,1 for the two implementation/code_generation candidates', () => { const { candidateRows } = routingTableToRows(sampleTable, 
'2026-06-01T11:00:00.000Z'); - const lowRows = candidateRows.filter(r => r.tier === 'low').sort((a, b) => a.rank - b.rank); - expect(lowRows).toHaveLength(2); - expect(lowRows[0].model).toBe('model-a'); - expect(lowRows[0].rank).toBe(0); - expect(lowRows[1].model).toBe('model-b'); - expect(lowRows[1].rank).toBe(1); + const routeRows = candidateRows + .filter(r => r.route_key === 'implementation/code_generation') + .sort((a, b) => a.rank - b.rank); + expect(routeRows).toHaveLength(2); + expect(routeRows[0].model).toBe('model-a'); + expect(routeRows[0].rank).toBe(0); + expect(routeRows[1].model).toBe('model-b'); + expect(routeRows[1].rank).toBe(1); }); }); @@ -188,12 +189,12 @@ describe('rowsToRoutingTable', () => { expect(RoutingTableSchema.parse(reassembled)).toEqual(sampleTable); }); - it('preserves candidate order within each tier', () => { + it('preserves candidate order within each route', () => { const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z'); // Shuffle candidateRows to verify rank-based sorting. const shuffled = [...candidateRows].reverse(); const reassembled = rowsToRoutingTable(tableRow, shuffled); - expect(reassembled.tiers.low[0].model).toBe('model-a'); - expect(reassembled.tiers.low[1].model).toBe('model-b'); + expect(reassembled.routes['implementation/code_generation']?.[0]?.model).toBe('model-a'); + expect(reassembled.routes['implementation/code_generation']?.[1]?.model).toBe('model-b'); }); }); diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts index 8ed87649fa..744adb57f5 100644 --- a/services/auto-routing-benchmark/src/db.ts +++ b/services/auto-routing-benchmark/src/db.ts @@ -34,6 +34,11 @@ type ModelSummaryRow = typeof modelSummaries.$inferSelect; // ceiling while still batching the delete plus inserts together. const MODEL_SUMMARY_INSERT_BATCH_SIZE = 8; +// Routing table candidates bind 8 values per row. 
Keep each INSERT comfortably +// under D1's 100-variable ceiling; publishing is infrequent, so smaller +// statements are preferable to risking a skipped routing-table update. +const ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE = 10; + // --------------------------------------------------------------------------- // Row mapping helpers // --------------------------------------------------------------------------- @@ -41,7 +46,7 @@ const MODEL_SUMMARY_INSERT_BATCH_SIZE = 8; export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary { return { model: row.model, - tier: row.tier as BenchmarkModelSummary['tier'], + routeKey: row.route_key as BenchmarkModelSummary['routeKey'], accuracy: row.accuracy, avgCostUsd: row.avg_cost_usd, avgLatencyMs: row.avg_latency_ms, @@ -179,7 +184,7 @@ export async function insertRun( carriedSummaries.map(s => ({ run_id: run.id, model: s.model, - tier: s.tier, + route_key: s.routeKey, accuracy: s.accuracy, avg_cost_usd: s.avgCostUsd, avg_latency_ms: s.avgLatencyMs, @@ -221,7 +226,7 @@ export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Prom .onConflictDoUpdate({ target: [caseResults.run_id, caseResults.model, caseResults.case_id, caseResults.rep], set: { - tier: row.tier, + route_key: row.route_key, score: row.score, latency_ms: row.latency_ms, cost_usd: row.cost_usd, @@ -251,6 +256,25 @@ export async function getCaseResults(db: D1Database, runId: string): Promise> { + if (params.caseIds.length === 0) return new Set(); + const rows = await drizzle(db) + .select({ case_id: caseResults.case_id }) + .from(caseResults) + .where( + and( + eq(caseResults.run_id, params.runId), + eq(caseResults.model, params.model), + eq(caseResults.rep, params.rep), + inArray(caseResults.case_id, params.caseIds) + ) + ); + return new Set(rows.map(row => row.case_id)); +} + // --------------------------------------------------------------------------- // Model summaries // 
--------------------------------------------------------------------------- @@ -279,7 +303,7 @@ export async function replaceModelSummaries( summaryChunk.map(s => ({ run_id: runId, model: s.model, - tier: s.tier, + route_key: s.routeKey, accuracy: s.accuracy, avg_cost_usd: s.avgCostUsd, avg_latency_ms: s.avgLatencyMs, @@ -415,8 +439,8 @@ export type PriorModelResult = { summaries: BenchmarkModelSummary[]; }; -// Latest summaries per model for a benchmark kind: for each model, all tiers -// from the most recent COMPLETED run that included it (mixing tiers across +// Latest summaries per model for a benchmark kind: for each model, all routes +// from the most recent COMPLETED run that included it (mixing routes across // runs would pair incomparable numbers). export async function getLatestSummariesByModel( db: D1Database, @@ -426,7 +450,7 @@ export async function getLatestSummariesByModel( .select({ run_id: modelSummaries.run_id, model: modelSummaries.model, - tier: modelSummaries.tier, + route_key: modelSummaries.route_key, accuracy: modelSummaries.accuracy, avg_cost_usd: modelSummaries.avg_cost_usd, avg_latency_ms: modelSummaries.avg_latency_ms, @@ -492,11 +516,11 @@ export function routingTableToRows( }; const candidateRows: RoutingTableCandidateRow[] = []; - for (const [tier, candidates] of Object.entries(table.tiers)) { + for (const [routeKey, candidates] of Object.entries(table.routes)) { candidates.forEach((c, rank) => { candidateRows.push({ run_id: table.version, - tier, + route_key: routeKey, rank, model: c.model, accuracy: c.accuracy, @@ -514,14 +538,14 @@ export function rowsToRoutingTable( tableRow: RoutingTableRow, candidateRows: RoutingTableCandidateRow[] ): RoutingTable { - const tierMap: Record = { low: [], medium: [], high: [] }; + const routeMap: Record = {}; const sorted = [...candidateRows].sort((a, b) => { - if (a.tier !== b.tier) return a.tier.localeCompare(b.tier); + if (a.route_key !== b.route_key) return 
a.route_key.localeCompare(b.route_key); return a.rank - b.rank; }); for (const row of sorted) { - if (!(row.tier in tierMap)) tierMap[row.tier] = []; - tierMap[row.tier].push({ + routeMap[row.route_key] ??= []; + routeMap[row.route_key].push({ model: row.model, accuracy: row.accuracy, avgCostUsd: row.avg_cost_usd, @@ -535,11 +559,7 @@ export function rowsToRoutingTable( minAccuracy: tableRow.min_accuracy, switchCostFactor: tableRow.switch_cost_factor, source: tableRow.source as RoutingTable['source'], - tiers: { - low: tierMap.low ?? [], - medium: tierMap.medium ?? [], - high: tierMap.high ?? [], - }, + routes: routeMap, }; } @@ -568,8 +588,12 @@ export async function saveRoutingTable( }), ]; - if (candidateRows.length > 0) { - stmts.push(orm.insert(routingTableCandidates).values(candidateRows)); + for (let i = 0; i < candidateRows.length; i += ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE) { + stmts.push( + orm + .insert(routingTableCandidates) + .values(candidateRows.slice(i, i + ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE)) + ); } await orm.batch(stmts); @@ -592,7 +616,7 @@ export async function getLatestRoutingTable( .select() .from(routingTableCandidates) .where(eq(routingTableCandidates.run_id, tableRow.run_id)) - .orderBy(routingTableCandidates.tier, routingTableCandidates.rank); + .orderBy(routingTableCandidates.route_key, routingTableCandidates.rank); const assembled = rowsToRoutingTable(tableRow, candidateRows); const parsed = RoutingTableSchema.safeParse(assembled); @@ -627,11 +651,11 @@ export async function getClassifierWinner(db: D1Database): Promise> = {} +): BenchmarkModelSummary[] { + return TAXONOMY_ROUTE_KEYS.flatMap( + routeKey => + overrides[routeKey] ?? 
[ + summary('model/cheap', routeKey, 0.7, 0.007), + summary('model/value', routeKey, 0.9, 0.008), + summary('model/weak', routeKey, 0.5, 0.001), + ] + ); +} describe('buildRoutingTable', () => { - it('cheapest above-threshold model comes first per tier', () => { + it('ranks candidates by lowest cost per accuracy for each taxonomy route', () => { const table = buildRoutingTable({ runId: 'test-run-1', generatedAt: '2026-01-01T00:00:00.000Z', minAccuracy: 0.7, switchCostFactor: 3, deciderModels: DECIDER_MODELS, - summaries: ALL_TIERS_SUMMARIES, + summaries: summariesForEveryRoute(), }); - // low tier: cheap (0.001) and mid (0.005) and expensive (0.01) all meet threshold (0.7) - // cheapest first - expect(table.tiers.low[0].model).toBe('model/cheap'); - expect(table.tiers.low[1].model).toBe('model/mid'); - expect(table.tiers.low[2].model).toBe('model/expensive'); - - // medium tier: all meet threshold, cheapest first - expect(table.tiers.medium[0].model).toBe('model/cheap'); - expect(table.tiers.medium[1].model).toBe('model/mid'); - expect(table.tiers.medium[2].model).toBe('model/expensive'); - - // high tier: expensive (0.9) and mid (0.75) meet threshold; cheap (0.6) does not - // meeting threshold first, then by cost; cheap last (below threshold) - expect(table.tiers.high[0].model).toBe('model/mid'); // meets threshold, cheaper - expect(table.tiers.high[1].model).toBe('model/expensive'); // meets threshold, more expensive - expect(table.tiers.high[2].model).toBe('model/cheap'); // below threshold + expect(table.routes['implementation/code_generation']?.map(c => c.model)).toEqual([ + 'model/value', + 'model/cheap', + 'model/weak', + ]); }); - it('excludes a model whose tier summary has no cost signal', () => { + it('excludes a model whose route summary has no cost signal', () => { + const routeKey = 'implementation/code_generation'; const table = buildRoutingTable({ runId: 'test-run-nocost', generatedAt: '2026-01-01T00:00:00.000Z', minAccuracy: 0.7, switchCostFactor: 
3, deciderModels: DECIDER_MODELS, - summaries: ALL_TIERS_SUMMARIES.map(s => - s.model === 'model/cheap' && s.tier === 'low' ? { ...s, avgCostUsd: null } : s - ), - }); - - // model/cheap would have won 'low' as cheapest; without a cost signal it - // must not be ranked (unknown cost is not zero cost). - expect(table.tiers.low.map(c => c.model)).toEqual(['model/mid', 'model/expensive']); - }); - - it('marks meetsThreshold correctly', () => { - const table = buildRoutingTable({ - runId: 'test-run-2', - generatedAt: '2026-01-01T00:00:00.000Z', - minAccuracy: 0.7, - switchCostFactor: 3, - deciderModels: DECIDER_MODELS, - summaries: ALL_TIERS_SUMMARIES, + summaries: summariesForEveryRoute({ + [routeKey]: [ + summary('model/cheap', routeKey, 0.7, null), + summary('model/value', routeKey, 0.9, 0.008), + ], + }), }); - for (const candidate of table.tiers.low) { - expect(candidate.meetsThreshold).toBe(candidate.accuracy >= 0.7); - } - }); - - it('excludes a model absent from a tier summaries', () => { - // model/cheap has no 'high' summary entry - const summaries: BenchmarkModelSummary[] = [ - summary('model/cheap', 'low', 0.9), - summary('model/cheap', 'medium', 0.8), - // no 'high' entry for model/cheap - summary('model/expensive', 'low', 0.9), - summary('model/expensive', 'medium', 0.8), - summary('model/expensive', 'high', 0.9), - summary('model/mid', 'low', 0.8), - summary('model/mid', 'medium', 0.75), - summary('model/mid', 'high', 0.75), - ]; - - const table = buildRoutingTable({ - runId: 'test-run-3', - generatedAt: '2026-01-01T00:00:00.000Z', - minAccuracy: 0.7, - switchCostFactor: 3, - deciderModels: DECIDER_MODELS, - summaries, - }); - - const highModels = table.tiers.high.map(c => c.model); - expect(highModels).not.toContain('model/cheap'); - expect(highModels).toContain('model/expensive'); - expect(highModels).toContain('model/mid'); + expect(table.routes[routeKey]?.map(c => c.model)).toEqual(['model/value']); }); it('carries reasoningEffort from the run 
snapshot', () => { @@ -140,119 +90,43 @@ describe('buildRoutingTable', () => { minAccuracy: 0.7, switchCostFactor: 3, deciderModels: DECIDER_MODELS, - summaries: ALL_TIERS_SUMMARIES, - }); - - const expensiveInLow = table.tiers.low.find(c => c.model === 'model/expensive'); - expect(expensiveInLow?.reasoningEffort).toBe('medium'); - - const midInLow = table.tiers.low.find(c => c.model === 'model/mid'); - expect(midInLow?.reasoningEffort).toBeNull(); - }); - - it('defaults reasoningEffort to null when model missing from the snapshot', () => { - const summaries: BenchmarkModelSummary[] = [ - summary('model/unknown', 'low', 0.9), - summary('model/cheap', 'low', 0.8), - summary('model/cheap', 'medium', 0.8), - summary('model/cheap', 'high', 0.8), - summary('model/unknown', 'medium', 0.9), - summary('model/unknown', 'high', 0.9), - ]; - - const table = buildRoutingTable({ - runId: 'test-run-5', - generatedAt: '2026-01-01T00:00:00.000Z', - minAccuracy: 0.7, - switchCostFactor: 3, - deciderModels: DECIDER_MODELS, - summaries, + summaries: summariesForEveryRoute(), }); - const unknown = table.tiers.low.find(c => c.model === 'model/unknown'); - expect(unknown?.reasoningEffort).toBeNull(); - }); - - it('throws when a tier has no candidates', () => { - // Only low and medium summaries — high is missing entirely - const summaries: BenchmarkModelSummary[] = [ - summary('model/cheap', 'low', 0.9), - summary('model/expensive', 'low', 0.9), - summary('model/mid', 'low', 0.9), - summary('model/cheap', 'medium', 0.9), - summary('model/expensive', 'medium', 0.9), - summary('model/mid', 'medium', 0.9), - ]; + const value = table.routes['implementation/code_generation']?.find( + c => c.model === 'model/value' + ); + expect(value?.reasoningEffort).toBe('medium'); - expect(() => - buildRoutingTable({ - runId: 'test-run-6', - generatedAt: '2026-01-01T00:00:00.000Z', - minAccuracy: 0.7, - switchCostFactor: 3, - deciderModels: DECIDER_MODELS, - summaries, - }) - ).toThrow(); + const cheap = 
table.routes['implementation/code_generation']?.find( + c => c.model === 'model/cheap' + ); + expect(cheap?.reasoningEffort).toBeNull(); }); - it('throws when a tier has only zero-case entries', () => { - const summaries: BenchmarkModelSummary[] = [ - ...ALL_TIERS_SUMMARIES.filter(s => s.tier !== 'high'), - // high tier entries with 0 cases — should be excluded - { ...summary('model/cheap', 'high', 0.9), cases: 0 }, - { ...summary('model/expensive', 'high', 0.9), cases: 0 }, - { ...summary('model/mid', 'high', 0.9), cases: 0 }, - ]; - + it('throws when any taxonomy route has no candidates', () => { expect(() => buildRoutingTable({ - runId: 'test-run-7', + runId: 'test-run-missing-route', generatedAt: '2026-01-01T00:00:00.000Z', minAccuracy: 0.7, switchCostFactor: 3, deciderModels: DECIDER_MODELS, - summaries, + summaries: summariesForEveryRoute({ 'implementation/code_generation': [] }), }) ).toThrow(); }); - it('ignores classifier-style * tier summaries', () => { - const summaries: BenchmarkModelSummary[] = [ - ...ALL_TIERS_SUMMARIES, - // classifier summaries with '*' tier — should be ignored - summary('model/cheap', '*', 0.95), - summary('model/expensive', '*', 0.95), - ]; - - // Should not throw and * tier entries should not affect output + it('ignores classifier-style * route summaries', () => { const table = buildRoutingTable({ - runId: 'test-run-8', + runId: 'test-run-classifier-summary', generatedAt: '2026-01-01T00:00:00.000Z', minAccuracy: 0.7, switchCostFactor: 3, deciderModels: DECIDER_MODELS, - summaries, - }); - - expect(table.tiers.low.length).toBe(3); - expect(table.tiers.medium.length).toBe(3); - }); - - it('sets version and generatedAt from params', () => { - const table = buildRoutingTable({ - runId: 'decider-2026-01-01', - generatedAt: '2026-01-01T12:00:00.000Z', - minAccuracy: 0.7, - switchCostFactor: 3, - deciderModels: DECIDER_MODELS, - summaries: ALL_TIERS_SUMMARIES, + summaries: [...summariesForEveryRoute(), summary('model/value', '*', 1, 
0.0001)], }); - expect(table.version).toBe('decider-2026-01-01'); - expect(table.generatedAt).toBe('2026-01-01T12:00:00.000Z'); - expect(table.source).toBe('benchmark'); - expect(table.minAccuracy).toBe(0.7); - expect(table.switchCostFactor).toBe(3); + expect(table.routes['implementation/code_generation']).toHaveLength(3); }); }); diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts index 222f19436f..27e09a177b 100644 --- a/services/auto-routing-benchmark/src/routing-table-builder.ts +++ b/services/auto-routing-benchmark/src/routing-table-builder.ts @@ -1,17 +1,18 @@ import { rankCandidates, RoutingTableSchema, + TAXONOMY_ROUTE_KEYS, type BenchmarkDeciderModel, type BenchmarkModelSummary, - type DifficultyTier, type RoutingTable, + type TaxonomyRouteKey, } from '@kilocode/auto-routing-contracts'; -// Builds the routing table from per-(model, tier) decider summaries. Models -// with zero graded cases in a tier are excluded from that tier, as are +// Builds the routing table from per-(model, taxonomy-route) decider summaries. Models +// with zero graded cases in a route are excluded from that route, as are // models with no cost signal at all (avgCostUsd null means every case failed -// to report cost; ranking such a model as cheapest would hand it the tier). -// Throws when any tier ends up empty so the caller keeps the previous +// to report cost; ranking such a model as cheapest would hand it the route). +// Throws when any route ends up empty so the caller keeps the previous // published table. deciderModels/minAccuracy/switchCostFactor come from the // run's snapshot, not live config. 
export function buildRoutingTable(params: { @@ -25,10 +26,10 @@ export function buildRoutingTable(params: { const { runId, generatedAt, minAccuracy, switchCostFactor, deciderModels, summaries } = params; const modelConfigById = new Map(deciderModels.map(m => [m.id, m] as const)); - const tierCandidates = (t: DifficultyTier) => + const routeCandidates = (routeKey: TaxonomyRouteKey) => rankCandidates( summaries - .filter(s => s.tier === t && s.cases > 0 && s.avgCostUsd !== null) + .filter(s => s.routeKey === routeKey && s.cases > 0 && s.avgCostUsd !== null) .map(s => ({ model: s.model, accuracy: s.accuracy, @@ -38,21 +39,21 @@ export function buildRoutingTable(params: { minAccuracy ); + const routes = Object.fromEntries( + TAXONOMY_ROUTE_KEYS.map(routeKey => [routeKey, routeCandidates(routeKey)] as const) + ); + const table: RoutingTable = { version: runId, generatedAt, minAccuracy, switchCostFactor, source: 'benchmark', - tiers: { - low: tierCandidates('low'), - medium: tierCandidates('medium'), - high: tierCandidates('high'), - }, + routes, }; - // RoutingTableSchema enforces .min(1) on each tier array; throws ZodError - // when a tier is empty — caller logs and skips publish, keeping the previous + // RoutingTableSchema enforces .min(1) on each route array; throws ZodError + // when a route is empty — caller logs and skips publish, keeping the previous // live table intact. 
return RoutingTableSchema.parse(table); } diff --git a/services/auto-routing-benchmark/src/run-process-job.test.ts b/services/auto-routing-benchmark/src/run-process-job.test.ts new file mode 100644 index 0000000000..955820cc92 --- /dev/null +++ b/services/auto-routing-benchmark/src/run-process-job.test.ts @@ -0,0 +1,302 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import type * as CliRunnerModule from './cli-runner'; +import type * as DbModule from './db'; +import { DECIDER_CASES } from './datasets/decider-cases'; + +vi.mock('./db', async importOriginal => { + const actual = await importOriginal(); + return { + ...actual, + countCaseResults: vi.fn(), + existsNewerCompletedRun: vi.fn(), + getCaseResults: vi.fn(), + getExistingCaseResultIds: vi.fn(), + getRunWithModels: vi.fn(), + getSummaries: vi.fn(), + markRunCompleted: vi.fn(), + replaceModelSummaries: vi.fn(), + saveRoutingTable: vi.fn(), + upsertCaseResult: vi.fn(), + }; +}); + +vi.mock('./cli-runner', async importOriginal => { + const actual = await importOriginal(); + return { + ...actual, + destroyDeciderCliContainer: vi.fn(), + runDeciderCaseViaCli: vi.fn(), + warmUpCliContainer: vi.fn(), + }; +}); + +import { + destroyDeciderCliContainer, + runDeciderCaseViaCli, + warmUpCliContainer, + type CliRunResult, +} from './cli-runner'; +import { + countCaseResults, + getExistingCaseResultIds, + getRunWithModels, + upsertCaseResult, +} from './db'; +import { processJob } from './run'; + +const tokenGet = vi.fn<() => Promise>(); +const queueSendBatch = vi.fn<(messages: unknown[]) => Promise>(); +const model = 'qwen/qwen3-coder-next'; +const runId = 'decider-test-run'; +const [benchCase] = DECIDER_CASES; + +const successfulCliResult = { + text: 'not the expected answer', + costUsd: null, + latencyMs: 25, + exitCode: 0, + stderrTail: '', + eventCount: 1, + lastEventTypes: ['session.created'], + timedOut: false, +} satisfies CliRunResult; + +const env = { + INTERNAL_API_SECRET_PROD: { 
get: tokenGet }, + BENCH_DB: {} as D1Database, + BENCH_QUEUE: { sendBatch: queueSendBatch }, + AUTO_ROUTING_CONFIG: { delete: vi.fn() }, +} as unknown as Env; + +function mockRunSnapshot(): void { + vi.mocked(getRunWithModels).mockResolvedValue({ + run: { + max_concurrency: 4, + min_accuracy: 0.7, + switch_cost_factor: 3, + benchmark_user_id: 'benchmark-user', + repetitions: 1, + classifier_max_p95_latency_ms: null, + started_at: '2026-06-16T00:00:00.000Z', + }, + models: [{ model, enqueued: true, reasoning_effort: null }], + } as never); +} + +function deciderMessage() { + return { + runId, + kind: 'decider', + model, + caseIds: [benchCase.id], + chunk: 0, + rep: 0, + }; +} + +beforeEach(() => { + vi.clearAllMocks(); + tokenGet.mockResolvedValue('internal-secret'); + queueSendBatch.mockResolvedValue(undefined); + vi.stubGlobal( + 'fetch', + vi.fn(async () => + Response.json({ token: 'kilo-user-token', expiresAt: '2026-06-16T01:00:00.000Z' }) + ) + ); + mockRunSnapshot(); + vi.mocked(countCaseResults).mockResolvedValue(0); + vi.mocked(getExistingCaseResultIds).mockResolvedValue(new Set()); + vi.mocked(destroyDeciderCliContainer).mockResolvedValue(undefined); + vi.mocked(warmUpCliContainer).mockResolvedValue(undefined); + vi.mocked(runDeciderCaseViaCli).mockResolvedValue(successfulCliResult); +}); + +afterEach(() => { + vi.unstubAllGlobals(); +}); + +describe('processJob — decider container availability failures', () => { + it.each([ + 'container /run failed: HTTP 503 There is no Container instance available at this time. 
This is likely because you have reached your max concurrent instance count.', + 'container /run failed: HTTP 503 Maximum number of running container instances exceeded', + 'container /run failed: HTTP 503 There is no container instance that can be provided to this Durable Object, try again later', + ])('lets the queue retry %s', async message => { + vi.mocked(runDeciderCaseViaCli).mockRejectedValueOnce(new Error(message)); + + await expect(processJob(env, deciderMessage())).rejects.toThrow(message); + + expect(upsertCaseResult).not.toHaveBeenCalled(); + expect(countCaseResults).not.toHaveBeenCalled(); + }); + + it('lets the queue retry warmup capacity failures before running cases', async () => { + const message = + 'container /warmup failed: HTTP 503 There is no Container instance available at this time'; + vi.mocked(warmUpCliContainer).mockRejectedValueOnce(new Error(message)); + + await expect(processJob(env, deciderMessage())).rejects.toThrow(message); + + expect(runDeciderCaseViaCli).not.toHaveBeenCalled(); + expect(upsertCaseResult).not.toHaveBeenCalled(); + expect(countCaseResults).not.toHaveBeenCalled(); + }); +}); + +describe('processJob — decider chunk chaining', () => { + it('runs a chunk on the model-repetition shard container and enqueues the next chunk', async () => { + const message = { + ...deciderMessage(), + caseIds: DECIDER_CASES.slice(0, 5).map(c => c.id), + }; + + await processJob(env, message); + + expect(warmUpCliContainer).toHaveBeenCalledWith( + env, + expect.objectContaining({ instanceName: `${runId}:${model}:0:0` }) + ); + expect(runDeciderCaseViaCli).toHaveBeenCalledWith( + env, + expect.objectContaining({ instanceName: `${runId}:${model}:0:0` }) + ); + expect(queueSendBatch).toHaveBeenCalledWith([ + { + body: { + runId, + kind: 'decider', + model, + chunk: 1, + shard: 0, + shardCount: 1, + rep: 0, + caseIds: DECIDER_CASES.slice(5, 10).map(c => c.id), + }, + }, + ]); + expect(countCaseResults).not.toHaveBeenCalled(); + }); + + 
it('enqueues the next chunk assigned to the same shard lane', async () => { + const chunk = 2; + const shard = 2; + const shardCount = 8; + const currentCaseIds = DECIDER_CASES.slice(chunk * 5, chunk * 5 + 5).map(c => c.id); + const nextChunk = chunk + shardCount; + const nextCaseIds = DECIDER_CASES.slice(nextChunk * 5, nextChunk * 5 + 5).map(c => c.id); + + await processJob(env, { + ...deciderMessage(), + chunk, + shard, + shardCount, + caseIds: currentCaseIds, + }); + + expect(warmUpCliContainer).toHaveBeenCalledWith( + env, + expect.objectContaining({ instanceName: `${runId}:${model}:0:2` }) + ); + expect(queueSendBatch).toHaveBeenCalledWith([ + { + body: { + runId, + kind: 'decider', + model, + chunk: nextChunk, + shard, + shardCount, + rep: 0, + caseIds: nextCaseIds, + }, + }, + ]); + expect(countCaseResults).not.toHaveBeenCalled(); + }); + + it('does not rerun completed chunk cases or enqueue a fully completed next chunk', async () => { + const currentCaseIds = DECIDER_CASES.slice(0, 5).map(c => c.id); + const nextCaseIds = DECIDER_CASES.slice(5, 10).map(c => c.id); + vi.mocked(getExistingCaseResultIds) + .mockResolvedValueOnce(new Set(currentCaseIds)) + .mockResolvedValueOnce(new Set(nextCaseIds)); + + await processJob(env, { ...deciderMessage(), caseIds: currentCaseIds }); + + expect(warmUpCliContainer).not.toHaveBeenCalled(); + expect(runDeciderCaseViaCli).not.toHaveBeenCalled(); + expect(upsertCaseResult).not.toHaveBeenCalled(); + expect(queueSendBatch).not.toHaveBeenCalled(); + }); + + it('re-enqueues a partially completed next chunk so DLQ leftovers cannot strand a run', async () => { + const currentCaseIds = DECIDER_CASES.slice(0, 5).map(c => c.id); + const nextCaseIds = DECIDER_CASES.slice(5, 10).map(c => c.id); + vi.mocked(getExistingCaseResultIds) + .mockResolvedValueOnce(new Set(currentCaseIds)) + .mockResolvedValueOnce(new Set([nextCaseIds[0]])); + + await processJob(env, { ...deciderMessage(), caseIds: currentCaseIds }); + + 
expect(warmUpCliContainer).not.toHaveBeenCalled(); + expect(runDeciderCaseViaCli).not.toHaveBeenCalled(); + expect(upsertCaseResult).not.toHaveBeenCalled(); + expect(queueSendBatch).toHaveBeenCalledWith([ + { + body: { + runId, + kind: 'decider', + model, + chunk: 1, + shard: 0, + shardCount: 1, + rep: 0, + caseIds: nextCaseIds, + }, + }, + ]); + }); + + it('destroys the model-repetition shard container after the terminal chunk', async () => { + const terminalChunk = Math.floor((DECIDER_CASES.length - 1) / 5); + const terminalCaseIds = DECIDER_CASES.slice(terminalChunk * 5).map(c => c.id); + + await processJob(env, { + ...deciderMessage(), + chunk: terminalChunk, + shard: 3, + shardCount: 4, + caseIds: terminalCaseIds, + }); + + expect(queueSendBatch).not.toHaveBeenCalled(); + expect(destroyDeciderCliContainer).toHaveBeenCalledWith(env, { + instanceName: `${runId}:${model}:0:3`, + }); + expect(countCaseResults).toHaveBeenCalled(); + }); + + it('finalizes terminal chunks even when best-effort container destroy fails', async () => { + const terminalChunk = Math.floor((DECIDER_CASES.length - 1) / 5); + const terminalCaseIds = DECIDER_CASES.slice(terminalChunk * 5).map(c => c.id); + const warn = vi.spyOn(console, 'warn').mockImplementation(() => {}); + vi.mocked(destroyDeciderCliContainer).mockRejectedValueOnce(new Error('already stopped')); + + await processJob(env, { + ...deciderMessage(), + chunk: terminalChunk, + shard: 3, + shardCount: 4, + caseIds: terminalCaseIds, + }); + + expect(destroyDeciderCliContainer).toHaveBeenCalledWith(env, { + instanceName: `${runId}:${model}:0:3`, + }); + expect(warn).toHaveBeenCalledWith( + expect.stringContaining('benchmark_container_destroy_failed') + ); + expect(countCaseResults).toHaveBeenCalled(); + warn.mockRestore(); + }); +}); diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts index 9d40b883e0..4c38613658 100644 --- a/services/auto-routing-benchmark/src/run.test.ts 
+++ b/services/auto-routing-benchmark/src/run.test.ts @@ -5,7 +5,9 @@ import { buildClassifierMessages, buildDeciderMessages, chunkArray, + computeDeciderShardCount, computeEngineIdentity, + getDeciderContainerInstanceName, runCasesWithConcurrency, summarize, } from './run'; @@ -16,7 +18,7 @@ function makeRow(overrides: Partial = {}): CaseResultRow { run_id: 'run-1', model: 'model/a', case_id: 'case-1', - tier: null, + route_key: null, score: 1, latency_ms: 100, cost_usd: 0.001, @@ -34,12 +36,12 @@ function makeRow(overrides: Partial = {}): CaseResultRow { } describe('summarize — classifier kind', () => { - it('groups all classifier rows under * tier', () => { + it('groups all classifier rows under * route key', () => { const rows: CaseResultRow[] = [ makeRow({ model: 'model/a', case_id: 'c1', - tier: null, + route_key: null, score: 1, latency_ms: 100, cost_usd: 0.001, @@ -47,7 +49,7 @@ describe('summarize — classifier kind', () => { makeRow({ model: 'model/a', case_id: 'c2', - tier: null, + route_key: null, score: 0.5, latency_ms: 200, cost_usd: 0.002, @@ -58,7 +60,7 @@ describe('summarize — classifier kind', () => { expect(summaries).toHaveLength(1); const [s] = summaries; expect(s.model).toBe('model/a'); - expect(s.tier).toBe('*'); + expect(s.routeKey).toBe('*'); expect(s.cases).toBe(2); }); @@ -123,39 +125,65 @@ describe('summarize — classifier kind', () => { }); describe('summarize — decider kind', () => { - it('groups by tier', () => { + it('groups by taxonomy route key', () => { const rows: CaseResultRow[] = [ - makeRow({ model: 'model/a', case_id: 'low-1', tier: 'low', score: 1 }), - makeRow({ model: 'model/a', case_id: 'low-2', tier: 'low', score: 0 }), - makeRow({ model: 'model/a', case_id: 'med-1', tier: 'medium', score: 1 }), - makeRow({ model: 'model/b', case_id: 'low-3', tier: 'low', score: 1 }), + makeRow({ + model: 'model/a', + case_id: 'impl-1', + route_key: 'implementation/code_generation', + score: 1, + }), + makeRow({ + model: 'model/a', + 
case_id: 'impl-2', + route_key: 'implementation/code_generation', + score: 0, + }), + makeRow({ + model: 'model/a', + case_id: 'debug-1', + route_key: 'debugging/bug_fixing', + score: 1, + }), + makeRow({ + model: 'model/b', + case_id: 'impl-3', + route_key: 'implementation/code_generation', + score: 1, + }), ]; const summaries = summarize(rows, 'decider'); expect(summaries).toHaveLength(3); - const aLow = summaries.find(s => s.model === 'model/a' && s.tier === 'low'); - expect(aLow?.cases).toBe(2); - expect(aLow?.accuracy).toBe(0.5); + const aImpl = summaries.find( + s => s.model === 'model/a' && s.routeKey === 'implementation/code_generation' + ); + expect(aImpl?.cases).toBe(2); + expect(aImpl?.accuracy).toBe(0.5); - const aMed = summaries.find(s => s.model === 'model/a' && s.tier === 'medium'); - expect(aMed?.cases).toBe(1); - expect(aMed?.accuracy).toBe(1); + const aDebug = summaries.find( + s => s.model === 'model/a' && s.routeKey === 'debugging/bug_fixing' + ); + expect(aDebug?.cases).toBe(1); + expect(aDebug?.accuracy).toBe(1); - const bLow = summaries.find(s => s.model === 'model/b' && s.tier === 'low'); - expect(bLow?.cases).toBe(1); + const bImpl = summaries.find( + s => s.model === 'model/b' && s.routeKey === 'implementation/code_generation' + ); + expect(bImpl?.cases).toBe(1); }); - it('uses * fallback when tier is null', () => { - const rows: CaseResultRow[] = [makeRow({ tier: null, score: 1 })]; + it('uses * fallback when route key is null', () => { + const rows: CaseResultRow[] = [makeRow({ route_key: null, score: 1 })]; const [s] = summarize(rows, 'decider'); - expect(s.tier).toBe('*'); + expect(s.routeKey).toBe('*'); }); it('computes avgLatencyMs as rounded mean', () => { const rows: CaseResultRow[] = [ - makeRow({ case_id: 'c1', tier: 'low', latency_ms: 100 }), - makeRow({ case_id: 'c2', tier: 'low', latency_ms: 301 }), + makeRow({ case_id: 'c1', route_key: 'implementation/code_generation', latency_ms: 100 }), + makeRow({ case_id: 'c2', route_key: 
'implementation/code_generation', latency_ms: 301 }), ]; const [s] = summarize(rows, 'decider'); @@ -163,7 +191,9 @@ describe('summarize — decider kind', () => { }); it('handles single-element groups for p50', () => { - const rows: CaseResultRow[] = [makeRow({ tier: 'high', latency_ms: 500 })]; + const rows: CaseResultRow[] = [ + makeRow({ route_key: 'implementation/code_generation', latency_ms: 500 }), + ]; const [s] = summarize(rows, 'decider'); expect(s.p50LatencyMs).toBe(500); }); @@ -266,7 +296,7 @@ describe('chunkArray', () => { describe('pickClassifierWinner', () => { const summary = (model: string, accuracy: number, avgCostUsd: number | null) => ({ model, - tier: '*' as const, + routeKey: '*' as const, accuracy, avgCostUsd, avgLatencyMs: 100, @@ -298,9 +328,12 @@ describe('pickClassifierWinner', () => { expect(winner?.model).toBe('cheap'); }); - it('ignores decider-tier summaries and returns null when nothing is graded', () => { + it('ignores decider route summaries and returns null when nothing is graded', () => { expect( - pickClassifierWinner([{ ...summary('m', 1, 0.001), tier: 'low' as const }], 0.7) + pickClassifierWinner( + [{ ...summary('m', 1, 0.001), routeKey: 'implementation/code_generation' as const }], + 0.7 + ) ).toBeNull(); expect(pickClassifierWinner([], 0.7)).toBeNull(); }); @@ -313,7 +346,7 @@ describe('pickClassifierWinner', () => { p95: number | null = 90 ) => ({ model, - tier: '*' as const, + routeKey: '*' as const, accuracy, avgCostUsd, avgLatencyMs: 100, @@ -412,8 +445,7 @@ describe('summarize — p95 and timeouts', () => { }); describe('decider message fan-out', () => { - it('DECIDER_CHUNK_SIZE is 5 (chunk count for 76 cases)', () => { - // DECIDER_CASES = 76, chunk size 5 → ceil(76/5) = 16 chunks + it('DECIDER_CHUNK_SIZE is 5', () => { const chunks = chunkArray( Array.from({ length: 76 }, (_, i) => String(i)), 5 @@ -429,45 +461,95 @@ describe('decider message fan-out', () => { kind: 'decider', model: 'm1', rep: 2, + shard: 1, + 
shardCount: 4, caseIds: ['a'], chunk: 0, }); expect(withRep.rep).toBe(2); + expect(withRep.shard).toBe(1); + expect(withRep.shardCount).toBe(4); }); - it('buildDeciderMessages: produces models × reps × ceil(76/5) messages with correct rep', () => { - // 76 cases, chunk size 5 → 16 chunks - const cases76 = Array.from({ length: 76 }, (_, i) => ({ id: `case-${i}` })); - const chunks = chunkArray(cases76, 5); - expect(chunks).toHaveLength(16); + it('computeDeciderShardCount maximizes shard lanes under the live container cap', () => { + expect(computeDeciderShardCount({ modelCount: 2, repetitions: 3, chunkCount: 36 })).toBe(16); + expect( + computeDeciderShardCount({ + modelCount: 7, + repetitions: 1, + chunkCount: 36, + maxLiveContainers: 100, + }) + ).toBe(14); + expect( + computeDeciderShardCount({ + modelCount: 25, + repetitions: 1, + chunkCount: 36, + maxLiveContainers: 100, + }) + ).toBe(4); + expect( + computeDeciderShardCount({ + modelCount: 10, + repetitions: 3, + chunkCount: 36, + maxLiveContainers: 100, + }) + ).toBe(3); + expect( + computeDeciderShardCount({ + modelCount: 101, + repetitions: 1, + chunkCount: 36, + maxLiveContainers: 100, + }) + ).toBe(0); + }); + + it('buildDeciderMessages: seeds sharded chunk lanes under the container cap', () => { + const cases180 = Array.from({ length: 180 }, (_, i) => ({ id: `case-${i}` })); + const chunks = chunkArray(cases180, 5); + expect(chunks).toHaveLength(36); const models = ['model/a', 'model/b']; const repetitions = 3; const messages = buildDeciderMessages('run-test', 'decider', models, repetitions, chunks); + const expectedShardCount = 16; - // Total: 2 models × 3 reps × 16 chunks = 96 messages - expect(messages).toHaveLength(models.length * repetitions * chunks.length); + // Initial fan-out is bounded by the 100-container budget while running + // multiple independent chunk lanes per model/repetition. 
+ expect(messages).toHaveLength(models.length * repetitions * expectedShardCount); + expect(messages.length).toBeLessThanOrEqual(100); - // Each rep index (0..2) should appear exactly models.length × chunks.length times for (let rep = 0; rep < repetitions; rep++) { const forRep = messages.filter(m => m.body.rep === rep); - expect(forRep).toHaveLength(models.length * chunks.length); + expect(forRep).toHaveLength(models.length * expectedShardCount); } - // Every message carries the correct rep in its body for (const { body } of messages) { expect(typeof body.rep).toBe('number'); expect(body.rep).toBeGreaterThanOrEqual(0); expect(body.rep).toBeLessThan(repetitions); + expect(body.shardCount).toBe(expectedShardCount); + expect(body.shard).toBeGreaterThanOrEqual(0); + expect(body.shard).toBeLessThan(expectedShardCount); + expect(body.chunk).toBe(body.shard); + expect(body.caseIds).toEqual(chunks[body.shard!]?.map(c => c.id)); } + }); - // caseIds on each message match the chunk - for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) { - const forChunk = messages.filter(m => m.body.chunk === chunkIdx); - for (const { body } of forChunk) { - expect(body.caseIds).toEqual(chunks[chunkIdx].map(c => c.id)); - } - } + it('getDeciderContainerInstanceName reuses one container per model repetition shard', () => { + const base = { runId: 'run-test', kind: 'decider' as const, model: 'model/a', rep: 2 }; + expect(getDeciderContainerInstanceName({ ...base, chunk: 0, shard: 0 })).toBe( + 'run-test:model/a:2:0' + ); + expect(getDeciderContainerInstanceName({ ...base, chunk: 16, shard: 0 })).toBe( + 'run-test:model/a:2:0' + ); + expect(getDeciderContainerInstanceName({ ...base, chunk: 1, shard: 1 })).toBe( + 'run-test:model/a:2:1' + ); }); }); diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts index 326134cdd7..f4aa1dbd26 100644 --- a/services/auto-routing-benchmark/src/run.ts +++ b/services/auto-routing-benchmark/src/run.ts @@ 
-5,6 +5,7 @@ import { type BenchmarkDeciderModel, type BenchmarkKind, type BenchmarkModelSummary, + taxonomyRouteKey, } from '@kilocode/auto-routing-contracts'; import { formatError } from '@kilocode/worker-utils'; import * as z from 'zod'; @@ -16,6 +17,7 @@ import { countCaseResults, existsNewerCompletedRun, getCaseResults, + getExistingCaseResultIds, getLatestSummariesByModel, getRunningRun, getRunWithModels, @@ -33,7 +35,12 @@ import { import { gradeClassifierOutput, runDeciderCheck } from './grading'; import { createOpenRouterClient } from './openrouter'; import { buildRoutingTable } from './routing-table-builder'; -import { runDeciderCaseViaCli, warmUpCliContainer } from './cli-runner'; +import { + destroyDeciderCliContainer, + isRetryableContainerAvailabilityError, + runDeciderCaseViaCli, + warmUpCliContainer, +} from './cli-runner'; import { pickClassifierWinner } from './winner'; export type BenchmarkJobMessage = { @@ -41,9 +48,11 @@ export type BenchmarkJobMessage = { kind: BenchmarkKind; model: string; // The case ids this message is responsible for, plus the chunk index. Decider - // chunks also use this index to key their container instance. + // chunks are split across shard lanes; each lane has one stable container. caseIds?: string[]; chunk?: number; + shard?: number; + shardCount?: number; // Repetition index (0-based). rep?: number; }; @@ -54,6 +63,8 @@ export const BenchmarkJobMessageSchema = z.object({ model: z.string().min(1), caseIds: z.array(z.string().min(1)).optional(), chunk: z.number().int().min(0).optional(), + shard: z.number().int().min(0).optional(), + shardCount: z.number().int().min(1).optional(), rep: z.number().int().min(0).optional(), }); @@ -67,9 +78,13 @@ const DECIDER_CHUNK_SIZE = 5; // keep it below Cloudflare Queues' 15-minute wall-clock limit. const CLASSIFIER_CHUNK_SIZE = 1; -// Cloudflare Queues caps a single sendBatch at 100 messages. 
A decider fan-out -// is models × reps × ceil(76 / 5) messages, which clears 100 with as few as two -// models, so the dispatch must be sliced. +// Cloudflare Containers cap for the benchmark runner. Sharded decider fan-out +// uses this as the live-container budget. +export const DECIDER_CONTAINER_INSTANCE_CAP = 100; + +// Cloudflare Queues caps a single sendBatch at 100 messages. Classifier fan-out +// can exceed that because each classifier case is its own message, so dispatch +// must be sliced. const QUEUE_SEND_BATCH_LIMIT = 100; export function chunkArray(items: readonly T[], size: number): T[][] { @@ -80,6 +95,24 @@ export function chunkArray(items: readonly T[], size: number): T[][] { return chunks; } +export function computeDeciderShardCount({ + modelCount, + repetitions, + chunkCount, + maxLiveContainers = DECIDER_CONTAINER_INSTANCE_CAP, +}: { + modelCount: number; + repetitions: number; + chunkCount: number; + maxLiveContainers?: number; +}): number { + if (modelCount <= 0 || repetitions <= 0 || chunkCount <= 0) return 0; + const modelRepetitions = modelCount * repetitions; + const shardsPerModelRepetition = Math.floor(maxLiveContainers / modelRepetitions); + if (shardsPerModelRepetition <= 0) return 0; + return Math.min(chunkCount, shardsPerModelRepetition); +} + // Enqueues messages in sendBatch-sized slices. A mid-dispatch failure leaves a // partially-enqueued run that can never reach its expected result count, so the // run is marked failed (surfacing in the admin panel) before the throw @@ -138,36 +171,64 @@ export function computeEngineIdentity(kind: BenchmarkKind): string { const datasetSignature = kind === 'classifier' ? 
CLASSIFIER_CASES.map(c => ({ id: c.id, expected: c.expected })) - : DECIDER_CASES.map(c => ({ id: c.id, tier: c.tier, check: c.check })); + : DECIDER_CASES.map(c => ({ + id: c.id, + taskType: c.taskType, + subtaskType: c.subtaskType, + check: c.check, + })); return `v${BENCHMARK_ENGINE_VERSION}:${fnv1aHex(JSON.stringify(datasetSignature))}`; } -/** Pure helper: produces the sendBatch bodies for a decider run fan-out. - * Extracted for unit-testability; the shape is models × reps × chunks messages. +/** Pure helper: produces the initial sendBatch bodies for a decider run. + * Extracted for unit-testability; the shape is models × reps messages. Later + * chunks are chained by processDeciderJob after the previous chunk completes. */ export function buildDeciderMessages( runId: string, kind: BenchmarkKind, modelIds: string[], repetitions: number, - chunks: readonly (readonly { id: string }[])[] + chunks: readonly (readonly { id: string }[])[], + maxLiveContainers: number = DECIDER_CONTAINER_INSTANCE_CAP ): { body: BenchmarkJobMessage }[] { + const shardCount = computeDeciderShardCount({ + modelCount: modelIds.length, + repetitions, + chunkCount: chunks.length, + maxLiveContainers, + }); + if (shardCount === 0) return []; return modelIds.flatMap(model => Array.from({ length: repetitions }, (_, rep) => - chunks.map((chunkCases, chunk) => ({ - body: { - runId, - kind, - model, - chunk, - rep, - caseIds: chunkCases.map(c => c.id), - } satisfies BenchmarkJobMessage, - })) + Array.from({ length: shardCount }, (_, shard) => { + const chunkCases = chunks[shard]; + if (!chunkCases) return []; + return [ + { + body: { + runId, + kind, + model, + chunk: shard, + shard, + shardCount, + rep, + caseIds: chunkCases.map(c => c.id), + } satisfies BenchmarkJobMessage, + }, + ]; + }).flat() ).flat() ); } +export function getDeciderContainerInstanceName( + message: Pick +): string { + return `${message.runId}:${message.model}:${message.rep ?? 0}:${message.shard ?? 
0}`; +} + export function buildClassifierMessages( runId: string, modelIds: string[], @@ -202,6 +263,33 @@ export class RunAlreadyActiveError extends Error { } } +// Thrown when the saved benchmark config would exceed a hard runtime limit. +// The admin route maps it to HTTP 400 so operators can fix config instead of +// starting a run that will immediately hit platform capacity. +export class BenchmarkRunConfigError extends Error { + constructor(message: string) { + super(message); + this.name = 'BenchmarkRunConfigError'; + } +} + +function validateDeciderContainerBudget({ + modelCount, + repetitions, + maxLiveContainers, +}: { + modelCount: number; + repetitions: number; + maxLiveContainers: number; +}): void { + const modelRepetitions = modelCount * repetitions; + if (modelRepetitions <= maxLiveContainers) return; + + throw new BenchmarkRunConfigError( + `decider benchmark requires at least one live container lane per model repetition (${modelRepetitions}), but maxConcurrency is ${maxLiveContainers}; reduce decider models/repetitions before starting` + ); +} + export async function startRun( env: Env, kind: BenchmarkKind, @@ -264,6 +352,14 @@ export async function startRun( 'benchmark user not configured: set benchmarkUserId before running the decider benchmark' ); } + const maxLiveDeciderContainers = Math.min(config.maxConcurrency, DECIDER_CONTAINER_INSTANCE_CAP); + if (kind === 'decider') { + validateDeciderContainerBudget({ + modelCount: enqueuedModelIds.length, + repetitions, + maxLiveContainers: maxLiveDeciderContainers, + }); + } const startedAt = new Date().toISOString(); const runId = `${kind}-${startedAt.replace(/[:.]/g, '-')}`; @@ -341,10 +437,18 @@ export async function startRun( return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels }; } - // Decider: one message per (model, rep, chunk) so each queue invocation stays - // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES × repetitions rows. 
+ // Decider: seed as many shard lanes as fit under the live-container cap. Each + // completed chunk enqueues the next chunk for the same lane, so one stable + // container handles chunk N, N+shardCount, N+(2*shardCount), ... const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE); - const messages = buildDeciderMessages(runId, kind, enqueuedModelIds, repetitions, chunks); + const messages = buildDeciderMessages( + runId, + kind, + enqueuedModelIds, + repetitions, + chunks, + maxLiveDeciderContainers + ); await enqueueRunMessages(env, runId, messages); return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels }; } @@ -367,6 +471,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise { const message = parsed.data; const state = await getRunState(env, message.runId); + let shouldFinalize = true; if (message.kind === 'classifier') { if (!message.caseIds?.length || message.rep === undefined) { console.warn( @@ -400,7 +505,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise { run_id: message.runId, model: message.model, case_id: benchCase.id, - tier: null, + route_key: null, score, latency_ms: Math.round(performance.now() - startedAt), cost_usd: result.cost, @@ -423,10 +528,13 @@ export async function processJob(env: Env, rawMessage: unknown): Promise { } ); } else { - await processDeciderJob(env, message, state); + const result = await processDeciderJob(env, message, state); + shouldFinalize = result.shouldFinalize; } - await finalizeRunIfComplete(env, message.runId, message.kind, state); + if (shouldFinalize) { + await finalizeRunIfComplete(env, message.runId, message.kind, state); + } } type RunState = { @@ -461,15 +569,26 @@ async function processDeciderJob( env: Env, message: BenchmarkJobMessage, state: RunState -): Promise { +): Promise<{ shouldFinalize: boolean }> { // Decider messages always carry their chunk's case ids; anything else is // malformed and dropped (same policy as unparseable 
messages). if (!message.caseIds?.length) { console.warn(JSON.stringify({ event: 'benchmark_job_missing_case_ids', runId: message.runId })); - return; + return { shouldFinalize: false }; } const caseIds = new Set(message.caseIds); const cases = DECIDER_CASES.filter(c => caseIds.has(c.id)); + if (cases.length === 0) { + console.warn( + JSON.stringify({ + event: 'benchmark_job_empty_case_chunk', + runId: message.runId, + model: message.model, + chunk: message.chunk ?? 0, + }) + ); + return { shouldFinalize: false }; + } if (!state.benchmarkUserId) { // startRun fails fast before enqueueing, so this only happens if the run @@ -477,83 +596,163 @@ async function processDeciderJob( throw new Error(`run ${message.runId} has no benchmarkUserId`); } - // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the - // queue retries the message. The token is never logged. - const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId); const rep = message.rep ?? 0; - const instanceName = `${message.runId}:${message.model}:${rep}:${message.chunk ?? 0}`; + const chunk = message.chunk ?? 0; + const shard = message.shard ?? 0; + const shardCount = message.shardCount ?? 1; + const instanceName = getDeciderContainerInstanceName(message); + + const existingCaseIds = await getExistingCaseResultIds(env.BENCH_DB, { + runId: message.runId, + model: message.model, + rep, + caseIds: cases.map(c => c.id), + }); + const casesToRun = cases.filter(c => !existingCaseIds.has(c.id)); // Reasoning effort comes from the run snapshot (run_models row), not live config. const modelRow = state.models.find(m => m.model === message.model); const reasoningEffort = modelRow?.reasoning_effort ?? null; - // Fresh container instances run the CLI's one-time sqlite migration; the - // container owns that via its /warmup endpoint so the first real case - // doesn't burn its timeout on it. Failures are non-fatal: the first case - // simply absorbs whatever warmup work remains. 
- await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch(() => {}); - - // Concurrency 1: the CLI's sqlite state in the container is not safe under - // concurrent sessions (partial-migration crashes); the container serializes - // too, so higher concurrency here would only hold HTTP requests open. - await runCasesWithConcurrency(cases, 1, async benchCase => { - const startedAt = performance.now(); - try { - let result = await runDeciderCaseViaCli(env, { - instanceName, - model: message.model, - benchCase, - kiloToken, - reasoningEffort, - }); - // The CLI occasionally ends a session with no assistant text at all - // (transient empty completion: a lone step_finish with cost 0). Mirror - // the production classifier's policy and retry once. - let retried = false; - if (result.exitCode === 0 && result.text.length === 0) { - retried = true; - const retry = await runDeciderCaseViaCli(env, { + if (casesToRun.length > 0) { + // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the + // queue retries the message. The token is never logged. + const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId); + + // Fresh container instances run the CLI's one-time sqlite migration; the + // container owns that via its /warmup endpoint so the first real case + // doesn't burn its timeout on it. Ordinary warmup failures are non-fatal: + // the first case absorbs whatever warmup work remains. Container capacity + // failures are infrastructure pressure, so the queue retries the message. + await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch( + error => { + if (isRetryableContainerAvailabilityError(error)) throw error; + } + ); + + // Concurrency 1: the CLI's sqlite state in the container is not safe under + // concurrent sessions (partial-migration crashes); the container serializes + // too, so higher concurrency here would only hold HTTP requests open. 
+ await runCasesWithConcurrency(casesToRun, 1, async benchCase => { + const startedAt = performance.now(); + try { + let result = await runDeciderCaseViaCli(env, { instanceName, model: message.model, benchCase, kiloToken, reasoningEffort, }); - retry.costUsd = - retry.costUsd === null && result.costUsd === null - ? null - : (retry.costUsd ?? 0) + (result.costUsd ?? 0); - result = retry; + // The CLI occasionally ends a session with no assistant text at all + // (transient empty completion: a lone step_finish with cost 0). Mirror + // the production classifier's policy and retry once. + let retried = false; + if (result.exitCode === 0 && result.text.length === 0) { + retried = true; + const retry = await runDeciderCaseViaCli(env, { + instanceName, + model: message.model, + benchCase, + kiloToken, + reasoningEffort, + }); + retry.costUsd = + retry.costUsd === null && result.costUsd === null + ? null + : (retry.costUsd ?? 0) + (result.costUsd ?? 0); + result = retry; + } + const succeeded = + result.exitCode === 0 && + result.text.length > 0 && + runDeciderCheck(benchCase.check, result.text); + await upsertCaseResult(env.BENCH_DB, { + run_id: message.runId, + model: message.model, + case_id: benchCase.id, + route_key: taxonomyRouteKey(benchCase), + score: succeeded ? 1 : 0, + latency_ms: result.latencyMs, + cost_usd: result.costUsd, + error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null, + fallback_reason: null, + retried, + exit_code: result.exitCode, + output_prefix: result.text.slice(0, 200), + event_count: result.eventCount, + last_event_types: result.lastEventTypes.join(' '), + rep, + timed_out: result.timedOut ? 
1 : 0, + }); + } catch (error) { + if (isRetryableContainerAvailabilityError(error)) throw error; + await upsertCaseResult( + env.BENCH_DB, + failedRow(message, benchCase.id, taxonomyRouteKey(benchCase), startedAt, error, rep) + ); } - const succeeded = - result.exitCode === 0 && - result.text.length > 0 && - runDeciderCheck(benchCase.check, result.text); - await upsertCaseResult(env.BENCH_DB, { - run_id: message.runId, - model: message.model, - case_id: benchCase.id, - tier: benchCase.tier, - score: succeeded ? 1 : 0, - latency_ms: result.latencyMs, - cost_usd: result.costUsd, - error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null, - fallback_reason: null, - retried, - exit_code: result.exitCode, - output_prefix: result.text.slice(0, 200), - event_count: result.eventCount, - last_event_types: result.lastEventTypes.join(' '), - rep, - timed_out: result.timedOut ? 1 : 0, - }); - } catch (error) { - await upsertCaseResult( - env.BENCH_DB, - failedRow(message, benchCase.id, benchCase.tier, startedAt, error, rep) + }); + } + + const hasNextChunk = await enqueueNextDeciderChunkIfNeeded( + env, + message, + rep, + chunk, + shard, + shardCount + ); + if (!hasNextChunk) { + await destroyDeciderCliContainer(env, { instanceName }).catch(error => { + console.warn( + JSON.stringify({ + event: 'benchmark_container_destroy_failed', + instanceName, + ...formatError(error), + }) ); - } + }); + } + return { shouldFinalize: !hasNextChunk }; +} + +async function enqueueNextDeciderChunkIfNeeded( + env: Env, + message: BenchmarkJobMessage, + rep: number, + chunk: number, + shard: number, + shardCount: number +): Promise { + const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE); + const nextChunkIndex = chunk + shardCount; + const nextChunk = chunks[nextChunkIndex]; + if (!nextChunk) return false; + + const nextCaseIds = nextChunk.map(c => c.id); + const existingNextCaseIds = await getExistingCaseResultIds(env.BENCH_DB, { + runId: message.runId, + model: 
message.model, + rep, + caseIds: nextCaseIds, }); + if (existingNextCaseIds.size >= nextCaseIds.length) return true; + + await env.BENCH_QUEUE.sendBatch([ + { + body: { + runId: message.runId, + kind: 'decider', + model: message.model, + chunk: nextChunkIndex, + shard, + shardCount, + rep, + caseIds: nextCaseIds, + } satisfies BenchmarkJobMessage, + }, + ]); + return true; } const TokenResponseSchema = z.object({ token: z.string().min(1), expiresAt: z.string() }); @@ -587,7 +786,7 @@ export async function fetchBenchmarkUserToken(env: Env, userId: string): Promise function failedRow( message: BenchmarkJobMessage, caseId: string, - tier: string | null, + routeKey: string | null, startedAt: number, error: unknown, rep: number = 0 @@ -596,7 +795,7 @@ function failedRow( run_id: message.runId, model: message.model, case_id: caseId, - tier, + route_key: routeKey, score: 0, latency_ms: Math.round(performance.now() - startedAt), cost_usd: null, @@ -729,13 +928,12 @@ async function finalizeRunIfComplete( } export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] { - // Group by "model tier-key" using a plain reduce so this works in all runtimes. - // Classifier rows use '*' as the tier (no tiering); decider rows use the actual tier - // (falling back to '*' when tier is null). + // Group by "model route-key" using a plain reduce so this works in all runtimes. + // Classifier rows use '*' because classification has no decider taxonomy route. const groups = new Map(); for (const row of rows) { - const tierKey = kind === 'classifier' ? '*' : (row.tier ?? '*'); - const key = `${row.model}\0${tierKey}`; + const routeKey = kind === 'classifier' ? '*' : (row.route_key ?? 
'*'); + const key = `${row.model}\0${routeKey}`; const existing = groups.get(key); if (existing) { existing.push(row); @@ -745,7 +943,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark } return [...groups.entries()].map(([key, group]) => { - const [model, tier] = key.split('\0'); + const [model, routeKey] = key.split('\0'); const latencies = group.map(r => r.latency_ms).toSorted((a, b) => a - b); const costs = group.filter(r => r.cost_usd !== null); const p95LatencyMs = @@ -755,7 +953,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark : null; return { model, - tier: tier as BenchmarkModelSummary['tier'], + routeKey: routeKey as BenchmarkModelSummary['routeKey'], accuracy: Number((group.reduce((a, r) => a + r.score, 0) / group.length).toFixed(4)), avgCostUsd: costs.length ? Number((costs.reduce((a, r) => a + (r.cost_usd ?? 0), 0) / costs.length).toFixed(8)) diff --git a/services/auto-routing-benchmark/src/winner.ts b/services/auto-routing-benchmark/src/winner.ts index 318809c4a3..952b329bb3 100644 --- a/services/auto-routing-benchmark/src/winner.ts +++ b/services/auto-routing-benchmark/src/winner.ts @@ -1,6 +1,6 @@ import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts'; -// Picks the best classifier candidate from summaries (tier '*') applying: +// Picks the best classifier candidate from summaries (routeKey '*') applying: // 1. Accuracy gate: must meet minAccuracy. // 2. Optional p95 latency gate: when maxP95LatencyMs is non-null, prefer // candidates whose measured p95 latency is within budget. 
@@ -16,7 +16,7 @@ export function pickClassifierWinner( minAccuracy: number, maxP95LatencyMs: number | null = null ): BenchmarkModelSummary | null { - const graded = summaries.filter(s => s.tier === '*' && s.cases > 0); + const graded = summaries.filter(s => s.routeKey === '*' && s.cases > 0); if (graded.length === 0) return null; const cost = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY; const p95 = (s: BenchmarkModelSummary) => s.p95LatencyMs ?? Number.POSITIVE_INFINITY; diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc index 9faeb19ac4..c0433b1073 100644 --- a/services/auto-routing-benchmark/wrangler.jsonc +++ b/services/auto-routing-benchmark/wrangler.jsonc @@ -32,7 +32,7 @@ "class_name": "BenchRunnerContainer", "image": "./container/Dockerfile", "instance_type": "standard-2", - "max_instances": 50, + "max_instances": 100, }, ], "durable_objects": { @@ -53,8 +53,9 @@ { "queue": "auto-routing-benchmark-jobs", "max_batch_size": 1, - "max_retries": 2, - "max_concurrency": 4, + "max_retries": 6, + "retry_delay": 180, + "max_concurrency": 100, "dead_letter_queue": "auto-routing-benchmark-dlq", }, ], diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts index fd476a5668..bd89638137 100644 --- a/services/auto-routing/src/decide.ts +++ b/services/auto-routing/src/decide.ts @@ -254,7 +254,8 @@ function recordDecision( mode: ctx.payload.mode, uaPrefix: ctx.payload.userAgent?.slice(0, 40) ?? null, decidedModel: decision?.model ?? null, - decidedTier: decision?.tier ?? null, + decidedTaskType: decision?.taskType ?? null, + decidedSubtaskType: decision?.subtaskType ?? null, decisionSource: decision?.source ?? null, sticky: decision?.sticky ?? 
null, ...summary.details, diff --git a/services/auto-routing/src/decision-cache.ts b/services/auto-routing/src/decision-cache.ts index ae98778688..0aed20c63e 100644 --- a/services/auto-routing/src/decision-cache.ts +++ b/services/auto-routing/src/decision-cache.ts @@ -82,7 +82,7 @@ function entryKey(contentHash: string, classifierModel: string): string { // Single per-conversation slot remembering the last model the decision // engine served, so the session can stay on it (keeping the provider's -// prompt cache warm) instead of ping-ponging when its tier oscillates. +// prompt cache warm) instead of ping-ponging when its route oscillates. // Cannot collide with classification keys, which always contain a ':'. const STICKY_DECISION_KEY = 'sticky'; diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts index b10fcc2e47..ab137ccd47 100644 --- a/services/auto-routing/src/decision-engine.test.ts +++ b/services/auto-routing/src/decision-engine.test.ts @@ -19,8 +19,8 @@ const table: RoutingTable = { minAccuracy: 0.7, switchCostFactor: 3, source: 'benchmark', - tiers: { - low: [ + routes: { + 'implementation/code_generation': [ { model: 'cheap/chat', accuracy: 0.85, @@ -47,7 +47,7 @@ const table: RoutingTable = { meetsThreshold: false, }, ], - medium: [ + 'debugging/bug_fixing': [ { model: 'mid/chat', accuracy: 0.8, @@ -55,7 +55,7 @@ const table: RoutingTable = { meetsThreshold: true, }, ], - high: [ + 'planning_design/system_design': [ { model: 'big/chat', accuracy: 0.9, @@ -67,59 +67,39 @@ const table: RoutingTable = { }; describe('computeDecision', () => { - it('picks the first candidate of the tier', () => { + it('picks the first candidate of the classifier taxonomy route', () => { const decision = computeDecision(classification, table, null); expect(decision).toEqual({ model: 'cheap/chat', - tier: 'low', + taskType: 'implementation', + subtaskType: 'code_generation', source: 'benchmark', tableVersion: 
'run-1', reasoningEffort: null, sticky: false, }); }); - it('uses the tier derived from the classification', () => { - const hard: ClassifierOutput = { + it('uses the classifier task type and subtype directly', () => { + const debugging: ClassifierOutput = { ...classification, - reasoningComplexity: 'high', - contextComplexity: 'large', - executionMode: 'multi_step_project', + taskType: 'debugging', + subtaskType: 'bug_fixing', }; - expect(computeDecision(hard, table, null)?.model).toBe('big/chat'); - }); - it('returns a decision for every tier of a valid table', () => { - const byTier: Array<[ClassifierOutput, string]> = [ - [classification, 'cheap/chat'], - [ - { ...classification, reasoningComplexity: 'medium', contextComplexity: 'medium' }, - 'mid/chat', - ], - [ - { - ...classification, - reasoningComplexity: 'high', - contextComplexity: 'large', - executionMode: 'multi_step_project', - }, - 'big/chat', - ], - ]; - for (const [input, expected] of byTier) { - expect(computeDecision(input, table, null)?.model).toBe(expected); - } + expect(computeDecision(debugging, table, null)?.model).toBe('mid/chat'); }); it('returns null when there is no routing table', () => { expect(computeDecision(classification, null, null)).toBeNull(); }); describe('session stickiness', () => { - it('keeps the incumbent on tier de-escalation when it is within the switch-cost factor', () => { + it('keeps the incumbent on route changes when it is within the switch-cost factor', () => { // Fresh pick cheap/chat at 0.002; mid/chat at 0.005 is not cheaper by // more than 3x (0.002 * 3 = 0.006 >= 0.005), so the session stays put. const decision = computeDecision(classification, table, 'mid/chat'); expect(decision).toEqual({ model: 'mid/chat', - tier: 'low', + taskType: 'implementation', + subtaskType: 'code_generation', source: 'benchmark', tableVersion: 'run-1', // The incumbent's benchmarked effort, not the fresh pick's. 
@@ -132,11 +112,19 @@ describe('computeDecision', () => { // Integer costs avoid float noise on the equality case (1 * 3 === 3). const boundaryTable: RoutingTable = { ...table, - tiers: { - ...table.tiers, - low: [ - { ...table.tiers.low[0]!, model: 'fresh/chat', avgCostUsd: 1 }, - { ...table.tiers.low[1]!, model: 'incumbent/chat', avgCostUsd: 3 }, + routes: { + ...table.routes, + 'implementation/code_generation': [ + { + ...table.routes['implementation/code_generation'][0]!, + model: 'fresh/chat', + avgCostUsd: 1, + }, + { + ...table.routes['implementation/code_generation'][1]!, + model: 'incumbent/chat', + avgCostUsd: 3, + }, ], }, }; @@ -148,11 +136,11 @@ describe('computeDecision', () => { const decision = computeDecision(classification, table, 'pricey/chat'); expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false }); }); - it('switches when the incumbent no longer meets the tier threshold', () => { + it('switches when the incumbent no longer meets the route threshold', () => { const decision = computeDecision(classification, table, 'weak/chat'); expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false }); }); - it('serves the fresh pick when the incumbent is not in the tier', () => { + it('serves the fresh pick when the incumbent is not in the route', () => { const decision = computeDecision(classification, table, 'gone/model'); expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false }); }); diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts index 0d641e069d..aaa7aba542 100644 --- a/services/auto-routing/src/decision-engine.ts +++ b/services/auto-routing/src/decision-engine.ts @@ -1,5 +1,5 @@ import { - deriveDifficultyTier, + taxonomyRouteKey, type AutoRoutingDecision, type ClassifierOutput, type RoutingTable, @@ -11,14 +11,13 @@ export function computeDecision( incumbentModel: string | null ): AutoRoutingDecision | null { if (!table) return null; - const tier = 
deriveDifficultyTier(classification); - const candidates = table.tiers[tier]; - // A parsed table guarantees a non-empty tier (schema .min(1)), so with a - // table and a classification a decision always exists. + const routeKey = taxonomyRouteKey(classification); + const candidates = table.routes[routeKey]; + if (!candidates?.length) return null; const freshPick = candidates[0]; // Keep the session on its incumbent model when it is still good enough for - // the current tier. A model switch discards the provider's prompt cache, + // the current taxonomy route. A model switch discards the provider's prompt cache, // and rebuilding it costs full-price input tokens (4-10x cache-read rates) // on a context that dominates agent-session spend — so a switch is only // worth it when the fresh pick's recurring per-turn savings clearly exceed @@ -33,7 +32,8 @@ export function computeDecision( ) { return { model: incumbent.model, - tier, + taskType: classification.taskType, + subtaskType: classification.subtaskType, source: table.source, tableVersion: table.version, reasoningEffort: incumbent.reasoningEffort ?? null, @@ -43,7 +43,8 @@ export function computeDecision( return { model: freshPick.model, - tier, + taskType: classification.taskType, + subtaskType: classification.subtaskType, source: table.source, tableVersion: table.version, reasoningEffort: freshPick.reasoningEffort ?? 
null, diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts index 4519c7c310..220d443fbb 100644 --- a/services/auto-routing/src/index.test.ts +++ b/services/auto-routing/src/index.test.ts @@ -87,17 +87,15 @@ const benchmarkRoutingTable = { minAccuracy: 0.7, switchCostFactor: 3, source: 'benchmark', - tiers: { - low: [ + routes: { + 'implementation/feature_development': [ { model: 'google/gemini-2.5-flash-lite', accuracy: 0.9, - avgCostUsd: 0.001, + avgCostUsd: 0.002, meetsThreshold: true, reasoningEffort: null, }, - ], - medium: [ { model: 'google/gemini-2.5-flash', accuracy: 0.85, @@ -105,9 +103,9 @@ const benchmarkRoutingTable = { meetsThreshold: true, reasoningEffort: null, }, - // The high-tier model also qualifies for medium, within the 3x + // The planning route's model also qualifies for implementation, within the 3x // switch-cost factor of the fresh pick (0.002 * 3 >= 0.005): a session - // de-escalating from high stays on it. + // moving routes stays on it. 
{ model: 'anthropic/claude-sonnet-4.6', accuracy: 0.8, @@ -116,7 +114,7 @@ const benchmarkRoutingTable = { reasoningEffort: null, }, ], - high: [ + 'planning_design/system_design': [ { model: 'anthropic/claude-sonnet-4.6', accuracy: 0.8, @@ -235,7 +233,8 @@ describe('auto routing worker', () => { cost: 0.00000123, decision: { model: expect.any(String), - tier: expect.stringMatching(/^(low|medium|high)$/), + taskType: 'implementation', + subtaskType: 'feature_development', source: 'benchmark', tableVersion: 'bench-run-1', reasoningEffort: null, @@ -300,7 +299,8 @@ describe('auto routing worker', () => { cost: 0, decision: { model: expect.any(String), - tier: expect.stringMatching(/^(low|medium|high)$/), + taskType: 'implementation', + subtaskType: 'feature_development', source: 'benchmark', tableVersion: 'bench-run-1', reasoningEffort: null, @@ -331,7 +331,7 @@ describe('auto routing worker', () => { ); }); - it('keeps the session on the incumbent model when the tier de-escalates', async () => { + it('keeps the session on the incumbent model when the taxonomy route changes', async () => { // Back the mocked DO stub with real storage so the sticky model written // by the first request is visible to the second. 
const store = new Map(); @@ -344,19 +344,24 @@ describe('auto routing worker', () => { ...mockClassifierResult, classification: { ...mockClassification, - reasoningComplexity: 'high', - contextComplexity: 'large', - executionMode: 'multi_step_project', + taskType: 'planning_design', + subtaskType: 'system_design', }, }); const first = await decideRequest(mirrorPayload()); expect(first.status).toBe(200); await expect(first.json()).resolves.toMatchObject({ - decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'high', sticky: false }, + decision: { + model: 'anthropic/claude-sonnet-4.6', + taskType: 'planning_design', + subtaskType: 'system_design', + sticky: false, + }, }); + store.set('sticky', { model: 'anthropic/claude-sonnet-4.6' }); - // The second turn (different prompt, same session) classifies as medium. - // The fresh medium pick is cheaper, but not by more than the switch-cost + // The second turn (different prompt, same session) classifies to a cheaper route. + // The fresh implementation pick is cheaper, but not by more than the switch-cost // factor, so the session keeps its incumbent. 
const second = await decideRequest( mirrorPayload({ @@ -365,7 +370,12 @@ describe('auto routing worker', () => { ); expect(second.status).toBe(200); await expect(second.json()).resolves.toMatchObject({ - decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'medium', sticky: true }, + decision: { + model: 'anthropic/claude-sonnet-4.6', + taskType: 'implementation', + subtaskType: 'feature_development', + sticky: true, + }, }); }); diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts index be60e909ab..9b73d29235 100644 --- a/services/auto-routing/src/routing-table.test.ts +++ b/services/auto-routing/src/routing-table.test.ts @@ -8,8 +8,8 @@ const SAMPLE_TABLE: RoutingTable = { minAccuracy: 0.7, switchCostFactor: 3, source: 'benchmark', - tiers: { - low: [ + routes: { + 'implementation/feature_development': [ { model: 'google/gemini-2.5-flash-lite', accuracy: 0.9, @@ -18,7 +18,7 @@ const SAMPLE_TABLE: RoutingTable = { reasoningEffort: null, }, ], - medium: [ + 'debugging/bug_fixing': [ { model: 'google/gemini-2.5-flash', accuracy: 0.85, @@ -27,7 +27,7 @@ const SAMPLE_TABLE: RoutingTable = { reasoningEffort: null, }, ], - high: [ + 'planning_design/system_design': [ { model: 'anthropic/claude-sonnet-4.6', accuracy: 0.8,