diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
index 11a8a6a0e3..8bc45f6019 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
@@ -63,6 +63,7 @@ describe('configToFormState', () => {
expect(state.classifierMaxP95LatencyMs).toBe('1000');
expect(state.classifierModels).toBe('');
expect(state.deciderModels).toEqual([]);
+ expect(state.maxConcurrency).toBe(100);
});
});
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 9bdfac18ba..312e44e602 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -126,7 +126,7 @@ export function configToFormState(config: BenchmarkConfig | null): {
deciderModels: [],
minAccuracy: 0.7,
switchCostFactor: 3,
- maxConcurrency: 4,
+ maxConcurrency: 100,
benchmarkUserId: '',
classifierRepetitions: 1,
deciderRepetitions: 1,
@@ -407,13 +407,13 @@ function BenchmarkConfigEditor({
@@ -539,17 +539,13 @@ function BenchmarkConfigEditor({
// Run summaries expandable table
// ---------------------------------------------------------------------------
-const TIER_ORDER = { low: 0, medium: 1, high: 2, '*': 3 } as const;
-
function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
const isDecider = run.kind === 'decider';
const sortedSummaries: BenchmarkModelSummary[] = isDecider
? [...run.summaries].sort((a, b) => {
- const tierDiff =
- (TIER_ORDER[a.tier as keyof typeof TIER_ORDER] ?? 3) -
- (TIER_ORDER[b.tier as keyof typeof TIER_ORDER] ?? 3);
- if (tierDiff !== 0) return tierDiff;
+ const routeDiff = a.routeKey.localeCompare(b.routeKey);
+ if (routeDiff !== 0) return routeDiff;
return b.accuracy - a.accuracy;
})
: run.summaries;
@@ -571,7 +567,7 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
Model
- {isDecider ? Tier : null}
+ {isDecider ? Route : null}
Accuracy
Avg cost
Avg latency
@@ -584,10 +580,10 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
{sortedSummaries.map((s, i) => (
-
+
{s.model}
{isDecider ? (
- {s.tier}
+ {s.routeKey}
) : null}
{formatAccuracy(s.accuracy)}
@@ -717,11 +713,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
}
const { table } = data;
- const tierEntries = [
- { tier: 'low', candidates: table.tiers.low },
- { tier: 'medium', candidates: table.tiers.medium },
- { tier: 'high', candidates: table.tiers.high },
- ] as const;
+ const routeEntries = Object.entries(table.routes).sort(([a], [b]) => a.localeCompare(b));
return (
@@ -736,9 +728,9 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
- {tierEntries.map(({ tier, candidates }) => (
-
-
{tier} tier
+ {routeEntries.map(([routeKey, candidates]) => (
+
+
{routeKey}
@@ -751,7 +743,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
{candidates.map((c, i) => (
-
+
{c.model}
{formatAccuracy(c.accuracy)}
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
index bb7b22ded3..a82a0e8cfe 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.test.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
@@ -447,7 +447,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
- tier: 'low',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
@@ -481,7 +482,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
- tier: 'low',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
source: 'benchmark' as const,
tableVersion: 'v1',
sticky: false,
@@ -510,7 +512,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
- tier: 'low',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
@@ -560,7 +563,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
- tier: 'low',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
index f241c5f222..15235c3730 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -25,7 +25,8 @@ const zeroBalancePromise = Promise.resolve(0);
const sampleDecision: AutoRoutingDecision = {
model: 'anthropic/claude-haiku-4',
- tier: 'low',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
index 70d8e7e0c6..52daf63cc8 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -47,7 +47,8 @@ const options = {
const validDecision = {
model: 'anthropic/claude-haiku-4',
- tier: 'low' as const,
+ taskType: 'implementation' as const,
+ subtaskType: 'feature_development' as const,
source: 'benchmark' as const,
tableVersion: 'v1',
sticky: false,
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 8409b7f743..a696ac3063 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -1,9 +1,10 @@
import * as z from 'zod';
import { RoutingTableSchema } from './routing-table';
-import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import { TaxonomyRouteKeySchema } from './taxonomy';
-export { ReasoningEffortSchema } from './tiers';
-export type { ReasoningEffort } from './tiers';
+export { ReasoningEffortSchema } from './reasoning';
+export type { ReasoningEffort } from './reasoning';
export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
@@ -39,15 +40,16 @@ export const BenchmarkConfigSchema = z
.object({
classifierModels: z.array(z.string().trim().min(1)).min(1),
deciderModels: z.array(BenchmarkDeciderModelSchema).min(1),
- // Accuracy threshold for "gets the job done" (per tier).
+ // Accuracy threshold for "gets the job done" (per taxonomy route).
minAccuracy: z.number().min(0).max(1),
- // Parallel OpenRouter calls per queue message.
- maxConcurrency: z.number().int().min(1).max(16),
+ // Benchmark-wide parallelism budget. Decider runs use it as a live
+ // container budget; classifier runs use it for parallel OpenRouter calls.
+ maxConcurrency: z.number().int().min(1).max(100),
// The Kilo user whose identity/billing the decider CLI runs execute under.
// Null until an admin configures it; decider runs fail fast while null.
benchmarkUserId: z.string().trim().min(1).nullable(),
// Session stickiness knob carried into published routing tables: a session
- // stays on its incumbent model while it meets the tier's accuracy
+ // stays on its incumbent model while it meets the route's accuracy
// threshold, unless the fresh pick is cheaper by more than this factor.
// Model switches discard provider prompt caches (cache reads are far
// cheaper than fresh input tokens), so switching only pays off when the
@@ -79,8 +81,8 @@ export type BenchmarkRunStatus = z.infer;
export const BenchmarkModelSummarySchema = z.object({
model: z.string(),
- // '*' for classifier runs (no tiering), otherwise the difficulty tier.
- tier: z.union([DifficultyTierSchema, z.literal('*')]),
+ // '*' for classifier runs, otherwise "<taskType>/<subtaskType>".
+ routeKey: z.union([TaxonomyRouteKeySchema, z.literal('*')]),
accuracy: z.number(),
avgCostUsd: z.number().nullable(),
avgLatencyMs: z.number(),
diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts
index 0c826251dc..963875f812 100644
--- a/packages/auto-routing-contracts/src/contracts.test.ts
+++ b/packages/auto-routing-contracts/src/contracts.test.ts
@@ -147,6 +147,20 @@ describe('BenchmarkConfigSchema defaults', () => {
expect(result.deciderRepetitions).toBe(1);
expect(result.classifierMaxP95LatencyMs).toBe(1000);
});
+
+ it('accepts the benchmark maximum concurrency cap of 100', () => {
+ const result = BenchmarkConfigSchema.safeParse({
+ classifierModels: ['model/a'],
+ deciderModels: [{ id: 'model/b' }],
+ minAccuracy: 0.8,
+ maxConcurrency: 100,
+ benchmarkUserId: null,
+ switchCostFactor: 2,
+ updatedAt: null,
+ updatedBy: null,
+ });
+ expect(result.success).toBe(true);
+ });
});
describe('BenchmarkConfigSchema duplicate model ids', () => {
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index 31915439ec..aeb55bf7b2 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -1,6 +1,12 @@
import * as z from 'zod';
import { NormalizedClassifierInputSchema } from './input';
-import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import {
+ ClassifierSubtaskTypeSchema,
+ ClassifierTaskTypeSchema,
+ SUBTYPES_BY_TASK_TYPE,
+ type ClassifierSubtaskType,
+} from './taxonomy';
export {
NormalizedClassifierInputSchema,
@@ -29,47 +35,6 @@ export const MirrorPayloadSchema = z.object({
});
export type MirrorPayload = z.infer<typeof MirrorPayloadSchema>;
-export const ClassifierTaskTypeSchema = z.enum([
- 'implementation',
- 'debugging',
- 'refactoring',
- 'planning_design',
- 'investigation',
- 'agentic_execution',
-]);
-export type ClassifierTaskType = z.infer<typeof ClassifierTaskTypeSchema>;
-
-export const ClassifierSubtaskTypeSchema = z.enum([
- 'feature_development',
- 'code_generation',
- 'test_creation',
- 'bug_fixing',
- 'test_repair',
- 'root_cause_analysis',
- 'code_cleanup',
- 'architecture_improvement',
- 'migration',
- 'architecture_design',
- 'technical_planning',
- 'system_design',
- 'repo_exploration',
- 'codebase_understanding',
- 'external_research',
- 'tool_usage',
- 'terminal_operations',
- 'multi_step_execution',
-]);
-export type ClassifierSubtaskType = z.infer<typeof ClassifierSubtaskTypeSchema>;
-
-const subtypesByTaskType: Record = {
- implementation: ['feature_development', 'code_generation', 'test_creation'],
- debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'],
- refactoring: ['code_cleanup', 'architecture_improvement', 'migration'],
- planning_design: ['architecture_design', 'technical_planning', 'system_design'],
- investigation: ['repo_exploration', 'codebase_understanding', 'external_research'],
- agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'],
-};
-
export const ClassifierOutputSchema = z
.strictObject({
taskType: ClassifierTaskTypeSchema,
@@ -87,7 +52,10 @@ export const ClassifierOutputSchema = z
confidence: z.number().min(0).max(1),
})
.superRefine((output, ctx) => {
- if (!subtypesByTaskType[output.taskType].includes(output.subtaskType)) {
+ const allowedSubtypes = SUBTYPES_BY_TASK_TYPE[
+ output.taskType
+ ] as readonly ClassifierSubtaskType[];
+ if (!allowedSubtypes.includes(output.subtaskType)) {
ctx.addIssue({
code: 'custom',
path: ['subtaskType'],
@@ -99,7 +67,8 @@ export type ClassifierOutput = z.infer;
export const AutoRoutingDecisionSchema = z.object({
model: z.string(),
- tier: DifficultyTierSchema,
+ taskType: ClassifierTaskTypeSchema,
+ subtaskType: ClassifierSubtaskTypeSchema,
source: z.enum(['benchmark']),
tableVersion: z.string(),
// Mirrors the effort the chosen model was benchmarked with, when set.
@@ -180,6 +149,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer<
export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize';
-export * from './tiers';
+export * from './reasoning';
+export * from './taxonomy';
export * from './routing-table';
export * from './benchmark';
diff --git a/packages/auto-routing-contracts/src/reasoning.ts b/packages/auto-routing-contracts/src/reasoning.ts
new file mode 100644
index 0000000000..a989853d1c
--- /dev/null
+++ b/packages/auto-routing-contracts/src/reasoning.ts
@@ -0,0 +1,4 @@
+import * as z from 'zod';
+
+export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
+export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts
index edcd573b44..a4830ce117 100644
--- a/packages/auto-routing-contracts/src/routing-table.test.ts
+++ b/packages/auto-routing-contracts/src/routing-table.test.ts
@@ -9,12 +9,16 @@ const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({
});
describe('rankCandidates', () => {
- it('puts the cheapest above-threshold candidate first', () => {
+ it('puts the lowest cost-per-accuracy above-threshold candidate first', () => {
const ranked = rankCandidates(
- [candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)],
+ [
+ candidate('lower-raw-cost', 0.7, 0.007),
+ candidate('better-value', 0.9, 0.008),
+ candidate('weak', 0.5, 0.001),
+ ],
0.7
);
- expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']);
+ expect(ranked.map(c => c.model)).toEqual(['better-value', 'lower-raw-cost', 'weak']);
expect(ranked[0].meetsThreshold).toBe(true);
expect(ranked[2].meetsThreshold).toBe(false);
});
@@ -29,15 +33,35 @@ describe('rankCandidates', () => {
});
describe('RoutingTableSchema', () => {
- it('requires at least one candidate per tier', () => {
+ it('requires at least one candidate per taxonomy route', () => {
expect(
RoutingTableSchema.safeParse({
version: 'v',
generatedAt: new Date(0).toISOString(),
minAccuracy: 0.7,
+ switchCostFactor: 3,
source: 'benchmark',
- tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] },
+ routes: {
+ 'implementation/code_generation': [],
+ 'debugging/bug_fixing': [candidate('m', 1, 1)],
+ },
}).success
).toBe(false);
});
+
+ it('accepts a table routed by classifier taxonomy pair', () => {
+ const parsed = RoutingTableSchema.parse({
+ version: 'v',
+ generatedAt: new Date(0).toISOString(),
+ minAccuracy: 0.7,
+ switchCostFactor: 3,
+ source: 'benchmark',
+ routes: {
+ 'implementation/code_generation': [candidate('impl', 0.9, 1)],
+ 'debugging/bug_fixing': [candidate('debug', 0.9, 1)],
+ },
+ });
+
+ expect(parsed.routes['implementation/code_generation']?.[0]?.model).toBe('impl');
+ });
});
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index ff49e81578..0a1db0c0a5 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -1,9 +1,10 @@
import * as z from 'zod';
-import { ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import { TaxonomyRouteKeySchema } from './taxonomy';
export const RankedCandidateSchema = z.object({
model: z.string().trim().min(1),
- // Benchmark accuracy in [0, 1] for this tier.
+ // Benchmark accuracy in [0, 1] for this taxonomy route.
accuracy: z.number().min(0).max(1),
// Average observed OpenRouter cost per benchmark case, in USD credits.
avgCostUsd: z.number().nonnegative(),
@@ -23,19 +24,25 @@ export const RoutingTableSchema = z.object({
// more than this factor (see BenchmarkConfigSchema.switchCostFactor).
switchCostFactor: z.number().min(1),
source: z.enum(['benchmark']),
- tiers: z.object({
- low: z.array(RankedCandidateSchema).min(1),
- medium: z.array(RankedCandidateSchema).min(1),
- high: z.array(RankedCandidateSchema).min(1),
+ routes: z.record(z.string(), z.array(RankedCandidateSchema).min(1)).superRefine((routes, ctx) => {
+ for (const key of Object.keys(routes)) {
+ if (!TaxonomyRouteKeySchema.safeParse(key).success) {
+ ctx.addIssue({
+ code: 'custom',
+ path: [key],
+ message: `Unknown taxonomy route ${key}`,
+ });
+ }
+ }
}),
});
export type RoutingTable = z.infer<typeof RoutingTableSchema>;
export const ROUTING_TABLE_KV_KEY = 'routing_table_v1';
-// "Best bang for buck": candidates meeting the accuracy threshold come
-// first, cheapest first (accuracy breaks ties); below-threshold candidates
-// follow ordered by accuracy so a degenerate table still routes sensibly.
+// "Best bang for buck": candidates meeting the accuracy threshold come first,
+// lowest cost per unit of accuracy first; below-threshold candidates follow
+// ordered by accuracy so a degenerate table still routes sensibly.
export function rankCandidates(
candidates: ReadonlyArray & { meetsThreshold?: boolean }>,
minAccuracy: number
@@ -44,7 +51,7 @@ export function rankCandidates(
return flagged.toSorted((a, b) => {
if (a.meetsThreshold !== b.meetsThreshold) return a.meetsThreshold ? -1 : 1;
if (a.meetsThreshold) {
- return a.avgCostUsd - b.avgCostUsd || b.accuracy - a.accuracy;
+ return a.avgCostUsd / a.accuracy - b.avgCostUsd / b.accuracy || b.accuracy - a.accuracy;
}
return b.accuracy - a.accuracy || a.avgCostUsd - b.avgCostUsd;
});
diff --git a/packages/auto-routing-contracts/src/taxonomy.ts b/packages/auto-routing-contracts/src/taxonomy.ts
new file mode 100644
index 0000000000..bb5fa70c62
--- /dev/null
+++ b/packages/auto-routing-contracts/src/taxonomy.ts
@@ -0,0 +1,77 @@
+import * as z from 'zod';
+
+export const CLASSIFIER_TASK_TYPES = [ // Top-level task categories; the value list behind ClassifierTaskTypeSchema.
+ 'implementation',
+ 'debugging',
+ 'refactoring',
+ 'planning_design',
+ 'investigation',
+ 'agentic_execution',
+] as const;
+
+export const CLASSIFIER_SUBTASK_TYPES = [ // Every subtask type, listed in the same grouping order as SUBTYPES_BY_TASK_TYPE; the value list behind ClassifierSubtaskTypeSchema.
+ 'feature_development',
+ 'code_generation',
+ 'test_creation',
+ 'bug_fixing',
+ 'test_repair',
+ 'root_cause_analysis',
+ 'code_cleanup',
+ 'architecture_improvement',
+ 'migration',
+ 'architecture_design',
+ 'technical_planning',
+ 'system_design',
+ 'repo_exploration',
+ 'codebase_understanding',
+ 'external_research',
+ 'tool_usage',
+ 'terminal_operations',
+ 'multi_step_execution',
+] as const;
+
+export const SUBTYPES_BY_TASK_TYPE = { // Subtask types that are valid under each task type (three per task type).
+ implementation: ['feature_development', 'code_generation', 'test_creation'],
+ debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'],
+ refactoring: ['code_cleanup', 'architecture_improvement', 'migration'],
+ planning_design: ['architecture_design', 'technical_planning', 'system_design'],
+ investigation: ['repo_exploration', 'codebase_understanding', 'external_research'],
+ agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'],
+} as const;
+
+export const TAXONOMY_ROUTE_KEYS = [ // One "taskType/subtaskType" key per pair in SUBTYPES_BY_TASK_TYPE, written out by hand: keep the two lists in sync.
+ 'implementation/feature_development',
+ 'implementation/code_generation',
+ 'implementation/test_creation',
+ 'debugging/bug_fixing',
+ 'debugging/test_repair',
+ 'debugging/root_cause_analysis',
+ 'refactoring/code_cleanup',
+ 'refactoring/architecture_improvement',
+ 'refactoring/migration',
+ 'planning_design/architecture_design',
+ 'planning_design/technical_planning',
+ 'planning_design/system_design',
+ 'investigation/repo_exploration',
+ 'investigation/codebase_understanding',
+ 'investigation/external_research',
+ 'agentic_execution/tool_usage',
+ 'agentic_execution/terminal_operations',
+ 'agentic_execution/multi_step_execution',
+] as const;
+
+export const ClassifierTaskTypeSchema = z.enum(CLASSIFIER_TASK_TYPES);
+export type ClassifierTaskType = z.infer<typeof ClassifierTaskTypeSchema>;
+
+export const ClassifierSubtaskTypeSchema = z.enum(CLASSIFIER_SUBTASK_TYPES);
+export type ClassifierSubtaskType = z.infer<typeof ClassifierSubtaskTypeSchema>;
+
+export const TaxonomyRouteKeySchema = z.enum(TAXONOMY_ROUTE_KEYS);
+export type TaxonomyRouteKey = z.infer<typeof TaxonomyRouteKeySchema>;
+
+export function taxonomyRouteKey(params: { // Builds the "taskType/subtaskType" route key for a classified task.
+ taskType: ClassifierTaskType;
+ subtaskType: ClassifierSubtaskType;
+}): TaxonomyRouteKey {
+ return `${params.taskType}/${params.subtaskType}` as TaxonomyRouteKey; // Unchecked cast: nothing here verifies that the pair is one of TAXONOMY_ROUTE_KEYS.
+}
diff --git a/packages/auto-routing-contracts/src/tiers.test.ts b/packages/auto-routing-contracts/src/tiers.test.ts
deleted file mode 100644
index 5d62f7259f..0000000000
--- a/packages/auto-routing-contracts/src/tiers.test.ts
+++ /dev/null
@@ -1,79 +0,0 @@
-import { describe, expect, it } from 'vitest';
-import { deriveDifficultyTier } from './tiers';
-import type { ClassifierOutput } from './index';
-
-function classification(overrides: Partial<ClassifierOutput>): ClassifierOutput {
- return {
- taskType: 'implementation',
- subtaskType: 'code_generation',
- contextComplexity: 'small',
- reasoningComplexity: 'low',
- riskLevel: 'low',
- executionMode: 'answer_only',
- requiresTools: false,
- confidence: 0.9,
- ...overrides,
- };
-}
-
-describe('deriveDifficultyTier', () => {
- it('classifies trivial answer-only requests as low', () => {
- expect(deriveDifficultyTier(classification({}))).toBe('low');
- });
- it('classifies mid-size code changes as medium', () => {
- expect(
- deriveDifficultyTier(
- classification({
- contextComplexity: 'medium',
- reasoningComplexity: 'medium',
- executionMode: 'code_change',
- })
- )
- ).toBe('medium');
- });
- it('classifies high-reasoning multi-step work as high', () => {
- expect(
- deriveDifficultyTier(
- classification({
- contextComplexity: 'large',
- reasoningComplexity: 'high',
- executionMode: 'multi_step_project',
- riskLevel: 'high',
- })
- )
- ).toBe('high');
- });
- it('high risk tips an otherwise-low request to medium', () => {
- expect(
- deriveDifficultyTier(
- classification({ executionMode: 'multi_step_project', riskLevel: 'high' })
- )
- ).toBe('medium');
- });
- it('high risk tips an otherwise-medium request to high', () => {
- expect(
- deriveDifficultyTier(
- classification({
- reasoningComplexity: 'medium',
- contextComplexity: 'large',
- executionMode: 'code_change',
- riskLevel: 'high',
- })
- )
- ).toBe('high');
- });
- it('is monotonic: bumping reasoning complexity never lowers the tier', () => {
- const tiers = ['low', 'medium', 'high'] as const;
- for (const ctx of ['small', 'medium', 'large'] as const) {
- let prev = 0;
- for (const reasoning of ['low', 'medium', 'high'] as const) {
- const tier = deriveDifficultyTier(
- classification({ contextComplexity: ctx, reasoningComplexity: reasoning })
- );
- const idx = tiers.indexOf(tier);
- expect(idx).toBeGreaterThanOrEqual(prev);
- prev = idx;
- }
- }
- });
-});
diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts
deleted file mode 100644
index 8358c5e3bf..0000000000
--- a/packages/auto-routing-contracts/src/tiers.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-import * as z from 'zod';
-
-export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']);
-
-export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
-export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
-export type DifficultyTier = z.infer<typeof DifficultyTierSchema>;
-
-export const DIFFICULTY_TIERS: readonly DifficultyTier[] = ['low', 'medium', 'high'];
-
-const REASONING_POINTS = { low: 0, medium: 2, high: 4 } as const;
-const CONTEXT_POINTS = { small: 0, medium: 1, large: 2 } as const;
-const EXECUTION_POINTS = {
- answer_only: 0,
- code_change: 1,
- command_execution: 1,
- multi_step_project: 2,
-} as const;
-const RISK_POINTS = { low: 0, medium: 0, high: 1 } as const;
-
-// Deterministic mapping from the classifier taxonomy to a difficulty tier.
-// Reasoning complexity dominates (weight 2x) because it is the strongest
-// signal for whether a cheap model can complete the task; context size,
-// execution mode and blast radius nudge borderline cases up.
-// Structural subset of ClassifierOutput: importing the full type from
-// ./index would create a module cycle (index re-exports this file).
-export type DifficultyTierSignal = {
- reasoningComplexity: 'low' | 'medium' | 'high';
- contextComplexity: 'small' | 'medium' | 'large';
- executionMode: 'answer_only' | 'code_change' | 'command_execution' | 'multi_step_project';
- riskLevel: 'low' | 'medium' | 'high';
-};
-
-export function deriveDifficultyTier(classification: DifficultyTierSignal): DifficultyTier {
- const score =
- REASONING_POINTS[classification.reasoningComplexity] +
- CONTEXT_POINTS[classification.contextComplexity] +
- EXECUTION_POINTS[classification.executionMode] +
- RISK_POINTS[classification.riskLevel];
- if (score <= 2) return 'low';
- if (score <= 5) return 'medium';
- return 'high';
-}
diff --git a/services/auto-routing-benchmark/README.md b/services/auto-routing-benchmark/README.md
index cd5a226bf6..6573a38ce4 100644
--- a/services/auto-routing-benchmark/README.md
+++ b/services/auto-routing-benchmark/README.md
@@ -12,9 +12,9 @@ design, invariants, and rollout/rollback.
OpenRouter using the exact production classifier code
(`@kilocode/auto-routing-contracts/classifier`), grades per-field, and derives
the cheapest above-threshold model as the classifier winner.
-- **Decider benchmark** — runs 76 golden tasks per candidate through the real
+- **Decider benchmark** — runs 180 golden tasks per candidate through the real
`kilo` CLI inside a Cloudflare Container, grades mechanically, and publishes a
- per-difficulty-tier routing table.
+ per-taxonomy-route routing table.
- Normalized results live in D1 (`BENCH_DB`); published artifacts are cached in
the shared `AUTO_ROUTING_CONFIG` KV namespace (publish = delete the keys so the
next read repopulates from D1).
@@ -92,10 +92,12 @@ sqlite3 /tmp/.sqlite 'select id, kind, status from benchmark_runs;'
## Debugging container (decider) failures
-- Each (model, 10-case chunk) gets its own container instance
- (`runId:model:chunk`); CLI runs are serialized per instance (its sqlite state
- is not safe under concurrent first runs). A `/warmup` call absorbs the one-time
- sqlite migration before the case loop.
+- Each decider run seeds bounded shard lanes across the configured models and
+ repetitions. A lane uses one stable container instance
+ (`runId:model:rep:shard`) and processes chunk `N`, then `N+shardCount`, and
+ so on. CLI runs are serialized per instance because its sqlite state is not
+ safe under concurrent first runs. A `/warmup` call absorbs the one-time sqlite
+ migration before the case loop.
- `case_results` rows carry diagnostics: CLI exit code, output prefix, and an
event tail — start there for a failing case.
- `POST /admin/debug-cli {model, prompt}` runs one prompt through the container
@@ -109,16 +111,16 @@ sqlite3 /tmp/.sqlite 'select id, kind, status from benchmark_runs;'
## Debugging the DLQ
Failed queue messages land in `auto-routing-benchmark-dlq` after `max_retries`
-(2) on `auto-routing-benchmark-jobs`. A message is one (model, chunk) job, so a
-DLQ'd message means that chunk never produced results; its model's summaries for
-the affected tier(s) will be missing or incomplete and `finalizeRunIfComplete`
-will mark the run accordingly.
+(6) on `auto-routing-benchmark-jobs`. A decider message is one
+(model, repetition, shard, chunk) job, so a DLQ'd message means that chunk never
+produced results; its model's summaries for the affected route(s) will be
+missing or incomplete and `finalizeRunIfComplete` will mark the run accordingly.
To inspect / handle:
- **Prod**: read the DLQ from the Cloudflare dashboard (Workers → Queues →
`auto-routing-benchmark-dlq`) or `wrangler queues` tooling; the message body is
- the JSON job (`runId`, `model`, `chunk`, case ids).
+ the JSON job (`runId`, `model`, `rep`, `shard`, `shardCount`, `chunk`, case ids).
- **Replay**: re-run the affected model with the admin `force` toggle once the
underlying cause (OpenRouter outage, container image, bad case) is fixed —
carried summaries mean only the re-triggered model is re-benchmarked.
diff --git a/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
index 3db1df3b2b..d2d038e3f1 100644
--- a/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
+++ b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
@@ -32,7 +32,7 @@ CREATE TABLE `case_results` (
`run_id` text NOT NULL,
`model` text NOT NULL,
`case_id` text NOT NULL,
- `tier` text,
+ `route_key` text,
`score` real NOT NULL,
`latency_ms` integer NOT NULL,
`cost_usd` real,
@@ -60,7 +60,7 @@ CREATE TABLE `config_decider_models` (
CREATE TABLE `model_summaries` (
`run_id` text NOT NULL,
`model` text NOT NULL,
- `tier` text NOT NULL,
+ `route_key` text NOT NULL,
`accuracy` real NOT NULL,
`avg_cost_usd` real,
`avg_latency_ms` real NOT NULL,
@@ -70,19 +70,19 @@ CREATE TABLE `model_summaries` (
`p95_latency_ms` real,
`timeouts` integer DEFAULT 0 NOT NULL,
`carried` integer DEFAULT false NOT NULL,
- PRIMARY KEY(`run_id`, `model`, `tier`)
+ PRIMARY KEY(`run_id`, `model`, `route_key`)
);
--> statement-breakpoint
CREATE TABLE `routing_table_candidates` (
`run_id` text NOT NULL,
- `tier` text NOT NULL,
+ `route_key` text NOT NULL,
`rank` integer NOT NULL,
`model` text NOT NULL,
`accuracy` real NOT NULL,
`avg_cost_usd` real NOT NULL,
`meets_threshold` integer NOT NULL,
`reasoning_effort` text,
- PRIMARY KEY(`run_id`, `tier`, `rank`)
+ PRIMARY KEY(`run_id`, `route_key`, `rank`)
);
--> statement-breakpoint
CREATE TABLE `routing_tables` (
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
index 35ce39e53e..b5614567dc 100644
--- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -1,7 +1,7 @@
{
"version": "6",
"dialect": "sqlite",
- "id": "ba559fc8-fdd3-4c96-b116-53573fb79c74",
+ "id": "fa33fcda-13d6-4952-84d7-0ad12cd02fea",
"prevId": "00000000-0000-0000-0000-000000000000",
"tables": {
"benchmark_config": {
@@ -222,8 +222,8 @@
"notNull": true,
"autoincrement": false
},
- "tier": {
- "name": "tier",
+ "route_key": {
+ "name": "route_key",
"type": "text",
"primaryKey": false,
"notNull": false,
@@ -390,8 +390,8 @@
"notNull": true,
"autoincrement": false
},
- "tier": {
- "name": "tier",
+ "route_key": {
+ "name": "route_key",
"type": "text",
"primaryKey": false,
"notNull": true,
@@ -466,13 +466,13 @@
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {
- "model_summaries_run_id_model_tier_pk": {
+ "model_summaries_run_id_model_route_key_pk": {
"columns": [
"run_id",
"model",
- "tier"
+ "route_key"
],
- "name": "model_summaries_run_id_model_tier_pk"
+ "name": "model_summaries_run_id_model_route_key_pk"
}
},
"uniqueConstraints": {},
@@ -488,8 +488,8 @@
"notNull": true,
"autoincrement": false
},
- "tier": {
- "name": "tier",
+ "route_key": {
+ "name": "route_key",
"type": "text",
"primaryKey": false,
"notNull": true,
@@ -541,13 +541,13 @@
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {
- "routing_table_candidates_run_id_tier_rank_pk": {
+ "routing_table_candidates_run_id_route_key_rank_pk": {
"columns": [
"run_id",
- "tier",
+ "route_key",
"rank"
],
- "name": "routing_table_candidates_run_id_tier_rank_pk"
+ "name": "routing_table_candidates_run_id_route_key_rank_pk"
}
},
"uniqueConstraints": {},
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index 7ee67d2c06..aa20472e95 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -5,7 +5,7 @@
{
"idx": 0,
"version": "6",
- "when": 1781523205381,
+ "when": 1781688875647,
"tag": "0000_absent_wallow",
"breakpoints": true
}
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 77391db7b2..8bd7a4ba15 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -12,7 +12,7 @@ import { CLASSIFIER_CASES } from './datasets/classifier-cases';
function makeSummary(model: string): BenchmarkModelSummary {
return {
model,
- tier: 'low',
+ routeKey: 'implementation/code_generation',
accuracy: 0.9,
avgCostUsd: 0.001,
avgLatencyMs: 100,
@@ -32,7 +32,7 @@ const TEST_CONFIG: BenchmarkConfig = {
],
minAccuracy: 0.7,
switchCostFactor: 3,
- maxConcurrency: 4,
+ maxConcurrency: 100,
benchmarkUserId: null,
classifierRepetitions: 1,
deciderRepetitions: 1,
@@ -471,9 +471,10 @@ describe('POST /admin/runs', () => {
expect(body.enqueuedModels).toBe(1);
});
- it('slices a >100-message decider fan-out into sendBatch-sized batches', async () => {
- // 7 decider models × 1 rep × ceil(76/5)=16 chunks = 112 messages, which
- // exceeds Cloudflare Queues' 100-per-sendBatch cap and must be sliced.
+ it('seeds sharded decider lanes bounded by the container cap', async () => {
+ // Later chunks are chained by processJob within each shard lane. Starting a
+ // run seeds as many lanes as fit under the 100-container cap so the benchmark
+ // runs much faster without creating one live container per chunk.
const manyModels = Array.from({ length: 7 }, (_, i) => ({
id: `vendor/model-${i}`,
reasoningEffort: null,
@@ -487,11 +488,74 @@ describe('POST /admin/runs', () => {
const res = await authedPost('/admin/runs', { kind: 'decider' });
expect(res.status).toBe(200);
- // 112 messages → two batches (100 + 12), neither over the limit.
- expect(queueSendBatch).toHaveBeenCalledTimes(2);
+ expect(queueSendBatch).toHaveBeenCalledTimes(1);
const batchSizes = queueSendBatch.mock.calls.map(([batch]) => (batch as unknown[]).length);
- expect(batchSizes).toEqual([100, 12]);
+ expect(batchSizes).toEqual([98]);
for (const size of batchSizes) expect(size).toBeLessThanOrEqual(100);
+ const queuedMessages = queueSendBatch.mock.calls.flatMap(([batch]) => batch as unknown[]);
+ for (const message of queuedMessages) {
+ expect(message).toMatchObject({
+ body: {
+ kind: 'decider',
+ shardCount: 14,
+ },
+ });
+ }
+ });
+
+ it('keeps 10 decider models with 3 repetitions under the 100-container cap', async () => {
+ const manyModels = Array.from({ length: 10 }, (_, i) => ({
+ id: `vendor/model-${i}`,
+ reasoningEffort: null,
+ }));
+ vi.mocked(getConfigRows).mockResolvedValue({
+ ...TEST_CONFIG_ROWS,
+ config: {
+ ...TEST_CONFIG_ROWS.config,
+ benchmark_user_id: 'user-123',
+ decider_repetitions: 3,
+ },
+ deciderModels: manyModels.map(m => ({ model: m.id, reasoning_effort: null })),
+ });
+
+ const res = await authedPost('/admin/runs', { kind: 'decider' });
+ expect(res.status).toBe(200);
+
+ expect(queueSendBatch).toHaveBeenCalledTimes(1);
+ const queuedMessages = queueSendBatch.mock.calls.flatMap(([batch]) => batch as unknown[]);
+ expect(queuedMessages).toHaveLength(90);
+ for (const message of queuedMessages) {
+ expect(message).toMatchObject({
+ body: {
+ kind: 'decider',
+ shardCount: 3,
+ },
+ });
+ }
+ });
+
+ it('rejects decider starts when model repetitions alone exceed the container cap', async () => {
+ const tooManyModels = Array.from({ length: 21 }, (_, i) => ({
+ id: `vendor/model-${i}`,
+ reasoningEffort: null,
+ }));
+ vi.mocked(getConfigRows).mockResolvedValue({
+ ...TEST_CONFIG_ROWS,
+ config: {
+ ...TEST_CONFIG_ROWS.config,
+ benchmark_user_id: 'user-123',
+ decider_repetitions: 5,
+ },
+ deciderModels: tooManyModels.map(m => ({ model: m.id, reasoning_effort: null })),
+ });
+
+ const res = await authedPost('/admin/runs', { kind: 'decider' });
+ expect(res.status).toBe(400);
+ await expect(res.json()).resolves.toMatchObject({
+ error: expect.stringContaining('requires at least one live container lane'),
+ });
+ expect(insertRun).not.toHaveBeenCalled();
+ expect(queueSendBatch).not.toHaveBeenCalled();
});
});
@@ -519,7 +583,7 @@ describe('GET /admin/routing-table', () => {
minAccuracy: 0.7,
switchCostFactor: 3,
source: 'benchmark',
- tiers: { low: [candidate], medium: [candidate], high: [candidate] },
+ routes: { 'implementation/code_generation': [candidate] },
};
vi.mocked(getLatestRoutingTable).mockResolvedValueOnce({
table: tableData as RoutingTable,
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index 0b95cd3a94..e266eea567 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -8,7 +8,13 @@ import { zodJsonValidator } from '@kilocode/worker-utils';
import type { Hono } from 'hono';
import { getBenchmarkConfig, saveBenchmarkConfig } from './config';
import { debugRunCli } from './cli-runner';
-import { fetchBenchmarkUserToken, RunAlreadyActiveError, startRun, sweepStaleRuns } from './run';
+import {
+ BenchmarkRunConfigError,
+ fetchBenchmarkUserToken,
+ RunAlreadyActiveError,
+ startRun,
+ sweepStaleRuns,
+} from './run';
import { getClassifierWinner, getLatestRoutingTable, listRuns } from './db';
import type { HonoEnv } from './hono-env';
@@ -59,6 +65,9 @@ export function registerAdminRoutes(app: Hono): void {
if (error instanceof RunAlreadyActiveError) {
return c.json({ error: error.message }, 409);
}
+ if (error instanceof BenchmarkRunConfigError) {
+ return c.json({ error: error.message }, 400);
+ }
throw error;
}
}
diff --git a/services/auto-routing-benchmark/src/bench-runner-container.ts b/services/auto-routing-benchmark/src/bench-runner-container.ts
index a3c712c4c7..105e36ce52 100644
--- a/services/auto-routing-benchmark/src/bench-runner-container.ts
+++ b/services/auto-routing-benchmark/src/bench-runner-container.ts
@@ -3,7 +3,8 @@ import { Container } from '@cloudflare/containers';
// Cloudflare Container that runs the stable `kilo` CLI for decider benchmark
// cases. The worker proxies POST /run to the container's HTTP server (see
// container/server.mjs) via this DO. One instance is keyed per
-// (runId, model, chunk) so concurrent chunks/models don't share state.
+// (runId, model, rep) so chunks for the same repetition reuse CLI state without
+// creating one live container per chunk.
export class BenchRunnerContainer extends Container {
defaultPort = 3000;
sleepAfter = '2m';
@@ -11,4 +12,13 @@ export class BenchRunnerContainer extends Container {
// points at the real gateway; local dev overrides it via .dev.vars so the
// benchmark runs against the local apps/web instance.
envVars = { KILO_API_URL: this.env.KILO_CLI_API_URL };
+
+ override async fetch(request: Request): Promise<Response> {
+ const url = new URL(request.url);
+ if (request.method === 'POST' && url.pathname === '/admin/destroy') {
+ await this.destroy();
+ return new Response('destroyed');
+ }
+ return super.fetch(request);
+ }
}
diff --git a/services/auto-routing-benchmark/src/cli-runner.test.ts b/services/auto-routing-benchmark/src/cli-runner.test.ts
new file mode 100644
index 0000000000..c8966203e2
--- /dev/null
+++ b/services/auto-routing-benchmark/src/cli-runner.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it, vi } from 'vitest';
+import { destroyDeciderCliContainer } from './cli-runner';
+
+describe('destroyDeciderCliContainer', () => {
+ it('calls the container admin destroy endpoint for the instance name', async () => {
+ const fetch = vi.fn(async () => new Response('destroyed', { status: 200 }));
+ const idFromName = vi.fn((name: string) => `id:${name}`);
+ const get = vi.fn(() => ({ fetch }));
+ const env = { BENCH_RUNNER: { idFromName, get } } as unknown as Env;
+
+ await destroyDeciderCliContainer(env, { instanceName: 'run:model:2' });
+
+ expect(idFromName).toHaveBeenCalledWith('run:model:2');
+ expect(get).toHaveBeenCalledWith('id:run:model:2');
+ expect(fetch).toHaveBeenCalledWith(
+ expect.objectContaining({
+ method: 'POST',
+ url: 'http://container/admin/destroy',
+ })
+ );
+ });
+
+ it('throws when the container destroy endpoint fails', async () => {
+ const fetch = vi.fn(async () => new Response('nope', { status: 500 }));
+ const env = {
+ BENCH_RUNNER: {
+ idFromName: (name: string) => `id:${name}`,
+ get: () => ({ fetch }),
+ },
+ } as unknown as Env;
+
+ await expect(destroyDeciderCliContainer(env, { instanceName: 'run:model:2' })).rejects.toThrow(
+ 'container /admin/destroy failed: HTTP 500 nope'
+ );
+ });
+});
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
index 9f22cb3695..de826b3a97 100644
--- a/services/auto-routing-benchmark/src/cli-runner.ts
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -20,6 +20,19 @@ const DECIDER_CLI_TIMEOUT_MS = 180_000;
const FINAL_ANSWER_SUFFIX =
'\n\nIMPORTANT: Your final message must contain ONLY the answer in the exact requested format - no explanations, no preamble, no extra words.';
+export function isRetryableContainerAvailabilityError(error: unknown): boolean {
+ const message = error instanceof Error ? error.message : String(error);
+ const normalized = message.toLowerCase();
+ return (
+ normalized.includes('container /run failed: http 503') ||
+ normalized.includes('container /warmup failed: http 503') ||
+ normalized.includes('no container instance available') ||
+ normalized.includes('no container instance that can be provided') ||
+ normalized.includes('max concurrent instance count') ||
+ normalized.includes('maximum number of running container instances exceeded')
+ );
+}
+
type ContainerRunResponse = {
exitCode: number;
durationMs: number;
@@ -31,10 +44,10 @@ type ContainerRunResponse = {
/**
* Run one decider case through the `kilo` CLI inside a Cloudflare Container.
*
- * `instanceName` is the precomputed DO instance name (e.g.
- * `${runId}:${model}:${chunk}`); the caller owns the keying so chunks/models
- * map to stable instances. The CLI has no system-prompt flag, so we fold the
- * system prompt into the user prompt.
+ * `instanceName` is the precomputed DO instance name; the caller owns the
+ * keying so chunks for the same model/repetition share a stable instance. The
+ * CLI has no system-prompt flag, so we fold the system prompt into the user
+ * prompt.
*/
export async function runDeciderCaseViaCli(
env: Env,
@@ -141,6 +154,23 @@ export async function warmUpCliContainer(
})
);
if (!response.ok) {
- throw new Error(`container /warmup failed: HTTP ${response.status}`);
+ const detail = (await response.text().catch(() => '')).slice(0, 500);
+ throw new Error(`container /warmup failed: HTTP ${response.status} ${detail}`);
+ }
+}
+
+export async function destroyDeciderCliContainer(
+ env: Env,
+ params: { instanceName: string }
+): Promise<void> {
+ const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(params.instanceName));
+ const response = await stub.fetch(
+ new Request('http://container/admin/destroy', {
+ method: 'POST',
+ })
+ );
+ if (!response.ok) {
+ const detail = (await response.text().catch(() => '')).slice(0, 500);
+ throw new Error(`container /admin/destroy failed: HTTP ${response.status} ${detail}`);
}
}
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
index 1fb02e8de4..10e8aade79 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
@@ -18,18 +18,18 @@ describe('DECIDER_CASES', () => {
expect(TAXONOMY_PAIRS.length).toBe(18);
});
- it('has exactly 76 cases with unique ids', () => {
- expect(DECIDER_CASES.length).toBe(76);
+ it('has exactly 180 cases with unique ids', () => {
+ expect(DECIDER_CASES.length).toBe(180);
const ids = new Set(DECIDER_CASES.map(c => c.id));
expect(ids.size).toBe(DECIDER_CASES.length);
});
- it('has at least 4 cases per (taskType, subtaskType) pair', () => {
+ it('has at least 10 cases per (taskType, subtaskType) pair', () => {
for (const pair of TAXONOMY_PAIRS) {
const count = DECIDER_CASES.filter(
c => c.taskType === pair.taskType && c.subtaskType === pair.subtaskType
).length;
- expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(4);
+ expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(10);
}
});
@@ -44,19 +44,6 @@ describe('DECIDER_CASES', () => {
}
});
- it('has at least 20 cases per tier', () => {
- for (const tier of ['low', 'medium', 'high'] as const) {
- expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBeGreaterThanOrEqual(20);
- }
- });
-
- it('covers at least 4 distinct task types per tier', () => {
- for (const tier of ['low', 'medium', 'high'] as const) {
- const taskTypes = new Set(DECIDER_CASES.filter(c => c.tier === tier).map(c => c.taskType));
- expect(taskTypes.size, tier).toBeGreaterThanOrEqual(4);
- }
- });
-
it('has compilable regex patterns', () => {
for (const c of DECIDER_CASES) {
const check = c.check;
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
index fcb82a223f..3760bc1624 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -1,13 +1,8 @@
-import type {
- ClassifierSubtaskType,
- ClassifierTaskType,
- DifficultyTier,
-} from '@kilocode/auto-routing-contracts';
+import type { ClassifierSubtaskType, ClassifierTaskType } from '@kilocode/auto-routing-contracts';
import type { DeciderCheck } from '../grading';
export type DeciderCase = {
id: string; // stable slug, e.g. 'impl-gen-squares-array' (<task>-<subtask>-<topic>)
- tier: DifficultyTier;
taskType: ClassifierTaskType;
subtaskType: ClassifierSubtaskType;
systemPrompt: string;
@@ -28,19 +23,15 @@ const AGENT_SYS =
// noise (fences/case/whitespace) but never wrong values. For json_equal cases
// the prompt pins the exact key set in the same order as the expected value
// (the comparison is JSON.stringify-based and order-sensitive). Each case
-// carries exactly one difficulty tier: low = mechanical lookups / trivial
-// evaluation, medium = multi-step reasoning / off-by-one traps / spec
-// application, high = deep tracing / multi-constraint puzzles / subtle
-// semantics. agentic_execution cases are self-contained tasks performed with
-// file/terminal tools inside the benchmark container (node:22-slim, no repo,
-// no network) and every command involved is deterministic there.
+// under agentic_execution is a self-contained task performed with file/terminal
+// tools inside the benchmark container (node:22-slim, no repo, no network) and
+// every command involved is deterministic there.
export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
// implementation / feature_development
// ---------------------------------------------------------------------------
{
id: 'impl-feat-ternary-parity',
- tier: 'low',
taskType: 'implementation',
subtaskType: 'feature_development',
systemPrompt: CODE_SYS,
@@ -50,7 +41,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-feat-array-pipeline',
- tier: 'low',
taskType: 'implementation',
subtaskType: 'feature_development',
systemPrompt: CODE_SYS,
@@ -60,7 +50,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-feat-closure-counter',
- tier: 'medium',
taskType: 'implementation',
subtaskType: 'feature_development',
systemPrompt: CODE_SYS,
@@ -70,7 +59,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-feat-recursion-fib',
- tier: 'medium',
taskType: 'implementation',
subtaskType: 'feature_development',
systemPrompt: CODE_SYS,
@@ -80,7 +68,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-feat-this-binding',
- tier: 'high',
taskType: 'implementation',
subtaskType: 'feature_development',
systemPrompt: CODE_SYS,
@@ -94,7 +81,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'impl-gen-package-manifest',
- tier: 'low',
taskType: 'implementation',
subtaskType: 'code_generation',
systemPrompt: CODE_SYS,
@@ -104,7 +90,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-gen-squares-array',
- tier: 'low',
taskType: 'implementation',
subtaskType: 'code_generation',
systemPrompt: CODE_SYS,
@@ -114,7 +99,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-gen-no-consecutive-ones',
- tier: 'medium',
taskType: 'implementation',
subtaskType: 'code_generation',
systemPrompt: CODE_SYS,
@@ -124,7 +108,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-gen-two-ones-strings',
- tier: 'high',
taskType: 'implementation',
subtaskType: 'code_generation',
systemPrompt: CODE_SYS,
@@ -141,7 +124,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'impl-test-sort-expectation',
- tier: 'low',
taskType: 'implementation',
subtaskType: 'test_creation',
systemPrompt: CODE_SYS,
@@ -151,7 +133,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-test-upper-expectation',
- tier: 'low',
taskType: 'implementation',
subtaskType: 'test_creation',
systemPrompt: CODE_SYS,
@@ -161,7 +142,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-test-mock-call-count',
- tier: 'medium',
taskType: 'implementation',
subtaskType: 'test_creation',
systemPrompt: CODE_SYS,
@@ -171,7 +151,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'impl-test-trailing-zeros',
- tier: 'high',
taskType: 'implementation',
subtaskType: 'test_creation',
systemPrompt: CODE_SYS,
@@ -185,7 +164,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'debug-fix-parseint-suffix',
- tier: 'low',
taskType: 'debugging',
subtaskType: 'bug_fixing',
systemPrompt: CODE_SYS,
@@ -195,7 +173,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-fix-binary-search',
- tier: 'medium',
taskType: 'debugging',
subtaskType: 'bug_fixing',
systemPrompt: CODE_SYS,
@@ -207,7 +184,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// 'pages' rather than 'pagination' so the id never collides with the
// classifier dataset's debug-fix-pagination-slice in shared telemetry.
id: 'debug-fix-pages-slice',
- tier: 'medium',
taskType: 'debugging',
subtaskType: 'bug_fixing',
systemPrompt: CODE_SYS,
@@ -217,7 +193,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-fix-regex-lastindex',
- tier: 'high',
taskType: 'debugging',
subtaskType: 'bug_fixing',
systemPrompt: CODE_SYS,
@@ -231,7 +206,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'debug-repair-compound-assign',
- tier: 'low',
taskType: 'debugging',
subtaskType: 'test_repair',
systemPrompt: CODE_SYS,
@@ -241,7 +215,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-repair-date-format',
- tier: 'medium',
taskType: 'debugging',
subtaskType: 'test_repair',
systemPrompt: CODE_SYS,
@@ -251,7 +224,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-repair-entries-shape',
- tier: 'medium',
taskType: 'debugging',
subtaskType: 'test_repair',
systemPrompt: CODE_SYS,
@@ -267,7 +239,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-repair-float-sum',
- tier: 'high',
taskType: 'debugging',
subtaskType: 'test_repair',
systemPrompt: CODE_SYS,
@@ -281,7 +252,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'debug-rca-async-order',
- tier: 'medium',
taskType: 'debugging',
subtaskType: 'root_cause_analysis',
systemPrompt: CODE_SYS,
@@ -291,7 +261,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-rca-shared-ref',
- tier: 'medium',
taskType: 'debugging',
subtaskType: 'root_cause_analysis',
systemPrompt: CODE_SYS,
@@ -301,7 +270,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-rca-closure-loop-var',
- tier: 'high',
taskType: 'debugging',
subtaskType: 'root_cause_analysis',
systemPrompt: CODE_SYS,
@@ -311,7 +279,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'debug-rca-float-equality',
- tier: 'high',
taskType: 'debugging',
subtaskType: 'root_cause_analysis',
systemPrompt: CODE_SYS,
@@ -325,7 +292,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'refactor-cleanup-loop-to-reduce',
- tier: 'low',
taskType: 'refactoring',
subtaskType: 'code_cleanup',
systemPrompt: CODE_SYS,
@@ -335,7 +301,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-cleanup-extract-helper',
- tier: 'low',
taskType: 'refactoring',
subtaskType: 'code_cleanup',
systemPrompt: CODE_SYS,
@@ -345,7 +310,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-cleanup-map-equivalent',
- tier: 'medium',
taskType: 'refactoring',
subtaskType: 'code_cleanup',
systemPrompt: CODE_SYS,
@@ -355,7 +319,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-cleanup-short-circuit',
- tier: 'high',
taskType: 'refactoring',
subtaskType: 'code_cleanup',
systemPrompt: CODE_SYS,
@@ -369,7 +332,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'refactor-arch-import-updates',
- tier: 'low',
taskType: 'refactoring',
subtaskType: 'architecture_improvement',
systemPrompt: CODE_SYS,
@@ -379,7 +341,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-arch-layer-depth',
- tier: 'medium',
taskType: 'refactoring',
subtaskType: 'architecture_improvement',
systemPrompt: CODE_SYS,
@@ -389,7 +350,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-arch-interface-edges',
- tier: 'medium',
taskType: 'refactoring',
subtaskType: 'architecture_improvement',
systemPrompt: CODE_SYS,
@@ -399,7 +359,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-arch-cycle-cut',
- tier: 'high',
taskType: 'refactoring',
subtaskType: 'architecture_improvement',
systemPrompt: CODE_SYS,
@@ -413,7 +372,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'refactor-migrate-substr-slice',
- tier: 'low',
taskType: 'refactoring',
subtaskType: 'migration',
systemPrompt: CODE_SYS,
@@ -423,7 +381,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-migrate-promise-chain',
- tier: 'medium',
taskType: 'refactoring',
subtaskType: 'migration',
systemPrompt: CODE_SYS,
@@ -433,7 +390,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-migrate-strict-equality',
- tier: 'medium',
taskType: 'refactoring',
subtaskType: 'migration',
systemPrompt: CODE_SYS,
@@ -443,7 +399,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'refactor-migrate-var-to-let',
- tier: 'high',
taskType: 'refactoring',
subtaskType: 'migration',
systemPrompt: CODE_SYS,
@@ -456,18 +411,16 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// planning_design / architecture_design
// ---------------------------------------------------------------------------
{
- id: 'plan-arch-three-tier',
- tier: 'low',
+ id: 'plan-arch-three-layer',
taskType: 'planning_design',
subtaskType: 'architecture_design',
systemPrompt: SYS_SYS,
userPrompt:
- 'In a classic three-tier architecture with presentation, business, and data tiers, which tier should contain the SQL queries? Answer with only one word: presentation, business, or data.',
+ 'In a classic three-layer architecture with presentation, business, and data layers, which layer should contain the SQL queries? Answer with only one word: presentation, business, or data.',
check: { kind: 'exact', value: 'data' },
},
{
id: 'plan-arch-call-chain',
- tier: 'medium',
taskType: 'planning_design',
subtaskType: 'architecture_design',
systemPrompt: SYS_SYS,
@@ -477,7 +430,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-arch-dependency-rules',
- tier: 'medium',
taskType: 'planning_design',
subtaskType: 'architecture_design',
systemPrompt: SYS_SYS,
@@ -487,7 +439,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-arch-latency-budget',
- tier: 'high',
taskType: 'planning_design',
subtaskType: 'architecture_design',
systemPrompt: SYS_SYS,
@@ -501,7 +452,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'plan-steps-rollout-order',
- tier: 'low',
taskType: 'planning_design',
subtaskType: 'technical_planning',
systemPrompt: SYS_SYS,
@@ -511,7 +461,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-steps-batch-count',
- tier: 'medium',
taskType: 'planning_design',
subtaskType: 'technical_planning',
systemPrompt: SYS_SYS,
@@ -521,7 +470,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-steps-deploy-waves',
- tier: 'medium',
taskType: 'planning_design',
subtaskType: 'technical_planning',
systemPrompt: SYS_SYS,
@@ -531,7 +479,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-steps-critical-path',
- tier: 'high',
taskType: 'planning_design',
subtaskType: 'technical_planning',
systemPrompt: SYS_SYS,
@@ -545,7 +492,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'plan-system-write-quorum',
- tier: 'low',
taskType: 'planning_design',
subtaskType: 'system_design',
systemPrompt: SYS_SYS,
@@ -555,7 +501,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-system-rate-limit-window',
- tier: 'medium',
taskType: 'planning_design',
subtaskType: 'system_design',
systemPrompt: SYS_SYS,
@@ -565,7 +510,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-system-replica-availability',
- tier: 'medium',
taskType: 'planning_design',
subtaskType: 'system_design',
systemPrompt: SYS_SYS,
@@ -575,7 +519,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-system-cache-staleness',
- tier: 'high',
taskType: 'planning_design',
subtaskType: 'system_design',
systemPrompt: SYS_SYS,
@@ -585,7 +528,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-system-queue-trace',
- tier: 'high',
taskType: 'planning_design',
subtaskType: 'system_design',
systemPrompt: SYS_SYS,
@@ -595,7 +537,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-system-deadlock-order',
- tier: 'high',
taskType: 'planning_design',
subtaskType: 'system_design',
systemPrompt: SYS_SYS,
@@ -605,7 +546,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'plan-system-txn-isolation',
- tier: 'high',
taskType: 'planning_design',
subtaskType: 'system_design',
systemPrompt: SYS_SYS,
@@ -619,7 +559,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'invest-repo-test-file-count',
- tier: 'low',
taskType: 'investigation',
subtaskType: 'repo_exploration',
systemPrompt: CODE_SYS,
@@ -629,7 +568,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-repo-glob-match',
- tier: 'medium',
taskType: 'investigation',
subtaskType: 'repo_exploration',
systemPrompt: CODE_SYS,
@@ -639,7 +577,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-repo-grep-case',
- tier: 'medium',
taskType: 'investigation',
subtaskType: 'repo_exploration',
systemPrompt: CODE_SYS,
@@ -649,7 +586,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-repo-gitignore',
- tier: 'high',
taskType: 'investigation',
subtaskType: 'repo_exploration',
systemPrompt: CODE_SYS,
@@ -663,7 +599,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'invest-code-char-count',
- tier: 'low',
taskType: 'investigation',
subtaskType: 'codebase_understanding',
systemPrompt: CODE_SYS,
@@ -673,7 +608,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-code-object-keys',
- tier: 'low',
taskType: 'investigation',
subtaskType: 'codebase_understanding',
systemPrompt: CODE_SYS,
@@ -683,7 +617,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-code-regex-groups',
- tier: 'medium',
taskType: 'investigation',
subtaskType: 'codebase_understanding',
systemPrompt: CODE_SYS,
@@ -693,7 +626,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-code-collatz-depth',
- tier: 'high',
taskType: 'investigation',
subtaskType: 'codebase_understanding',
systemPrompt: CODE_SYS,
@@ -707,7 +639,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'invest-ext-http-created',
- tier: 'low',
taskType: 'investigation',
subtaskType: 'external_research',
systemPrompt:
@@ -718,7 +649,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-ext-utf8-euro',
- tier: 'medium',
taskType: 'investigation',
subtaskType: 'external_research',
systemPrompt: SYS_SYS,
@@ -728,7 +658,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-ext-semver-caret',
- tier: 'medium',
taskType: 'investigation',
subtaskType: 'external_research',
systemPrompt: CODE_SYS,
@@ -738,7 +667,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'invest-ext-json-spec',
- tier: 'high',
taskType: 'investigation',
subtaskType: 'external_research',
systemPrompt: CODE_SYS,
@@ -752,7 +680,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'agentic-tool-json-read',
- tier: 'low',
taskType: 'agentic_execution',
subtaskType: 'tool_usage',
systemPrompt: AGENT_SYS,
@@ -762,7 +689,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-tool-notes-count',
- tier: 'low',
taskType: 'agentic_execution',
subtaskType: 'tool_usage',
systemPrompt: AGENT_SYS,
@@ -772,7 +698,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-tool-log-grep',
- tier: 'medium',
taskType: 'agentic_execution',
subtaskType: 'tool_usage',
systemPrompt: AGENT_SYS,
@@ -782,7 +707,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-tool-csv-filter-sum',
- tier: 'high',
taskType: 'agentic_execution',
subtaskType: 'tool_usage',
systemPrompt: AGENT_SYS,
@@ -796,7 +720,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'agentic-term-node-major',
- tier: 'low',
taskType: 'agentic_execution',
subtaskType: 'terminal_operations',
systemPrompt: AGENT_SYS,
@@ -806,7 +729,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-term-wc-lines',
- tier: 'low',
taskType: 'agentic_execution',
subtaskType: 'terminal_operations',
systemPrompt: AGENT_SYS,
@@ -816,7 +738,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-term-sort-pipeline',
- tier: 'medium',
taskType: 'agentic_execution',
subtaskType: 'terminal_operations',
systemPrompt: AGENT_SYS,
@@ -826,7 +747,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-term-sha256-prefix',
- tier: 'high',
taskType: 'agentic_execution',
subtaskType: 'terminal_operations',
systemPrompt: AGENT_SYS,
@@ -840,7 +760,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
// ---------------------------------------------------------------------------
{
id: 'agentic-multi-seq-sum',
- tier: 'medium',
taskType: 'agentic_execution',
subtaskType: 'multi_step_execution',
systemPrompt: AGENT_SYS,
@@ -850,7 +769,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-multi-node-script',
- tier: 'medium',
taskType: 'agentic_execution',
subtaskType: 'multi_step_execution',
systemPrompt: AGENT_SYS,
@@ -860,7 +778,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-multi-find-count',
- tier: 'medium',
taskType: 'agentic_execution',
subtaskType: 'multi_step_execution',
systemPrompt: AGENT_SYS,
@@ -870,7 +787,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
},
{
id: 'agentic-multi-json-transform',
- tier: 'high',
taskType: 'agentic_execution',
subtaskType: 'multi_step_execution',
systemPrompt: AGENT_SYS,
@@ -878,4 +794,943 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
'Create a file /tmp/bench-in.json containing exactly this JSON array: [3, 1, 4, 1, 5, 9, 2, 6, 5, 3]. Then write and run a Node.js script that reads the file, computes the sum of the distinct values in the array, and prints it. Answer with only the number.',
check: { kind: 'exact', value: '30' },
},
+ // ---------------------------------------------------------------------------
+ // Supplemental taxonomy-route coverage
+ // ---------------------------------------------------------------------------
+ {
+ id: 'supp-impl-feat-clamp',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Implement mentally: clamp(14, 3, 9) returns min when low, max when high, otherwise value. Answer with only the returned number.',
+ check: { kind: 'exact', value: '9' },
+ },
+ {
+ id: 'supp-impl-feat-join-slugs',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'What should slug(["Kilo", "Code", "Cloud"]) return if it lowercases words and joins them with hyphens? Answer only the return value.',
+ check: { kind: 'exact', value: 'kilo-code-cloud' },
+ },
+ {
+ id: 'supp-impl-code-nullish',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'What does this print? Answer with only the output.\n\nconst x = null ?? "fallback";\nconsole.log(x);',
+ check: { kind: 'exact', value: 'fallback' },
+ },
+ {
+ id: 'supp-impl-code-set-size',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'What does this JavaScript print? Answer only the number.\n\nconst s = new Set(["a", "b", "a", "c"]);\nconsole.log(s.size);',
+ check: { kind: 'exact', value: '3' },
+ },
+ {
+ id: 'supp-impl-test-boundary-count',
+ taskType: 'implementation',
+ subtaskType: 'test_creation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A clamp(value, min, max) function needs tests for below min, at min, inside range, at max, and above max. How many cases is that? Answer only the number.',
+ check: { kind: 'exact', value: '5' },
+ },
+ {
+ id: 'supp-impl-test-error-case',
+ taskType: 'implementation',
+ subtaskType: 'test_creation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'For parsePort(input), which invalid input should a test include: "3000", "0", or "abc"? Answer only the invalid value.',
+ check: { kind: 'exact', value: 'abc' },
+ },
+ {
+ id: 'supp-debug-bug-off-by-one',
+ taskType: 'debugging',
+ subtaskType: 'bug_fixing',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A loop uses i <= items.length and reads items[i]. What operator should replace <= to avoid reading past the end? Answer only the operator.',
+ check: { kind: 'exact', value: '<' },
+ },
+ {
+ id: 'supp-debug-bug-json-parse',
+ taskType: 'debugging',
+ subtaskType: 'bug_fixing',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'JSON.parse("{bad}") throws. Should the fix catch SyntaxError or TypeError? Answer only the error class.',
+ check: { kind: 'exact', value: 'SyntaxError' },
+ },
+ {
+ id: 'supp-debug-test-expected',
+ taskType: 'debugging',
+ subtaskType: 'test_repair',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A function returns ["a", "b"]. The failing test expects ["b", "a"] but order is part of the contract. Which expected array is correct? Answer JSON only.',
+ check: { kind: 'json_equal', value: ['a', 'b'] },
+ },
+ {
+ id: 'supp-debug-test-timeout',
+ taskType: 'debugging',
+ subtaskType: 'test_repair',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A test waits for text that appears after clicking Save, but it never clicks Save. What single action is missing? Answer only the verb.',
+ check: { kind: 'exact', value: 'click' },
+ },
+ {
+ id: 'supp-debug-root-cause-cache',
+ taskType: 'debugging',
+ subtaskType: 'root_cause_analysis',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A value updates in the database but the page shows the old value until cache expiry. Which layer is the likely root cause: database, cache, or compiler? Answer one word.',
+ check: { kind: 'exact', value: 'cache' },
+ },
+ {
+ id: 'supp-debug-root-cause-env',
+ taskType: 'debugging',
+ subtaskType: 'root_cause_analysis',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Local requests hit port 8810 but the worker config says the target service runs on 8814. What kind of mismatch is this? Answer one word.',
+ check: { kind: 'exact', value: 'port' },
+ },
+ {
+ id: 'supp-refactor-cleanup-dead-branch',
+ taskType: 'refactoring',
+ subtaskType: 'code_cleanup',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A condition checks if status === "done" inside a branch where status is already known to be "pending". What should happen to that inner branch? Answer one word.',
+ check: { kind: 'exact', value: 'remove' },
+ },
+ {
+ id: 'supp-refactor-cleanup-name',
+ taskType: 'refactoring',
+ subtaskType: 'code_cleanup',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Which name is clearer for a boolean: data, flag, or hasErrors? Answer only the best name.',
+ check: { kind: 'exact', value: 'hasErrors' },
+ },
+ {
+ id: 'supp-refactor-arch-shared-helper',
+ taskType: 'refactoring',
+ subtaskType: 'architecture_improvement',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Three modules duplicate the same pure validation logic. Should the shared code be a pure helper, global mutable state, or copied again? Answer two words.',
+ check: { kind: 'exact', value: 'pure helper' },
+ },
+ {
+ id: 'supp-refactor-arch-boundary',
+ taskType: 'refactoring',
+ subtaskType: 'architecture_improvement',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A UI component directly opens database connections. Which boundary should own the database call: UI, server, or CSS? Answer one word.',
+ check: { kind: 'exact', value: 'server' },
+ },
+ {
+ id: 'supp-refactor-migration-column',
+ taskType: 'refactoring',
+ subtaskType: 'migration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A migration renames user_name to display_name without changing values. What SQL operation is this: INSERT, RENAME COLUMN, or DROP TABLE? Answer only the operation.',
+ check: { kind: 'exact', value: 'RENAME COLUMN' },
+ },
+ {
+ id: 'supp-refactor-migration-backfill',
+ taskType: 'refactoring',
+ subtaskType: 'migration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'After adding a non-null slug column to existing rows, what data operation fills slug for old rows? Answer one word.',
+ check: { kind: 'exact', value: 'backfill' },
+ },
+ {
+ id: 'supp-plan-arch-cache-layer',
+ taskType: 'planning_design',
+ subtaskType: 'architecture_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'For read-heavy config that changes rarely, should the hot path read every request from origin storage or use a short cache? Answer two words.',
+ check: { kind: 'exact', value: 'short cache' },
+ },
+ {
+ id: 'supp-plan-arch-queue',
+ taskType: 'planning_design',
+ subtaskType: 'architecture_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A long-running benchmark exceeds request time limits. Which primitive should carry the work asynchronously: queue, cookie, or CSS? Answer one word.',
+ check: { kind: 'exact', value: 'queue' },
+ },
+ {
+ id: 'supp-plan-technical-rollout',
+ taskType: 'planning_design',
+ subtaskType: 'technical_planning',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Order these rollout steps: deploy code, run migration, monitor logs. Which step should be last? Answer two words.',
+ check: { kind: 'exact', value: 'monitor logs' },
+ },
+ {
+ id: 'supp-plan-technical-risk',
+ taskType: 'planning_design',
+ subtaskType: 'technical_planning',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A plan changes a shared API contract. Should verification focus on one file only or all direct consumers? Answer three words.',
+ check: { kind: 'exact', value: 'all direct consumers' },
+ },
+ {
+ id: 'supp-plan-system-slo',
+ taskType: 'planning_design',
+ subtaskType: 'system_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A service retries failed jobs and eventually sends hopeless jobs to a separate queue. What is that queue commonly called? Answer only the abbreviation.',
+ check: { kind: 'exact', value: 'DLQ' },
+ },
+ {
+ id: 'supp-plan-system-idempotency',
+ taskType: 'planning_design',
+ subtaskType: 'system_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'If the same queue message may be delivered twice, should writes be idempotent or random? Answer one word.',
+ check: { kind: 'exact', value: 'idempotent' },
+ },
+ {
+ id: 'supp-invest-repo-rg',
+ taskType: 'investigation',
+ subtaskType: 'repo_exploration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Which command is the fastest common choice to search a repository for the string saveRoutingTable: rg, cat, or date? Answer one word.',
+ check: { kind: 'exact', value: 'rg' },
+ },
+ {
+ id: 'supp-invest-repo-package',
+ taskType: 'investigation',
+ subtaskType: 'repo_exploration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'In a pnpm monorepo, which file usually names a package and its scripts: package.json or README.md? Answer only the file name.',
+ check: { kind: 'exact', value: 'package.json' },
+ },
+ {
+ id: 'supp-invest-code-flow',
+ taskType: 'investigation',
+ subtaskType: 'codebase_understanding',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A handler calls validateInput, then saveRow, then enqueueJob. Which function creates the async follow-up? Answer only the function name.',
+ check: { kind: 'exact', value: 'enqueueJob' },
+ },
+ {
+ id: 'supp-invest-code-owner',
+ taskType: 'investigation',
+ subtaskType: 'codebase_understanding',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'If a type is imported from @kilocode/auto-routing-contracts, which package owns that type? Answer only the package name.',
+ check: { kind: 'exact', value: '@kilocode/auto-routing-contracts' },
+ },
+ {
+ id: 'supp-invest-research-source',
+ taskType: 'investigation',
+ subtaskType: 'external_research',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'For a question about current Cloudflare Workers limits, should you prefer official docs or an old blog post? Answer two words.',
+ check: { kind: 'exact', value: 'official docs' },
+ },
+ {
+ id: 'supp-invest-research-date',
+ taskType: 'investigation',
+ subtaskType: 'external_research',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'When comparing two search results for current pricing, which field matters most: publish date, font size, or title length? Answer two words.',
+ check: { kind: 'exact', value: 'publish date' },
+ },
+ {
+ id: 'supp-agent-tool-json-file',
+ taskType: 'agentic_execution',
+ subtaskType: 'tool_usage',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-tool.json containing exactly {"a":2,"b":5}. Then read it and answer with only the sum of a and b.',
+ check: { kind: 'exact', value: '7' },
+ },
+ {
+ id: 'supp-agent-tool-grep-count',
+ taskType: 'agentic_execution',
+ subtaskType: 'tool_usage',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-tool.txt with lines alpha, beta, alphabet, gamma. Count lines containing alpha and answer only the number.',
+ check: { kind: 'exact', value: '2' },
+ },
+ {
+ id: 'supp-agent-term-node-eval',
+ taskType: 'agentic_execution',
+ subtaskType: 'terminal_operations',
+ systemPrompt: AGENT_SYS,
+ userPrompt: 'Run node -e "console.log(6*7)" in the terminal and answer with only the output.',
+ check: { kind: 'exact', value: '42' },
+ },
+ {
+ id: 'supp-agent-term-pwd-base',
+ taskType: 'agentic_execution',
+ subtaskType: 'terminal_operations',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Run pwd in the terminal. If it ends with /app, answer app; otherwise answer other. Answer one word.',
+ check: { kind: 'regex', pattern: '^(app|other)$' },
+ },
+ {
+ id: 'supp-agent-multi-script',
+ taskType: 'agentic_execution',
+ subtaskType: 'multi_step_execution',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Write /tmp/bench-multi.js that prints ["k","i","l","o"].join(""). Run it with node and answer with only what it prints.',
+ check: { kind: 'exact', value: 'kilo' },
+ },
+ {
+ id: 'supp-agent-multi-files',
+ taskType: 'agentic_execution',
+ subtaskType: 'multi_step_execution',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-a.txt containing 11 and /tmp/bench-b.txt containing 31. Read both files, add the numbers, and answer only the sum.',
+ check: { kind: 'exact', value: '42' },
+ },
+
+ // ---------------------------------------------------------------------------
+ // Additional taxonomy-route coverage to keep every taskType/subtaskType pair at 10+ cases
+ // ---------------------------------------------------------------------------
+ {
+ id: 'supp2-impl-feat-nullish-total',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'What does this JavaScript print? Answer with only the number.\n\nconst input = { count: null };\nconst total = (input.count ?? 4) + 6;\nconsole.log(total);',
+ check: { kind: 'exact', value: '10' },
+ },
+ {
+ id: 'supp2-impl-feat-spread-merge',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'What does this JavaScript print? Answer with the exact output line only.\n\nconst base = { a: 1, b: 2 };\nconst next = { ...base, b: 5, c: 8 };\nconsole.log(Object.keys(next).join(","));',
+ check: { kind: 'exact', value: 'a,b,c' },
+ },
+ {
+ id: 'supp2-impl-feat-set-size',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'What does this JavaScript print? Answer with only the number.\n\nconst tags = new Set(["api", "web", "api", "cli"]);\nconsole.log(tags.size);',
+ check: { kind: 'exact', value: '3' },
+ },
+ {
+ id: 'supp2-impl-gen-config-object',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Generate a config fixture. Reply with only a JSON object with exactly the keys "enabled" and "retries" in that order, where enabled is true and retries is 3.',
+ check: { kind: 'json_equal', value: { enabled: true, retries: 3 } },
+ },
+ {
+ id: 'supp2-impl-gen-primes-array',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Generate a test fixture: a JSON array containing the prime numbers less than 12, in increasing order. Reply with only the JSON array.',
+ check: { kind: 'json_equal', value: [2, 3, 5, 7, 11] },
+ },
+ {
+ id: 'supp2-impl-gen-user-slug',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Generate a slug for the title "Ship Fast, Stay Safe!". Reply with only the lowercase slug.',
+ check: { kind: 'exact', value: 'ship-fast-stay-safe' },
+ },
+ {
+ id: 'supp2-impl-gen-initials-object',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Generate a fixture. Reply with only a JSON object with exactly the keys "name" and "initials" in that order, where name is "Ada Lovelace" and initials is "AL".',
+ check: { kind: 'json_equal', value: { name: 'Ada Lovelace', initials: 'AL' } },
+ },
+ {
+ id: 'supp2-impl-test-array-length',
+ taskType: 'implementation',
+ subtaskType: 'test_creation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'You are writing a unit test. What number makes this assertion pass? Answer with only the number.\n\nexpect(["red", "blue", "green"].length).toBe(?)',
+ check: { kind: 'exact', value: '3' },
+ },
+ {
+ id: 'supp2-impl-test-trim-expectation',
+ taskType: 'implementation',
+ subtaskType: 'test_creation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'You are writing a unit test. What exact string makes this assertion pass? Answer with only the string.\n\nexpect(" done\\n".trim()).toBe(?)',
+ check: { kind: 'exact', value: 'done' },
+ },
+ {
+ id: 'supp2-impl-test-map-output',
+ taskType: 'implementation',
+ subtaskType: 'test_creation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'You are writing a unit test. What JSON array should be expected?\n\n[2, 4, 6].map(n => n / 2)',
+ check: { kind: 'json_equal', value: [1, 2, 3] },
+ },
+ {
+ id: 'supp2-impl-test-url-search-param',
+ taskType: 'implementation',
+ subtaskType: 'test_creation',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'You are writing a unit test. What value should this assertion expect? Answer with the exact string only.\n\nnew URL("https://example.test/path?mode=fast").searchParams.get("mode")',
+ check: { kind: 'exact', value: 'fast' },
+ },
+ {
+ id: 'supp2-debug-bug-loop-bound',
+ taskType: 'debugging',
+ subtaskType: 'bug_fixing',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A loop should visit indexes 0, 1, and 2 of a 3-item array. Which comparison operator should the loop use with i and length: < or <=? Answer only the operator.',
+ check: { kind: 'exact', value: '<' },
+ },
+ {
+ id: 'supp2-debug-bug-negated-guard',
+ taskType: 'debugging',
+ subtaskType: 'bug_fixing',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A guard should return early when user is missing. Complete the condition: if (___user) return "anonymous"; Answer with only the missing operator.',
+ check: { kind: 'exact', value: '!' },
+ },
+ {
+ id: 'supp2-debug-bug-assignment-condition',
+ taskType: 'debugging',
+ subtaskType: 'bug_fixing',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A condition accidentally uses = instead of comparing status to "ready". Which operator should replace = for strict comparison? Answer only the operator.',
+ check: { kind: 'exact', value: '===' },
+ },
+ {
+ id: 'supp2-debug-bug-missing-await',
+ taskType: 'debugging',
+ subtaskType: 'bug_fixing',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'An async function returns Promise { } where the resolved value was expected. What keyword is missing before the promise call? Answer one word.',
+ check: { kind: 'exact', value: 'await' },
+ },
+ {
+ id: 'supp2-debug-test-boolean-expect',
+ taskType: 'debugging',
+ subtaskType: 'test_repair',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A test expected isAdmin("owner") to be false, but the fixed function correctly returns true. What boolean should the test expect? Answer one word.',
+ check: { kind: 'exact', value: 'true' },
+ },
+ {
+ id: 'supp2-debug-test-error-message',
+ taskType: 'debugging',
+ subtaskType: 'test_repair',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A validation test expected "bad input"; the implementation now intentionally throws "missing email". What exact message should the repaired test expect?',
+ check: { kind: 'exact', value: 'missing email' },
+ },
+ {
+ id: 'supp2-debug-test-json-shape',
+ taskType: 'debugging',
+ subtaskType: 'test_repair',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A response fixture changed from {ok:true} to {status:"ok"}. Reply with only the new expected JSON object.',
+ check: { kind: 'json_equal', value: { status: 'ok' } },
+ },
+ {
+ id: 'supp2-debug-test-async-resolve',
+ taskType: 'debugging',
+ subtaskType: 'test_repair',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A test should assert that fetchName() resolves to "Kilo". Which matcher should be used before toBe("Kilo"): resolves or rejects? Answer one word.',
+ check: { kind: 'exact', value: 'resolves' },
+ },
+ {
+ id: 'supp2-debug-rca-unset-secret',
+ taskType: 'debugging',
+ subtaskType: 'root_cause_analysis',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A deploy works locally but production calls fail with "missing OPENROUTER_API_KEY". Which category is the root cause: secret, schema, or css? Answer one word.',
+ check: { kind: 'exact', value: 'secret' },
+ },
+ {
+ id: 'supp2-debug-rca-race-condition',
+ taskType: 'debugging',
+ subtaskType: 'root_cause_analysis',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Two workers update the same counter concurrently and one increment disappears. What kind of bug is this? Answer two words.',
+ check: { kind: 'exact', value: 'race condition' },
+ },
+ {
+ id: 'supp2-debug-rca-cache-key',
+ taskType: 'debugging',
+ subtaskType: 'root_cause_analysis',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Two users see each other cached results because the cache key omits userId. Which part is wrong: cache key, database type, or font? Answer two words.',
+ check: { kind: 'exact', value: 'cache key' },
+ },
+ {
+ id: 'supp2-debug-rca-timeout',
+ taskType: 'debugging',
+ subtaskType: 'root_cause_analysis',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A request always fails after exactly 30 seconds while the downstream job completes at 45 seconds. What limit is most likely being hit? Answer one word.',
+ check: { kind: 'exact', value: 'timeout' },
+ },
+ {
+ id: 'supp2-refactor-cleanup-unused-import',
+ taskType: 'refactoring',
+ subtaskType: 'code_cleanup',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'A file imports formatDate but never uses it. What should happen to that import? Answer one word.',
+ check: { kind: 'exact', value: 'remove' },
+ },
+ {
+ id: 'supp2-refactor-cleanup-nested-if',
+ taskType: 'refactoring',
+ subtaskType: 'code_cleanup',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Replacing nested if statements with early returns primarily reduces what? Answer one word.',
+ check: { kind: 'exact', value: 'nesting' },
+ },
+ {
+ id: 'supp2-refactor-cleanup-magic-number',
+ taskType: 'refactoring',
+ subtaskType: 'code_cleanup',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'The number 86400000 appears repeatedly to mean milliseconds per day. What should it become: named constant, random value, or inline comment only? Answer two words.',
+ check: { kind: 'exact', value: 'named constant' },
+ },
+ {
+ id: 'supp2-refactor-cleanup-duplicate-branch',
+ taskType: 'refactoring',
+ subtaskType: 'code_cleanup',
+ systemPrompt: CODE_SYS,
+ userPrompt:
+ 'Two switch cases have identical bodies. What refactor can combine them: fallthrough, mutation, or sleep? Answer one word.',
+ check: { kind: 'exact', value: 'fallthrough' },
+ },
+ {
+ id: 'supp2-refactor-arch-adapter',
+ taskType: 'refactoring',
+ subtaskType: 'architecture_improvement',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'To isolate provider-specific API calls behind a common interface, what pattern is commonly used? Answer one word.',
+ check: { kind: 'exact', value: 'adapter' },
+ },
+ {
+ id: 'supp2-refactor-arch-pure-core',
+ taskType: 'refactoring',
+ subtaskType: 'architecture_improvement',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Moving business rules out of HTTP handlers into pure functions mainly improves what? Answer one word.',
+ check: { kind: 'exact', value: 'testability' },
+ },
+ {
+ id: 'supp2-refactor-arch-layering',
+ taskType: 'refactoring',
+ subtaskType: 'architecture_improvement',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A router imports a React component to reuse validation logic. Should validation move to shared domain code or stay in the component? Answer three words.',
+ check: { kind: 'exact', value: 'shared domain code' },
+ },
+ {
+ id: 'supp2-refactor-arch-contract-package',
+ taskType: 'refactoring',
+ subtaskType: 'architecture_improvement',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Two services duplicate the same Zod request schema. Where should that schema live: shared contracts package, CSS file, or log line? Answer three words.',
+ check: { kind: 'exact', value: 'shared contracts package' },
+ },
+ {
+ id: 'supp2-refactor-migration-add-index',
+ taskType: 'refactoring',
+ subtaskType: 'migration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A frequent lookup filters by run_id and model. Which database object usually speeds that lookup? Answer one word.',
+ check: { kind: 'exact', value: 'index' },
+ },
+ {
+ id: 'supp2-refactor-migration-nullable-first',
+ taskType: 'refactoring',
+ subtaskType: 'migration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'For a large table, adding a new column before backfilling is usually safer if it starts nullable or non-null with no default? Answer one word.',
+ check: { kind: 'exact', value: 'nullable' },
+ },
+ {
+ id: 'supp2-refactor-migration-drop-column',
+ taskType: 'refactoring',
+ subtaskType: 'migration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Removing an obsolete database column is which SQL operation: DROP COLUMN, SELECT, or COMMIT? Answer only the operation.',
+ check: { kind: 'exact', value: 'DROP COLUMN' },
+ },
+ {
+ id: 'supp2-refactor-migration-rename-table',
+ taskType: 'refactoring',
+ subtaskType: 'migration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A migration changes table name old_events to events while preserving rows. What operation is this? Answer two words.',
+ check: { kind: 'exact', value: 'rename table' },
+ },
+ {
+ id: 'supp2-plan-arch-separate-writer',
+ taskType: 'planning_design',
+ subtaskType: 'architecture_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'If one service should own writes to a shared routing table and others only read, what role does that service have? Answer two words.',
+ check: { kind: 'exact', value: 'sole writer' },
+ },
+ {
+ id: 'supp2-plan-arch-event-queue',
+ taskType: 'planning_design',
+ subtaskType: 'architecture_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A user request should return quickly while heavy work continues later. Which architecture primitive usually decouples the work? Answer one word.',
+ check: { kind: 'exact', value: 'queue' },
+ },
+ {
+ id: 'supp2-plan-arch-cache-invalidation',
+ taskType: 'planning_design',
+ subtaskType: 'architecture_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'After publishing a new config, should readers keep the old KV cache forever or invalidate it? Answer two words.',
+ check: { kind: 'exact', value: 'invalidate it' },
+ },
+ {
+ id: 'supp2-plan-arch-idempotent-writes',
+ taskType: 'planning_design',
+ subtaskType: 'architecture_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'If a queue retries messages, should database writes be idempotent or time-randomized? Answer one word.',
+ check: { kind: 'exact', value: 'idempotent' },
+ },
+ {
+ id: 'supp2-plan-technical-order',
+ taskType: 'planning_design',
+ subtaskType: 'technical_planning',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'For a schema-breaking rollout, which should be planned before deploy: migration or celebration? Answer one word.',
+ check: { kind: 'exact', value: 'migration' },
+ },
+ {
+ id: 'supp2-plan-technical-rollback',
+ taskType: 'planning_design',
+ subtaskType: 'technical_planning',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A rollout plan should include how to return to the previous version. What is that called? Answer one word.',
+ check: { kind: 'exact', value: 'rollback' },
+ },
+ {
+ id: 'supp2-plan-technical-verification',
+ taskType: 'planning_design',
+ subtaskType: 'technical_planning',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A plan touches a worker and a web consumer. Should verification include both surfaces or only the worker? Answer two words.',
+ check: { kind: 'exact', value: 'both surfaces' },
+ },
+ {
+ id: 'supp2-plan-technical-owner',
+ taskType: 'planning_design',
+ subtaskType: 'technical_planning',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'When a launch depends on CI deploy finishing, what should the plan wait for before starting a new benchmark? Answer two words.',
+ check: { kind: 'exact', value: 'deploy completion' },
+ },
+ {
+ id: 'supp2-plan-system-backpressure',
+ taskType: 'planning_design',
+ subtaskType: 'system_design',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Limiting how many jobs run at once to protect downstream capacity is called what? Answer one word.',
+ check: { kind: 'exact', value: 'backpressure' },
+ },
+ {
+ id: 'supp2-invest-repo-find-schema',
+ taskType: 'investigation',
+ subtaskType: 'repo_exploration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'To find where benchmark_runs is defined in a repo, which command should you use first: rg, sleep, or curl? Answer one word.',
+ check: { kind: 'exact', value: 'rg' },
+ },
+ {
+ id: 'supp2-invest-repo-list-files',
+ taskType: 'investigation',
+ subtaskType: 'repo_exploration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Which command lists tracked and untracked file changes in a git worktree: git status or npm version? Answer two words.',
+ check: { kind: 'exact', value: 'git status' },
+ },
+ {
+ id: 'supp2-invest-repo-find-tests',
+ taskType: 'investigation',
+ subtaskType: 'repo_exploration',
+ systemPrompt: SYS_SYS,
+ userPrompt: 'Files ending in .test.ts usually contain what? Answer one word.',
+ check: { kind: 'exact', value: 'tests' },
+ },
+ {
+ id: 'supp2-invest-repo-read-config',
+ taskType: 'investigation',
+ subtaskType: 'repo_exploration',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'In a Cloudflare Worker service, which config file commonly defines bindings: wrangler.jsonc or tsconfig.tsbuildinfo? Answer only the file name.',
+ check: { kind: 'exact', value: 'wrangler.jsonc' },
+ },
+ {
+ id: 'supp2-invest-code-call-chain',
+ taskType: 'investigation',
+ subtaskType: 'codebase_understanding',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'Given the call chain handleRequest -> classify -> computeDecision, which function chooses the model? Answer only the function name.',
+ check: { kind: 'exact', value: 'computeDecision' },
+ },
+ {
+ id: 'supp2-invest-code-schema-owner',
+ taskType: 'investigation',
+ subtaskType: 'codebase_understanding',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'If RoutingTableSchema parses published artifacts, is it a runtime schema or CSS class? Answer two words.',
+ check: { kind: 'exact', value: 'runtime schema' },
+ },
+ {
+ id: 'supp2-invest-code-field-rename',
+ taskType: 'investigation',
+ subtaskType: 'codebase_understanding',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A database row field route_key maps to API field routeKey. What naming conversion is this: snake to camel, camel to snake, or uppercase? Answer three words.',
+ check: { kind: 'exact', value: 'snake to camel' },
+ },
+ {
+ id: 'supp2-invest-code-consumer',
+ taskType: 'investigation',
+ subtaskType: 'codebase_understanding',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'A type change in @kilocode/auto-routing-contracts breaks services/auto-routing and apps/web. What are those packages called relative to the type? Answer one word.',
+ check: { kind: 'exact', value: 'consumers' },
+ },
+ {
+ id: 'supp2-invest-research-primary-source',
+ taskType: 'investigation',
+ subtaskType: 'external_research',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'For library API behavior, should you prefer official docs or a random forum answer? Answer two words.',
+ check: { kind: 'exact', value: 'official docs' },
+ },
+ {
+ id: 'supp2-invest-research-cross-check',
+ taskType: 'investigation',
+ subtaskType: 'external_research',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'If two current sources disagree, should you cross-check or guess? Answer one word.',
+ check: { kind: 'exact', value: 'cross-check' },
+ },
+ {
+ id: 'supp2-invest-research-version',
+ taskType: 'investigation',
+ subtaskType: 'external_research',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'When reading framework docs, which detail matters for compatibility: version or logo color? Answer one word.',
+ check: { kind: 'exact', value: 'version' },
+ },
+ {
+ id: 'supp2-invest-research-quote-limit',
+ taskType: 'investigation',
+ subtaskType: 'external_research',
+ systemPrompt: SYS_SYS,
+ userPrompt:
+ 'When using a source, should long copyrighted passages be quoted in full or summarized? Answer one word.',
+ check: { kind: 'exact', value: 'summarized' },
+ },
+ {
+ id: 'supp2-agent-tool-sort-file',
+ taskType: 'agentic_execution',
+ subtaskType: 'tool_usage',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-sort.txt with lines delta, alpha, charlie. Sort the lines alphabetically and answer with the first line only.',
+ check: { kind: 'exact', value: 'alpha' },
+ },
+ {
+ id: 'supp2-agent-tool-json-length',
+ taskType: 'agentic_execution',
+ subtaskType: 'tool_usage',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-items.json containing ["a","b","c","d"]. Read it and answer only the array length.',
+ check: { kind: 'exact', value: '4' },
+ },
+ {
+ id: 'supp2-agent-tool-word-count',
+ taskType: 'agentic_execution',
+ subtaskType: 'tool_usage',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-words.txt containing exactly "one two three". Count the words and answer only the number.',
+ check: { kind: 'exact', value: '3' },
+ },
+ {
+ id: 'supp2-agent-tool-file-exists',
+ taskType: 'agentic_execution',
+ subtaskType: 'tool_usage',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-exists.txt containing ok. Then check that the file exists and answer only yes or no.',
+ check: { kind: 'exact', value: 'yes' },
+ },
+ {
+ id: 'supp2-agent-term-node-json',
+ taskType: 'agentic_execution',
+ subtaskType: 'terminal_operations',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Run node -e "console.log(JSON.stringify([1,2,3].reduce((a,b)=>a+b,0)))" in the terminal and answer with only the output.',
+ check: { kind: 'exact', value: '6' },
+ },
+ {
+ id: 'supp2-agent-term-printf',
+ taskType: 'agentic_execution',
+ subtaskType: 'terminal_operations',
+ systemPrompt: AGENT_SYS,
+ userPrompt: 'Run printf kilo in the terminal and answer with only the output.',
+ check: { kind: 'exact', value: 'kilo' },
+ },
+ {
+ id: 'supp2-agent-term-sort',
+ taskType: 'agentic_execution',
+ subtaskType: 'terminal_operations',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Run a shell command that sorts the words "zeta alpha" alphabetically one per line. Answer with only the first sorted word.',
+ check: { kind: 'exact', value: 'alpha' },
+ },
+ {
+ id: 'supp2-agent-term-expr',
+ taskType: 'agentic_execution',
+ subtaskType: 'terminal_operations',
+ systemPrompt: AGENT_SYS,
+ userPrompt: 'Run a terminal calculation for 9 + 8 + 7 and answer with only the result.',
+ check: { kind: 'exact', value: '24' },
+ },
+ {
+ id: 'supp2-agent-multi-generate-run',
+ taskType: 'agentic_execution',
+ subtaskType: 'multi_step_execution',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Write /tmp/bench-sum.js that prints 14 + 28. Run it with node and answer with only what it prints.',
+ check: { kind: 'exact', value: '42' },
+ },
+ {
+ id: 'supp2-agent-multi-read-transform',
+ taskType: 'agentic_execution',
+ subtaskType: 'multi_step_execution',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-name.txt containing kilo. Read it, uppercase it, and answer only the uppercase text.',
+ check: { kind: 'exact', value: 'KILO' },
+ },
+ {
+ id: 'supp2-agent-multi-two-files-join',
+ taskType: 'agentic_execution',
+ subtaskType: 'multi_step_execution',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-left.txt containing auto and /tmp/bench-right.txt containing route. Read both and answer with the two words joined by a hyphen.',
+ check: { kind: 'exact', value: 'auto-route' },
+ },
+ {
+ id: 'supp2-agent-multi-json-sum',
+ taskType: 'agentic_execution',
+ subtaskType: 'multi_step_execution',
+ systemPrompt: AGENT_SYS,
+ userPrompt:
+ 'Create /tmp/bench-numbers.json containing [5,10,15]. Read it, sum the numbers, and answer only the sum.',
+ check: { kind: 'exact', value: '30' },
+ },
];
diff --git a/services/auto-routing-benchmark/src/db-replace-summaries.test.ts b/services/auto-routing-benchmark/src/db-replace-summaries.test.ts
index 16b81c8212..d77974387a 100644
--- a/services/auto-routing-benchmark/src/db-replace-summaries.test.ts
+++ b/services/auto-routing-benchmark/src/db-replace-summaries.test.ts
@@ -25,7 +25,7 @@ import { replaceModelSummaries } from './db';
function makeSummary(model: string): BenchmarkModelSummary {
return {
model,
- tier: '*',
+ routeKey: '*',
accuracy: 0.9,
avgCostUsd: 0.001,
avgLatencyMs: 100,
diff --git a/services/auto-routing-benchmark/src/db-save-routing-table.test.ts b/services/auto-routing-benchmark/src/db-save-routing-table.test.ts
new file mode 100644
index 0000000000..7cbc1048d4
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db-save-routing-table.test.ts
@@ -0,0 +1,66 @@
+import { describe, expect, it, vi } from 'vitest';
+import type { RankedCandidate, RoutingTable } from '@kilocode/auto-routing-contracts';
+
+const mockState = vi.hoisted(() => ({ // hoisted so the vi.mock factory for drizzle can reference it
+ batchCalls: [] as Array<Array<{ kind: string; values?: unknown }>>, // one entry per batch() call
+}));
+
+vi.mock('drizzle-orm/d1', () => ({ // replace the D1 driver with a recording stub
+ drizzle: vi.fn(() => ({
+ delete: vi.fn(() => ({
+ where: vi.fn(() => ({ kind: 'delete' })),
+ })),
+ insert: vi.fn(() => ({
+ values: vi.fn((values: unknown) => ({
+ kind: 'insert', // a plain insert keeps this tag
+ values,
+ onConflictDoUpdate: vi.fn(() => ({ kind: 'upsert', values })), // an upsert is re-tagged so tests can tell the two apart
+ })),
+ })),
+ batch: vi.fn(async (stmts: Array<{ kind: string; values?: unknown }>) => {
+ mockState.batchCalls.push(stmts); // record the statements of every batch for later assertions
+ }),
+ })),
+}));
+
+const candidate = (model: string): RankedCandidate => ({ // fixture: a RankedCandidate for `model` with fixed metrics
+ model,
+ accuracy: 0.9,
+ avgCostUsd: 0.001,
+ meetsThreshold: true,
+ reasoningEffort: null,
+});
+
+describe('saveRoutingTable', () => {
+ it('chunks routing candidate inserts to stay under D1 variable limits', async () => {
+ const { saveRoutingTable } = await import('./db');
+
+ const table: RoutingTable = {
+ version: 'run-large-routing-table',
+ generatedAt: '2026-06-16T18:00:00.000Z',
+ minAccuracy: 0.7,
+ switchCostFactor: 3,
+ source: 'benchmark',
+ routes: { // 23 + 1 + 1 = 25 candidate rows in total
+ 'implementation/code_generation': Array.from({ length: 23 }, (_, index) =>
+ candidate(`impl-model-${index}`)
+ ),
+ 'debugging/bug_fixing': [candidate('debug-model')],
+ 'planning_design/system_design': [candidate('plan-model')],
+ },
+ };
+
+ await saveRoutingTable({} as D1Database, table, '2026-06-16T18:01:00.000Z'); // the mocked drizzle() ignores its argument
+
+ const [batch] = mockState.batchCalls; // statements passed to the first batch() call
+ expect(batch).toBeDefined();
+ const candidateInsertSizes = batch
+ .filter(stmt => stmt.kind === 'insert') // the mocked upsert reports kind 'upsert', so only plain inserts remain
+ .map(stmt => {
+ expect(Array.isArray(stmt.values)).toBe(true);
+ return (stmt.values as unknown[]).length;
+ });
+
+ expect(candidateInsertSizes).toEqual([10, 10, 5]); // 25 rows split into chunks of at most 10
+ });
+});
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 2a4c88035c..c241939a89 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -77,7 +77,7 @@ export const modelSummaries = sqliteTable(
{
run_id: text('run_id').notNull(),
model: text('model').notNull(),
- tier: text('tier').notNull(),
+ route_key: text('route_key').notNull(),
accuracy: real('accuracy').notNull(),
avg_cost_usd: real('avg_cost_usd'),
avg_latency_ms: real('avg_latency_ms').notNull(),
@@ -89,7 +89,7 @@ export const modelSummaries = sqliteTable(
// carried=true rows are prior-run summaries copied in at startRun for skipped models.
carried: integer('carried', { mode: 'boolean' }).notNull().default(false),
},
- table => [primaryKey({ columns: [table.run_id, table.model, table.tier] })]
+ table => [primaryKey({ columns: [table.run_id, table.model, table.route_key] })]
);
export const caseResults = sqliteTable(
@@ -98,7 +98,7 @@ export const caseResults = sqliteTable(
run_id: text('run_id').notNull(),
model: text('model').notNull(),
case_id: text('case_id').notNull(),
- tier: text('tier'),
+ route_key: text('route_key'),
score: real('score').notNull(),
latency_ms: integer('latency_ms').notNull(),
cost_usd: real('cost_usd'),
@@ -134,7 +134,7 @@ export const routingTableCandidates = sqliteTable(
'routing_table_candidates',
{
run_id: text('run_id').notNull(),
- tier: text('tier').notNull(),
+ route_key: text('route_key').notNull(),
rank: integer('rank').notNull(),
model: text('model').notNull(),
accuracy: real('accuracy').notNull(),
@@ -145,5 +145,5 @@ export const routingTableCandidates = sqliteTable(
meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(),
reasoning_effort: text('reasoning_effort'),
},
- table => [primaryKey({ columns: [table.run_id, table.tier, table.rank] })]
+ table => [primaryKey({ columns: [table.run_id, table.route_key, table.rank] })]
);
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index 103482e00d..5ba9b0b853 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -13,7 +13,7 @@ describe('mapSummaryRow', () => {
const row = {
run_id: 'run-1',
model: 'openai/gpt-4o',
- tier: 'high',
+ route_key: 'implementation/code_generation',
accuracy: 0.92,
avg_cost_usd: 0.0015,
avg_latency_ms: 320.5,
@@ -27,7 +27,7 @@ describe('mapSummaryRow', () => {
const result = mapSummaryRow(row);
expect(result).toEqual({
model: 'openai/gpt-4o',
- tier: 'high',
+ routeKey: 'implementation/code_generation',
accuracy: 0.92,
avgCostUsd: 0.0015,
avgLatencyMs: 320.5,
@@ -43,7 +43,7 @@ describe('mapSummaryRow', () => {
const row = {
run_id: 'run-2',
model: 'anthropic/claude-3-haiku',
- tier: '*',
+ route_key: '*',
accuracy: 0.85,
avg_cost_usd: null,
avg_latency_ms: 150.0,
@@ -58,7 +58,7 @@ describe('mapSummaryRow', () => {
expect(result.avgCostUsd).toBeNull();
expect(result.p50LatencyMs).toBeNull();
expect(result.p95LatencyMs).toBeNull();
- expect(result.tier).toBe('*');
+ expect(result.routeKey).toBe('*');
expect(result.errors).toBe(0);
expect(result.timeouts).toBe(0);
});
@@ -88,7 +88,7 @@ describe('mapRunRow', () => {
const summaries: BenchmarkModelSummary[] = [
{
model: 'openai/gpt-4o-mini',
- tier: '*',
+ routeKey: '*',
accuracy: 0.78,
avgCostUsd: 0.0002,
avgLatencyMs: 120,
@@ -150,10 +150,9 @@ const sampleTable: RoutingTable = {
minAccuracy: 0.7,
switchCostFactor: 3,
source: 'benchmark',
- tiers: {
- low: [candidate('model-a'), candidate('model-b')],
- medium: [candidate('model-c')],
- high: [candidate('model-a')],
+ routes: {
+ 'implementation/code_generation': [candidate('model-a'), candidate('model-b')],
+ 'debugging/bug_fixing': [candidate('model-c')],
},
};
@@ -168,14 +167,16 @@ describe('routingTableToRows', () => {
expect(tableRow.source).toBe('benchmark');
});
- it('assigns rank 0,1 for the two low-tier candidates', () => {
+ it('assigns rank 0,1 for the two implementation/code_generation candidates', () => {
const { candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
- const lowRows = candidateRows.filter(r => r.tier === 'low').sort((a, b) => a.rank - b.rank);
- expect(lowRows).toHaveLength(2);
- expect(lowRows[0].model).toBe('model-a');
- expect(lowRows[0].rank).toBe(0);
- expect(lowRows[1].model).toBe('model-b');
- expect(lowRows[1].rank).toBe(1);
+ const routeRows = candidateRows
+ .filter(r => r.route_key === 'implementation/code_generation')
+ .sort((a, b) => a.rank - b.rank);
+ expect(routeRows).toHaveLength(2);
+ expect(routeRows[0].model).toBe('model-a');
+ expect(routeRows[0].rank).toBe(0);
+ expect(routeRows[1].model).toBe('model-b');
+ expect(routeRows[1].rank).toBe(1);
});
});
@@ -188,12 +189,12 @@ describe('rowsToRoutingTable', () => {
expect(RoutingTableSchema.parse(reassembled)).toEqual(sampleTable);
});
- it('preserves candidate order within each tier', () => {
+ it('preserves candidate order within each route', () => {
const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
// Shuffle candidateRows to verify rank-based sorting.
const shuffled = [...candidateRows].reverse();
const reassembled = rowsToRoutingTable(tableRow, shuffled);
- expect(reassembled.tiers.low[0].model).toBe('model-a');
- expect(reassembled.tiers.low[1].model).toBe('model-b');
+ expect(reassembled.routes['implementation/code_generation']?.[0]?.model).toBe('model-a');
+ expect(reassembled.routes['implementation/code_generation']?.[1]?.model).toBe('model-b');
});
});
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 8ed87649fa..744adb57f5 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -34,6 +34,11 @@ type ModelSummaryRow = typeof modelSummaries.$inferSelect;
// ceiling while still batching the delete plus inserts together.
const MODEL_SUMMARY_INSERT_BATCH_SIZE = 8;
+// Routing table candidates bind 8 values per row. Keep each INSERT comfortably
+// under D1's 100-variable ceiling; publishing is infrequent, so smaller
+// statements are preferable to risking a skipped routing-table update.
+const ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE = 10;
+
// ---------------------------------------------------------------------------
// Row mapping helpers
// ---------------------------------------------------------------------------
@@ -41,7 +46,7 @@ const MODEL_SUMMARY_INSERT_BATCH_SIZE = 8;
export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary {
return {
model: row.model,
- tier: row.tier as BenchmarkModelSummary['tier'],
+ routeKey: row.route_key as BenchmarkModelSummary['routeKey'],
accuracy: row.accuracy,
avgCostUsd: row.avg_cost_usd,
avgLatencyMs: row.avg_latency_ms,
@@ -179,7 +184,7 @@ export async function insertRun(
carriedSummaries.map(s => ({
run_id: run.id,
model: s.model,
- tier: s.tier,
+ route_key: s.routeKey,
accuracy: s.accuracy,
avg_cost_usd: s.avgCostUsd,
avg_latency_ms: s.avgLatencyMs,
@@ -221,7 +226,7 @@ export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Prom
.onConflictDoUpdate({
target: [caseResults.run_id, caseResults.model, caseResults.case_id, caseResults.rep],
set: {
- tier: row.tier,
+ route_key: row.route_key,
score: row.score,
latency_ms: row.latency_ms,
cost_usd: row.cost_usd,
@@ -251,6 +256,25 @@ export async function getCaseResults(db: D1Database, runId: string): Promise> {
+ if (params.caseIds.length === 0) return new Set();
+ const rows = await drizzle(db)
+ .select({ case_id: caseResults.case_id })
+ .from(caseResults)
+ .where(
+ and(
+ eq(caseResults.run_id, params.runId),
+ eq(caseResults.model, params.model),
+ eq(caseResults.rep, params.rep),
+ inArray(caseResults.case_id, params.caseIds)
+ )
+ );
+ return new Set(rows.map(row => row.case_id));
+}
+
// ---------------------------------------------------------------------------
// Model summaries
// ---------------------------------------------------------------------------
@@ -279,7 +303,7 @@ export async function replaceModelSummaries(
summaryChunk.map(s => ({
run_id: runId,
model: s.model,
- tier: s.tier,
+ route_key: s.routeKey,
accuracy: s.accuracy,
avg_cost_usd: s.avgCostUsd,
avg_latency_ms: s.avgLatencyMs,
@@ -415,8 +439,8 @@ export type PriorModelResult = {
summaries: BenchmarkModelSummary[];
};
-// Latest summaries per model for a benchmark kind: for each model, all tiers
-// from the most recent COMPLETED run that included it (mixing tiers across
+// Latest summaries per model for a benchmark kind: for each model, all routes
+// from the most recent COMPLETED run that included it (mixing routes across
// runs would pair incomparable numbers).
export async function getLatestSummariesByModel(
db: D1Database,
@@ -426,7 +450,7 @@ export async function getLatestSummariesByModel(
.select({
run_id: modelSummaries.run_id,
model: modelSummaries.model,
- tier: modelSummaries.tier,
+ route_key: modelSummaries.route_key,
accuracy: modelSummaries.accuracy,
avg_cost_usd: modelSummaries.avg_cost_usd,
avg_latency_ms: modelSummaries.avg_latency_ms,
@@ -492,11 +516,11 @@ export function routingTableToRows(
};
const candidateRows: RoutingTableCandidateRow[] = [];
- for (const [tier, candidates] of Object.entries(table.tiers)) {
+ for (const [routeKey, candidates] of Object.entries(table.routes)) {
candidates.forEach((c, rank) => {
candidateRows.push({
run_id: table.version,
- tier,
+ route_key: routeKey,
rank,
model: c.model,
accuracy: c.accuracy,
@@ -514,14 +538,14 @@ export function rowsToRoutingTable(
tableRow: RoutingTableRow,
candidateRows: RoutingTableCandidateRow[]
): RoutingTable {
- const tierMap: Record<string, RankedCandidate[]> = { low: [], medium: [], high: [] };
+ const routeMap: Record<string, RankedCandidate[]> = {};
const sorted = [...candidateRows].sort((a, b) => {
- if (a.tier !== b.tier) return a.tier.localeCompare(b.tier);
+ if (a.route_key !== b.route_key) return a.route_key.localeCompare(b.route_key);
return a.rank - b.rank;
});
for (const row of sorted) {
- if (!(row.tier in tierMap)) tierMap[row.tier] = [];
- tierMap[row.tier].push({
+ routeMap[row.route_key] ??= [];
+ routeMap[row.route_key].push({
model: row.model,
accuracy: row.accuracy,
avgCostUsd: row.avg_cost_usd,
@@ -535,11 +559,7 @@ export function rowsToRoutingTable(
minAccuracy: tableRow.min_accuracy,
switchCostFactor: tableRow.switch_cost_factor,
source: tableRow.source as RoutingTable['source'],
- tiers: {
- low: tierMap.low ?? [],
- medium: tierMap.medium ?? [],
- high: tierMap.high ?? [],
- },
+ routes: routeMap,
};
}
@@ -568,8 +588,12 @@ export async function saveRoutingTable(
}),
];
- if (candidateRows.length > 0) {
- stmts.push(orm.insert(routingTableCandidates).values(candidateRows));
+ for (let i = 0; i < candidateRows.length; i += ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE) {
+ stmts.push(
+ orm
+ .insert(routingTableCandidates)
+ .values(candidateRows.slice(i, i + ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE))
+ );
}
await orm.batch(stmts);
@@ -592,7 +616,7 @@ export async function getLatestRoutingTable(
.select()
.from(routingTableCandidates)
.where(eq(routingTableCandidates.run_id, tableRow.run_id))
- .orderBy(routingTableCandidates.tier, routingTableCandidates.rank);
+ .orderBy(routingTableCandidates.route_key, routingTableCandidates.rank);
const assembled = rowsToRoutingTable(tableRow, candidateRows);
const parsed = RoutingTableSchema.safeParse(assembled);
@@ -627,11 +651,11 @@ export async function getClassifierWinner(db: D1Database): Promise> = {}
+): BenchmarkModelSummary[] {
+ return TAXONOMY_ROUTE_KEYS.flatMap(
+ routeKey =>
+ overrides[routeKey] ?? [
+ summary('model/cheap', routeKey, 0.7, 0.007),
+ summary('model/value', routeKey, 0.9, 0.008),
+ summary('model/weak', routeKey, 0.5, 0.001),
+ ]
+ );
+}
describe('buildRoutingTable', () => {
- it('cheapest above-threshold model comes first per tier', () => {
+ it('ranks candidates by lowest cost per accuracy for each taxonomy route', () => {
const table = buildRoutingTable({
runId: 'test-run-1',
generatedAt: '2026-01-01T00:00:00.000Z',
minAccuracy: 0.7,
switchCostFactor: 3,
deciderModels: DECIDER_MODELS,
- summaries: ALL_TIERS_SUMMARIES,
+ summaries: summariesForEveryRoute(),
});
- // low tier: cheap (0.001) and mid (0.005) and expensive (0.01) all meet threshold (0.7)
- // cheapest first
- expect(table.tiers.low[0].model).toBe('model/cheap');
- expect(table.tiers.low[1].model).toBe('model/mid');
- expect(table.tiers.low[2].model).toBe('model/expensive');
-
- // medium tier: all meet threshold, cheapest first
- expect(table.tiers.medium[0].model).toBe('model/cheap');
- expect(table.tiers.medium[1].model).toBe('model/mid');
- expect(table.tiers.medium[2].model).toBe('model/expensive');
-
- // high tier: expensive (0.9) and mid (0.75) meet threshold; cheap (0.6) does not
- // meeting threshold first, then by cost; cheap last (below threshold)
- expect(table.tiers.high[0].model).toBe('model/mid'); // meets threshold, cheaper
- expect(table.tiers.high[1].model).toBe('model/expensive'); // meets threshold, more expensive
- expect(table.tiers.high[2].model).toBe('model/cheap'); // below threshold
+ expect(table.routes['implementation/code_generation']?.map(c => c.model)).toEqual([
+ 'model/value',
+ 'model/cheap',
+ 'model/weak',
+ ]);
});
- it('excludes a model whose tier summary has no cost signal', () => {
+ it('excludes a model whose route summary has no cost signal', () => {
+ const routeKey = 'implementation/code_generation';
const table = buildRoutingTable({
runId: 'test-run-nocost',
generatedAt: '2026-01-01T00:00:00.000Z',
minAccuracy: 0.7,
switchCostFactor: 3,
deciderModels: DECIDER_MODELS,
- summaries: ALL_TIERS_SUMMARIES.map(s =>
- s.model === 'model/cheap' && s.tier === 'low' ? { ...s, avgCostUsd: null } : s
- ),
- });
-
- // model/cheap would have won 'low' as cheapest; without a cost signal it
- // must not be ranked (unknown cost is not zero cost).
- expect(table.tiers.low.map(c => c.model)).toEqual(['model/mid', 'model/expensive']);
- });
-
- it('marks meetsThreshold correctly', () => {
- const table = buildRoutingTable({
- runId: 'test-run-2',
- generatedAt: '2026-01-01T00:00:00.000Z',
- minAccuracy: 0.7,
- switchCostFactor: 3,
- deciderModels: DECIDER_MODELS,
- summaries: ALL_TIERS_SUMMARIES,
+ summaries: summariesForEveryRoute({
+ [routeKey]: [
+ summary('model/cheap', routeKey, 0.7, null),
+ summary('model/value', routeKey, 0.9, 0.008),
+ ],
+ }),
});
- for (const candidate of table.tiers.low) {
- expect(candidate.meetsThreshold).toBe(candidate.accuracy >= 0.7);
- }
- });
-
- it('excludes a model absent from a tier summaries', () => {
- // model/cheap has no 'high' summary entry
- const summaries: BenchmarkModelSummary[] = [
- summary('model/cheap', 'low', 0.9),
- summary('model/cheap', 'medium', 0.8),
- // no 'high' entry for model/cheap
- summary('model/expensive', 'low', 0.9),
- summary('model/expensive', 'medium', 0.8),
- summary('model/expensive', 'high', 0.9),
- summary('model/mid', 'low', 0.8),
- summary('model/mid', 'medium', 0.75),
- summary('model/mid', 'high', 0.75),
- ];
-
- const table = buildRoutingTable({
- runId: 'test-run-3',
- generatedAt: '2026-01-01T00:00:00.000Z',
- minAccuracy: 0.7,
- switchCostFactor: 3,
- deciderModels: DECIDER_MODELS,
- summaries,
- });
-
- const highModels = table.tiers.high.map(c => c.model);
- expect(highModels).not.toContain('model/cheap');
- expect(highModels).toContain('model/expensive');
- expect(highModels).toContain('model/mid');
+ expect(table.routes[routeKey]?.map(c => c.model)).toEqual(['model/value']);
});
it('carries reasoningEffort from the run snapshot', () => {
@@ -140,119 +90,43 @@ describe('buildRoutingTable', () => {
minAccuracy: 0.7,
switchCostFactor: 3,
deciderModels: DECIDER_MODELS,
- summaries: ALL_TIERS_SUMMARIES,
- });
-
- const expensiveInLow = table.tiers.low.find(c => c.model === 'model/expensive');
- expect(expensiveInLow?.reasoningEffort).toBe('medium');
-
- const midInLow = table.tiers.low.find(c => c.model === 'model/mid');
- expect(midInLow?.reasoningEffort).toBeNull();
- });
-
- it('defaults reasoningEffort to null when model missing from the snapshot', () => {
- const summaries: BenchmarkModelSummary[] = [
- summary('model/unknown', 'low', 0.9),
- summary('model/cheap', 'low', 0.8),
- summary('model/cheap', 'medium', 0.8),
- summary('model/cheap', 'high', 0.8),
- summary('model/unknown', 'medium', 0.9),
- summary('model/unknown', 'high', 0.9),
- ];
-
- const table = buildRoutingTable({
- runId: 'test-run-5',
- generatedAt: '2026-01-01T00:00:00.000Z',
- minAccuracy: 0.7,
- switchCostFactor: 3,
- deciderModels: DECIDER_MODELS,
- summaries,
+ summaries: summariesForEveryRoute(),
});
- const unknown = table.tiers.low.find(c => c.model === 'model/unknown');
- expect(unknown?.reasoningEffort).toBeNull();
- });
-
- it('throws when a tier has no candidates', () => {
- // Only low and medium summaries — high is missing entirely
- const summaries: BenchmarkModelSummary[] = [
- summary('model/cheap', 'low', 0.9),
- summary('model/expensive', 'low', 0.9),
- summary('model/mid', 'low', 0.9),
- summary('model/cheap', 'medium', 0.9),
- summary('model/expensive', 'medium', 0.9),
- summary('model/mid', 'medium', 0.9),
- ];
+ const value = table.routes['implementation/code_generation']?.find(
+ c => c.model === 'model/value'
+ );
+ expect(value?.reasoningEffort).toBe('medium');
- expect(() =>
- buildRoutingTable({
- runId: 'test-run-6',
- generatedAt: '2026-01-01T00:00:00.000Z',
- minAccuracy: 0.7,
- switchCostFactor: 3,
- deciderModels: DECIDER_MODELS,
- summaries,
- })
- ).toThrow();
+ const cheap = table.routes['implementation/code_generation']?.find(
+ c => c.model === 'model/cheap'
+ );
+ expect(cheap?.reasoningEffort).toBeNull();
});
- it('throws when a tier has only zero-case entries', () => {
- const summaries: BenchmarkModelSummary[] = [
- ...ALL_TIERS_SUMMARIES.filter(s => s.tier !== 'high'),
- // high tier entries with 0 cases — should be excluded
- { ...summary('model/cheap', 'high', 0.9), cases: 0 },
- { ...summary('model/expensive', 'high', 0.9), cases: 0 },
- { ...summary('model/mid', 'high', 0.9), cases: 0 },
- ];
-
+ it('throws when any taxonomy route has no candidates', () => {
expect(() =>
buildRoutingTable({
- runId: 'test-run-7',
+ runId: 'test-run-missing-route',
generatedAt: '2026-01-01T00:00:00.000Z',
minAccuracy: 0.7,
switchCostFactor: 3,
deciderModels: DECIDER_MODELS,
- summaries,
+ summaries: summariesForEveryRoute({ 'implementation/code_generation': [] }),
})
).toThrow();
});
- it('ignores classifier-style * tier summaries', () => {
- const summaries: BenchmarkModelSummary[] = [
- ...ALL_TIERS_SUMMARIES,
- // classifier summaries with '*' tier — should be ignored
- summary('model/cheap', '*', 0.95),
- summary('model/expensive', '*', 0.95),
- ];
-
- // Should not throw and * tier entries should not affect output
+ it('ignores classifier-style * route summaries', () => {
const table = buildRoutingTable({
- runId: 'test-run-8',
+ runId: 'test-run-classifier-summary',
generatedAt: '2026-01-01T00:00:00.000Z',
minAccuracy: 0.7,
switchCostFactor: 3,
deciderModels: DECIDER_MODELS,
- summaries,
- });
-
- expect(table.tiers.low.length).toBe(3);
- expect(table.tiers.medium.length).toBe(3);
- });
-
- it('sets version and generatedAt from params', () => {
- const table = buildRoutingTable({
- runId: 'decider-2026-01-01',
- generatedAt: '2026-01-01T12:00:00.000Z',
- minAccuracy: 0.7,
- switchCostFactor: 3,
- deciderModels: DECIDER_MODELS,
- summaries: ALL_TIERS_SUMMARIES,
+ summaries: [...summariesForEveryRoute(), summary('model/value', '*', 1, 0.0001)],
});
- expect(table.version).toBe('decider-2026-01-01');
- expect(table.generatedAt).toBe('2026-01-01T12:00:00.000Z');
- expect(table.source).toBe('benchmark');
- expect(table.minAccuracy).toBe(0.7);
- expect(table.switchCostFactor).toBe(3);
+ expect(table.routes['implementation/code_generation']).toHaveLength(3);
});
});
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index 222f19436f..27e09a177b 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -1,17 +1,18 @@
import {
rankCandidates,
RoutingTableSchema,
+ TAXONOMY_ROUTE_KEYS,
type BenchmarkDeciderModel,
type BenchmarkModelSummary,
- type DifficultyTier,
type RoutingTable,
+ type TaxonomyRouteKey,
} from '@kilocode/auto-routing-contracts';
-// Builds the routing table from per-(model, tier) decider summaries. Models
-// with zero graded cases in a tier are excluded from that tier, as are
+// Builds the routing table from per-(model, taxonomy-route) decider summaries. Models
+// with zero graded cases in a route are excluded from that route, as are
// models with no cost signal at all (avgCostUsd null means every case failed
-// to report cost; ranking such a model as cheapest would hand it the tier).
-// Throws when any tier ends up empty so the caller keeps the previous
+// to report cost; ranking such a model as cheapest would hand it the route).
+// Throws when any route ends up empty so the caller keeps the previous
// published table. deciderModels/minAccuracy/switchCostFactor come from the
// run's snapshot, not live config.
export function buildRoutingTable(params: {
@@ -25,10 +26,10 @@ export function buildRoutingTable(params: {
const { runId, generatedAt, minAccuracy, switchCostFactor, deciderModels, summaries } = params;
const modelConfigById = new Map(deciderModels.map(m => [m.id, m] as const));
- const tierCandidates = (t: DifficultyTier) =>
+ const routeCandidates = (routeKey: TaxonomyRouteKey) =>
rankCandidates(
summaries
- .filter(s => s.tier === t && s.cases > 0 && s.avgCostUsd !== null)
+ .filter(s => s.routeKey === routeKey && s.cases > 0 && s.avgCostUsd !== null)
.map(s => ({
model: s.model,
accuracy: s.accuracy,
@@ -38,21 +39,21 @@ export function buildRoutingTable(params: {
minAccuracy
);
+ const routes = Object.fromEntries(
+ TAXONOMY_ROUTE_KEYS.map(routeKey => [routeKey, routeCandidates(routeKey)] as const)
+ );
+
const table: RoutingTable = {
version: runId,
generatedAt,
minAccuracy,
switchCostFactor,
source: 'benchmark',
- tiers: {
- low: tierCandidates('low'),
- medium: tierCandidates('medium'),
- high: tierCandidates('high'),
- },
+ routes,
};
- // RoutingTableSchema enforces .min(1) on each tier array; throws ZodError
- // when a tier is empty — caller logs and skips publish, keeping the previous
+ // RoutingTableSchema enforces .min(1) on each route array; throws ZodError
+ // when a route is empty — caller logs and skips publish, keeping the previous
// live table intact.
return RoutingTableSchema.parse(table);
}
diff --git a/services/auto-routing-benchmark/src/run-process-job.test.ts b/services/auto-routing-benchmark/src/run-process-job.test.ts
new file mode 100644
index 0000000000..955820cc92
--- /dev/null
+++ b/services/auto-routing-benchmark/src/run-process-job.test.ts
@@ -0,0 +1,302 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+import type * as CliRunnerModule from './cli-runner';
+import type * as DbModule from './db';
+import { DECIDER_CASES } from './datasets/decider-cases';
+
+vi.mock('./db', async importOriginal => {
+ const actual = await importOriginal();
+ return {
+ ...actual,
+ countCaseResults: vi.fn(),
+ existsNewerCompletedRun: vi.fn(),
+ getCaseResults: vi.fn(),
+ getExistingCaseResultIds: vi.fn(),
+ getRunWithModels: vi.fn(),
+ getSummaries: vi.fn(),
+ markRunCompleted: vi.fn(),
+ replaceModelSummaries: vi.fn(),
+ saveRoutingTable: vi.fn(),
+ upsertCaseResult: vi.fn(),
+ };
+});
+
+vi.mock('./cli-runner', async importOriginal => {
+ const actual = await importOriginal();
+ return {
+ ...actual,
+ destroyDeciderCliContainer: vi.fn(),
+ runDeciderCaseViaCli: vi.fn(),
+ warmUpCliContainer: vi.fn(),
+ };
+});
+
+import {
+ destroyDeciderCliContainer,
+ runDeciderCaseViaCli,
+ warmUpCliContainer,
+ type CliRunResult,
+} from './cli-runner';
+import {
+ countCaseResults,
+ getExistingCaseResultIds,
+ getRunWithModels,
+ upsertCaseResult,
+} from './db';
+import { processJob } from './run';
+
+const tokenGet = vi.fn<() => Promise>();
+const queueSendBatch = vi.fn<(messages: unknown[]) => Promise>();
+const model = 'qwen/qwen3-coder-next';
+const runId = 'decider-test-run';
+const [benchCase] = DECIDER_CASES;
+
+const successfulCliResult = {
+ text: 'not the expected answer',
+ costUsd: null,
+ latencyMs: 25,
+ exitCode: 0,
+ stderrTail: '',
+ eventCount: 1,
+ lastEventTypes: ['session.created'],
+ timedOut: false,
+} satisfies CliRunResult;
+
+const env = {
+ INTERNAL_API_SECRET_PROD: { get: tokenGet },
+ BENCH_DB: {} as D1Database,
+ BENCH_QUEUE: { sendBatch: queueSendBatch },
+ AUTO_ROUTING_CONFIG: { delete: vi.fn() },
+} as unknown as Env;
+
+function mockRunSnapshot(): void {
+ vi.mocked(getRunWithModels).mockResolvedValue({
+ run: {
+ max_concurrency: 4,
+ min_accuracy: 0.7,
+ switch_cost_factor: 3,
+ benchmark_user_id: 'benchmark-user',
+ repetitions: 1,
+ classifier_max_p95_latency_ms: null,
+ started_at: '2026-06-16T00:00:00.000Z',
+ },
+ models: [{ model, enqueued: true, reasoning_effort: null }],
+ } as never);
+}
+
+function deciderMessage() {
+ return {
+ runId,
+ kind: 'decider',
+ model,
+ caseIds: [benchCase.id],
+ chunk: 0,
+ rep: 0,
+ };
+}
+
+beforeEach(() => {
+ vi.clearAllMocks();
+ tokenGet.mockResolvedValue('internal-secret');
+ queueSendBatch.mockResolvedValue(undefined);
+ vi.stubGlobal(
+ 'fetch',
+ vi.fn(async () =>
+ Response.json({ token: 'kilo-user-token', expiresAt: '2026-06-16T01:00:00.000Z' })
+ )
+ );
+ mockRunSnapshot();
+ vi.mocked(countCaseResults).mockResolvedValue(0);
+ vi.mocked(getExistingCaseResultIds).mockResolvedValue(new Set());
+ vi.mocked(destroyDeciderCliContainer).mockResolvedValue(undefined);
+ vi.mocked(warmUpCliContainer).mockResolvedValue(undefined);
+ vi.mocked(runDeciderCaseViaCli).mockResolvedValue(successfulCliResult);
+});
+
+afterEach(() => {
+ vi.unstubAllGlobals();
+});
+
+describe('processJob — decider container availability failures', () => {
+ it.each([
+ 'container /run failed: HTTP 503 There is no Container instance available at this time. This is likely because you have reached your max concurrent instance count.',
+ 'container /run failed: HTTP 503 Maximum number of running container instances exceeded',
+ 'container /run failed: HTTP 503 There is no container instance that can be provided to this Durable Object, try again later',
+ ])('lets the queue retry %s', async message => {
+ vi.mocked(runDeciderCaseViaCli).mockRejectedValueOnce(new Error(message));
+
+ await expect(processJob(env, deciderMessage())).rejects.toThrow(message);
+
+ expect(upsertCaseResult).not.toHaveBeenCalled();
+ expect(countCaseResults).not.toHaveBeenCalled();
+ });
+
+ it('lets the queue retry warmup capacity failures before running cases', async () => {
+ const message =
+ 'container /warmup failed: HTTP 503 There is no Container instance available at this time';
+ vi.mocked(warmUpCliContainer).mockRejectedValueOnce(new Error(message));
+
+ await expect(processJob(env, deciderMessage())).rejects.toThrow(message);
+
+ expect(runDeciderCaseViaCli).not.toHaveBeenCalled();
+ expect(upsertCaseResult).not.toHaveBeenCalled();
+ expect(countCaseResults).not.toHaveBeenCalled();
+ });
+});
+
+describe('processJob — decider chunk chaining', () => {
+ it('runs a chunk on the model-repetition shard container and enqueues the next chunk', async () => {
+ const message = {
+ ...deciderMessage(),
+ caseIds: DECIDER_CASES.slice(0, 5).map(c => c.id),
+ };
+
+ await processJob(env, message);
+
+ expect(warmUpCliContainer).toHaveBeenCalledWith(
+ env,
+ expect.objectContaining({ instanceName: `${runId}:${model}:0:0` })
+ );
+ expect(runDeciderCaseViaCli).toHaveBeenCalledWith(
+ env,
+ expect.objectContaining({ instanceName: `${runId}:${model}:0:0` })
+ );
+ expect(queueSendBatch).toHaveBeenCalledWith([
+ {
+ body: {
+ runId,
+ kind: 'decider',
+ model,
+ chunk: 1,
+ shard: 0,
+ shardCount: 1,
+ rep: 0,
+ caseIds: DECIDER_CASES.slice(5, 10).map(c => c.id),
+ },
+ },
+ ]);
+ expect(countCaseResults).not.toHaveBeenCalled();
+ });
+
+ it('enqueues the next chunk assigned to the same shard lane', async () => {
+ const chunk = 2;
+ const shard = 2;
+ const shardCount = 8;
+ const currentCaseIds = DECIDER_CASES.slice(chunk * 5, chunk * 5 + 5).map(c => c.id);
+ const nextChunk = chunk + shardCount;
+ const nextCaseIds = DECIDER_CASES.slice(nextChunk * 5, nextChunk * 5 + 5).map(c => c.id);
+
+ await processJob(env, {
+ ...deciderMessage(),
+ chunk,
+ shard,
+ shardCount,
+ caseIds: currentCaseIds,
+ });
+
+ expect(warmUpCliContainer).toHaveBeenCalledWith(
+ env,
+ expect.objectContaining({ instanceName: `${runId}:${model}:0:2` })
+ );
+ expect(queueSendBatch).toHaveBeenCalledWith([
+ {
+ body: {
+ runId,
+ kind: 'decider',
+ model,
+ chunk: nextChunk,
+ shard,
+ shardCount,
+ rep: 0,
+ caseIds: nextCaseIds,
+ },
+ },
+ ]);
+ expect(countCaseResults).not.toHaveBeenCalled();
+ });
+
+ it('does not rerun completed chunk cases or enqueue a fully completed next chunk', async () => {
+ const currentCaseIds = DECIDER_CASES.slice(0, 5).map(c => c.id);
+ const nextCaseIds = DECIDER_CASES.slice(5, 10).map(c => c.id);
+ vi.mocked(getExistingCaseResultIds)
+ .mockResolvedValueOnce(new Set(currentCaseIds))
+ .mockResolvedValueOnce(new Set(nextCaseIds));
+
+ await processJob(env, { ...deciderMessage(), caseIds: currentCaseIds });
+
+ expect(warmUpCliContainer).not.toHaveBeenCalled();
+ expect(runDeciderCaseViaCli).not.toHaveBeenCalled();
+ expect(upsertCaseResult).not.toHaveBeenCalled();
+ expect(queueSendBatch).not.toHaveBeenCalled();
+ });
+
+ it('re-enqueues a partially completed next chunk so DLQ leftovers cannot strand a run', async () => {
+ const currentCaseIds = DECIDER_CASES.slice(0, 5).map(c => c.id);
+ const nextCaseIds = DECIDER_CASES.slice(5, 10).map(c => c.id);
+ vi.mocked(getExistingCaseResultIds)
+ .mockResolvedValueOnce(new Set(currentCaseIds))
+ .mockResolvedValueOnce(new Set([nextCaseIds[0]]));
+
+ await processJob(env, { ...deciderMessage(), caseIds: currentCaseIds });
+
+ expect(warmUpCliContainer).not.toHaveBeenCalled();
+ expect(runDeciderCaseViaCli).not.toHaveBeenCalled();
+ expect(upsertCaseResult).not.toHaveBeenCalled();
+ expect(queueSendBatch).toHaveBeenCalledWith([
+ {
+ body: {
+ runId,
+ kind: 'decider',
+ model,
+ chunk: 1,
+ shard: 0,
+ shardCount: 1,
+ rep: 0,
+ caseIds: nextCaseIds,
+ },
+ },
+ ]);
+ });
+
+ it('destroys the model-repetition shard container after the terminal chunk', async () => {
+ const terminalChunk = Math.floor((DECIDER_CASES.length - 1) / 5);
+ const terminalCaseIds = DECIDER_CASES.slice(terminalChunk * 5).map(c => c.id);
+
+ await processJob(env, {
+ ...deciderMessage(),
+ chunk: terminalChunk,
+ shard: 3,
+ shardCount: 4,
+ caseIds: terminalCaseIds,
+ });
+
+ expect(queueSendBatch).not.toHaveBeenCalled();
+ expect(destroyDeciderCliContainer).toHaveBeenCalledWith(env, {
+ instanceName: `${runId}:${model}:0:3`,
+ });
+ expect(countCaseResults).toHaveBeenCalled();
+ });
+
+ it('finalizes terminal chunks even when best-effort container destroy fails', async () => {
+ const terminalChunk = Math.floor((DECIDER_CASES.length - 1) / 5);
+ const terminalCaseIds = DECIDER_CASES.slice(terminalChunk * 5).map(c => c.id);
+ const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+ vi.mocked(destroyDeciderCliContainer).mockRejectedValueOnce(new Error('already stopped'));
+
+ await processJob(env, {
+ ...deciderMessage(),
+ chunk: terminalChunk,
+ shard: 3,
+ shardCount: 4,
+ caseIds: terminalCaseIds,
+ });
+
+ expect(destroyDeciderCliContainer).toHaveBeenCalledWith(env, {
+ instanceName: `${runId}:${model}:0:3`,
+ });
+ expect(warn).toHaveBeenCalledWith(
+ expect.stringContaining('benchmark_container_destroy_failed')
+ );
+ expect(countCaseResults).toHaveBeenCalled();
+ warn.mockRestore();
+ });
+});
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index 9d40b883e0..4c38613658 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -5,7 +5,9 @@ import {
buildClassifierMessages,
buildDeciderMessages,
chunkArray,
+ computeDeciderShardCount,
computeEngineIdentity,
+ getDeciderContainerInstanceName,
runCasesWithConcurrency,
summarize,
} from './run';
@@ -16,7 +18,7 @@ function makeRow(overrides: Partial = {}): CaseResultRow {
run_id: 'run-1',
model: 'model/a',
case_id: 'case-1',
- tier: null,
+ route_key: null,
score: 1,
latency_ms: 100,
cost_usd: 0.001,
@@ -34,12 +36,12 @@ function makeRow(overrides: Partial = {}): CaseResultRow {
}
describe('summarize — classifier kind', () => {
- it('groups all classifier rows under * tier', () => {
+ it('groups all classifier rows under * route key', () => {
const rows: CaseResultRow[] = [
makeRow({
model: 'model/a',
case_id: 'c1',
- tier: null,
+ route_key: null,
score: 1,
latency_ms: 100,
cost_usd: 0.001,
@@ -47,7 +49,7 @@ describe('summarize — classifier kind', () => {
makeRow({
model: 'model/a',
case_id: 'c2',
- tier: null,
+ route_key: null,
score: 0.5,
latency_ms: 200,
cost_usd: 0.002,
@@ -58,7 +60,7 @@ describe('summarize — classifier kind', () => {
expect(summaries).toHaveLength(1);
const [s] = summaries;
expect(s.model).toBe('model/a');
- expect(s.tier).toBe('*');
+ expect(s.routeKey).toBe('*');
expect(s.cases).toBe(2);
});
@@ -123,39 +125,65 @@ describe('summarize — classifier kind', () => {
});
describe('summarize — decider kind', () => {
- it('groups by tier', () => {
+ it('groups by taxonomy route key', () => {
const rows: CaseResultRow[] = [
- makeRow({ model: 'model/a', case_id: 'low-1', tier: 'low', score: 1 }),
- makeRow({ model: 'model/a', case_id: 'low-2', tier: 'low', score: 0 }),
- makeRow({ model: 'model/a', case_id: 'med-1', tier: 'medium', score: 1 }),
- makeRow({ model: 'model/b', case_id: 'low-3', tier: 'low', score: 1 }),
+ makeRow({
+ model: 'model/a',
+ case_id: 'impl-1',
+ route_key: 'implementation/code_generation',
+ score: 1,
+ }),
+ makeRow({
+ model: 'model/a',
+ case_id: 'impl-2',
+ route_key: 'implementation/code_generation',
+ score: 0,
+ }),
+ makeRow({
+ model: 'model/a',
+ case_id: 'debug-1',
+ route_key: 'debugging/bug_fixing',
+ score: 1,
+ }),
+ makeRow({
+ model: 'model/b',
+ case_id: 'impl-3',
+ route_key: 'implementation/code_generation',
+ score: 1,
+ }),
];
const summaries = summarize(rows, 'decider');
expect(summaries).toHaveLength(3);
- const aLow = summaries.find(s => s.model === 'model/a' && s.tier === 'low');
- expect(aLow?.cases).toBe(2);
- expect(aLow?.accuracy).toBe(0.5);
+ const aImpl = summaries.find(
+ s => s.model === 'model/a' && s.routeKey === 'implementation/code_generation'
+ );
+ expect(aImpl?.cases).toBe(2);
+ expect(aImpl?.accuracy).toBe(0.5);
- const aMed = summaries.find(s => s.model === 'model/a' && s.tier === 'medium');
- expect(aMed?.cases).toBe(1);
- expect(aMed?.accuracy).toBe(1);
+ const aDebug = summaries.find(
+ s => s.model === 'model/a' && s.routeKey === 'debugging/bug_fixing'
+ );
+ expect(aDebug?.cases).toBe(1);
+ expect(aDebug?.accuracy).toBe(1);
- const bLow = summaries.find(s => s.model === 'model/b' && s.tier === 'low');
- expect(bLow?.cases).toBe(1);
+ const bImpl = summaries.find(
+ s => s.model === 'model/b' && s.routeKey === 'implementation/code_generation'
+ );
+ expect(bImpl?.cases).toBe(1);
});
- it('uses * fallback when tier is null', () => {
- const rows: CaseResultRow[] = [makeRow({ tier: null, score: 1 })];
+ it('uses * fallback when route key is null', () => {
+ const rows: CaseResultRow[] = [makeRow({ route_key: null, score: 1 })];
const [s] = summarize(rows, 'decider');
- expect(s.tier).toBe('*');
+ expect(s.routeKey).toBe('*');
});
it('computes avgLatencyMs as rounded mean', () => {
const rows: CaseResultRow[] = [
- makeRow({ case_id: 'c1', tier: 'low', latency_ms: 100 }),
- makeRow({ case_id: 'c2', tier: 'low', latency_ms: 301 }),
+ makeRow({ case_id: 'c1', route_key: 'implementation/code_generation', latency_ms: 100 }),
+ makeRow({ case_id: 'c2', route_key: 'implementation/code_generation', latency_ms: 301 }),
];
const [s] = summarize(rows, 'decider');
@@ -163,7 +191,9 @@ describe('summarize — decider kind', () => {
});
it('handles single-element groups for p50', () => {
- const rows: CaseResultRow[] = [makeRow({ tier: 'high', latency_ms: 500 })];
+ const rows: CaseResultRow[] = [
+ makeRow({ route_key: 'implementation/code_generation', latency_ms: 500 }),
+ ];
const [s] = summarize(rows, 'decider');
expect(s.p50LatencyMs).toBe(500);
});
@@ -266,7 +296,7 @@ describe('chunkArray', () => {
describe('pickClassifierWinner', () => {
const summary = (model: string, accuracy: number, avgCostUsd: number | null) => ({
model,
- tier: '*' as const,
+ routeKey: '*' as const,
accuracy,
avgCostUsd,
avgLatencyMs: 100,
@@ -298,9 +328,12 @@ describe('pickClassifierWinner', () => {
expect(winner?.model).toBe('cheap');
});
- it('ignores decider-tier summaries and returns null when nothing is graded', () => {
+ it('ignores decider route summaries and returns null when nothing is graded', () => {
expect(
- pickClassifierWinner([{ ...summary('m', 1, 0.001), tier: 'low' as const }], 0.7)
+ pickClassifierWinner(
+ [{ ...summary('m', 1, 0.001), routeKey: 'implementation/code_generation' as const }],
+ 0.7
+ )
).toBeNull();
expect(pickClassifierWinner([], 0.7)).toBeNull();
});
@@ -313,7 +346,7 @@ describe('pickClassifierWinner', () => {
p95: number | null = 90
) => ({
model,
- tier: '*' as const,
+ routeKey: '*' as const,
accuracy,
avgCostUsd,
avgLatencyMs: 100,
@@ -412,8 +445,7 @@ describe('summarize — p95 and timeouts', () => {
});
describe('decider message fan-out', () => {
- it('DECIDER_CHUNK_SIZE is 5 (chunk count for 76 cases)', () => {
- // DECIDER_CASES = 76, chunk size 5 → ceil(76/5) = 16 chunks
+ it('DECIDER_CHUNK_SIZE is 5', () => {
const chunks = chunkArray(
Array.from({ length: 76 }, (_, i) => String(i)),
5
@@ -429,45 +461,95 @@ describe('decider message fan-out', () => {
kind: 'decider',
model: 'm1',
rep: 2,
+ shard: 1,
+ shardCount: 4,
caseIds: ['a'],
chunk: 0,
});
expect(withRep.rep).toBe(2);
+ expect(withRep.shard).toBe(1);
+ expect(withRep.shardCount).toBe(4);
});
- it('buildDeciderMessages: produces models × reps × ceil(76/5) messages with correct rep', () => {
- // 76 cases, chunk size 5 → 16 chunks
- const cases76 = Array.from({ length: 76 }, (_, i) => ({ id: `case-${i}` }));
- const chunks = chunkArray(cases76, 5);
- expect(chunks).toHaveLength(16);
+ it('computeDeciderShardCount maximizes shard lanes under the live container cap', () => {
+ expect(computeDeciderShardCount({ modelCount: 2, repetitions: 3, chunkCount: 36 })).toBe(16);
+ expect(
+ computeDeciderShardCount({
+ modelCount: 7,
+ repetitions: 1,
+ chunkCount: 36,
+ maxLiveContainers: 100,
+ })
+ ).toBe(14);
+ expect(
+ computeDeciderShardCount({
+ modelCount: 25,
+ repetitions: 1,
+ chunkCount: 36,
+ maxLiveContainers: 100,
+ })
+ ).toBe(4);
+ expect(
+ computeDeciderShardCount({
+ modelCount: 10,
+ repetitions: 3,
+ chunkCount: 36,
+ maxLiveContainers: 100,
+ })
+ ).toBe(3);
+ expect(
+ computeDeciderShardCount({
+ modelCount: 101,
+ repetitions: 1,
+ chunkCount: 36,
+ maxLiveContainers: 100,
+ })
+ ).toBe(0);
+ });
+
+ it('buildDeciderMessages: seeds sharded chunk lanes under the container cap', () => {
+ const cases180 = Array.from({ length: 180 }, (_, i) => ({ id: `case-${i}` }));
+ const chunks = chunkArray(cases180, 5);
+ expect(chunks).toHaveLength(36);
const models = ['model/a', 'model/b'];
const repetitions = 3;
const messages = buildDeciderMessages('run-test', 'decider', models, repetitions, chunks);
+ const expectedShardCount = 16;
- // Total: 2 models × 3 reps × 16 chunks = 96 messages
- expect(messages).toHaveLength(models.length * repetitions * chunks.length);
+ // Initial fan-out is bounded by the 100-container budget while running
+ // multiple independent chunk lanes per model/repetition.
+ expect(messages).toHaveLength(models.length * repetitions * expectedShardCount);
+ expect(messages.length).toBeLessThanOrEqual(100);
- // Each rep index (0..2) should appear exactly models.length × chunks.length times
for (let rep = 0; rep < repetitions; rep++) {
const forRep = messages.filter(m => m.body.rep === rep);
- expect(forRep).toHaveLength(models.length * chunks.length);
+ expect(forRep).toHaveLength(models.length * expectedShardCount);
}
- // Every message carries the correct rep in its body
for (const { body } of messages) {
expect(typeof body.rep).toBe('number');
expect(body.rep).toBeGreaterThanOrEqual(0);
expect(body.rep).toBeLessThan(repetitions);
+ expect(body.shardCount).toBe(expectedShardCount);
+ expect(body.shard).toBeGreaterThanOrEqual(0);
+ expect(body.shard).toBeLessThan(expectedShardCount);
+ expect(body.chunk).toBe(body.shard);
+ expect(body.caseIds).toEqual(chunks[body.shard!]?.map(c => c.id));
}
+ });
- // caseIds on each message match the chunk
- for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
- const forChunk = messages.filter(m => m.body.chunk === chunkIdx);
- for (const { body } of forChunk) {
- expect(body.caseIds).toEqual(chunks[chunkIdx].map(c => c.id));
- }
- }
+ it('getDeciderContainerInstanceName reuses one container per model repetition shard', () => {
+ const base = { runId: 'run-test', kind: 'decider' as const, model: 'model/a', rep: 2 };
+ expect(getDeciderContainerInstanceName({ ...base, chunk: 0, shard: 0 })).toBe(
+ 'run-test:model/a:2:0'
+ );
+ expect(getDeciderContainerInstanceName({ ...base, chunk: 16, shard: 0 })).toBe(
+ 'run-test:model/a:2:0'
+ );
+ expect(getDeciderContainerInstanceName({ ...base, chunk: 1, shard: 1 })).toBe(
+ 'run-test:model/a:2:1'
+ );
});
});
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 326134cdd7..f4aa1dbd26 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -5,6 +5,7 @@ import {
type BenchmarkDeciderModel,
type BenchmarkKind,
type BenchmarkModelSummary,
+ taxonomyRouteKey,
} from '@kilocode/auto-routing-contracts';
import { formatError } from '@kilocode/worker-utils';
import * as z from 'zod';
@@ -16,6 +17,7 @@ import {
countCaseResults,
existsNewerCompletedRun,
getCaseResults,
+ getExistingCaseResultIds,
getLatestSummariesByModel,
getRunningRun,
getRunWithModels,
@@ -33,7 +35,12 @@ import {
import { gradeClassifierOutput, runDeciderCheck } from './grading';
import { createOpenRouterClient } from './openrouter';
import { buildRoutingTable } from './routing-table-builder';
-import { runDeciderCaseViaCli, warmUpCliContainer } from './cli-runner';
+import {
+ destroyDeciderCliContainer,
+ isRetryableContainerAvailabilityError,
+ runDeciderCaseViaCli,
+ warmUpCliContainer,
+} from './cli-runner';
import { pickClassifierWinner } from './winner';
export type BenchmarkJobMessage = {
@@ -41,9 +48,11 @@ export type BenchmarkJobMessage = {
kind: BenchmarkKind;
model: string;
// The case ids this message is responsible for, plus the chunk index. Decider
- // chunks also use this index to key their container instance.
+ // chunks are split across shard lanes; each lane has one stable container.
caseIds?: string[];
chunk?: number;
+ shard?: number;
+ shardCount?: number;
// Repetition index (0-based).
rep?: number;
};
@@ -54,6 +63,8 @@ export const BenchmarkJobMessageSchema = z.object({
model: z.string().min(1),
caseIds: z.array(z.string().min(1)).optional(),
chunk: z.number().int().min(0).optional(),
+ shard: z.number().int().min(0).optional(),
+ shardCount: z.number().int().min(1).optional(),
rep: z.number().int().min(0).optional(),
});
@@ -67,9 +78,13 @@ const DECIDER_CHUNK_SIZE = 5;
// keep it below Cloudflare Queues' 15-minute wall-clock limit.
const CLASSIFIER_CHUNK_SIZE = 1;
-// Cloudflare Queues caps a single sendBatch at 100 messages. A decider fan-out
-// is models × reps × ceil(76 / 5) messages, which clears 100 with as few as two
-// models, so the dispatch must be sliced.
+// Cloudflare Containers cap for the benchmark runner. Sharded decider fan-out
+// uses this as the live-container budget.
+export const DECIDER_CONTAINER_INSTANCE_CAP = 100;
+
+// Cloudflare Queues caps a single sendBatch at 100 messages. Classifier fan-out
+// can exceed that because each classifier case is its own message, so dispatch
+// must be sliced.
const QUEUE_SEND_BATCH_LIMIT = 100;
export function chunkArray(items: readonly T[], size: number): T[][] {
@@ -80,6 +95,24 @@ export function chunkArray(items: readonly T[], size: number): T[][] {
return chunks;
}
+export function computeDeciderShardCount({
+ modelCount,
+ repetitions,
+ chunkCount,
+ maxLiveContainers = DECIDER_CONTAINER_INSTANCE_CAP,
+}: {
+ modelCount: number;
+ repetitions: number;
+ chunkCount: number;
+ maxLiveContainers?: number;
+}): number {
+ if (modelCount <= 0 || repetitions <= 0 || chunkCount <= 0) return 0;
+ const modelRepetitions = modelCount * repetitions;
+ const shardsPerModelRepetition = Math.floor(maxLiveContainers / modelRepetitions);
+ if (shardsPerModelRepetition <= 0) return 0;
+ return Math.min(chunkCount, shardsPerModelRepetition);
+}
+
// Enqueues messages in sendBatch-sized slices. A mid-dispatch failure leaves a
// partially-enqueued run that can never reach its expected result count, so the
// run is marked failed (surfacing in the admin panel) before the throw
@@ -138,36 +171,64 @@ export function computeEngineIdentity(kind: BenchmarkKind): string {
const datasetSignature =
kind === 'classifier'
? CLASSIFIER_CASES.map(c => ({ id: c.id, expected: c.expected }))
- : DECIDER_CASES.map(c => ({ id: c.id, tier: c.tier, check: c.check }));
+ : DECIDER_CASES.map(c => ({
+ id: c.id,
+ taskType: c.taskType,
+ subtaskType: c.subtaskType,
+ check: c.check,
+ }));
return `v${BENCHMARK_ENGINE_VERSION}:${fnv1aHex(JSON.stringify(datasetSignature))}`;
}
-/** Pure helper: produces the sendBatch bodies for a decider run fan-out.
- * Extracted for unit-testability; the shape is models × reps × chunks messages.
+/** Pure helper: produces the initial sendBatch bodies for a decider run.
+ * Extracted for unit-testability; the shape is models × reps × shardCount messages.
+ * Later chunks are chained by processDeciderJob after the previous chunk completes.
*/
export function buildDeciderMessages(
runId: string,
kind: BenchmarkKind,
modelIds: string[],
repetitions: number,
- chunks: readonly (readonly { id: string }[])[]
+ chunks: readonly (readonly { id: string }[])[],
+ maxLiveContainers: number = DECIDER_CONTAINER_INSTANCE_CAP
): { body: BenchmarkJobMessage }[] {
+ const shardCount = computeDeciderShardCount({
+ modelCount: modelIds.length,
+ repetitions,
+ chunkCount: chunks.length,
+ maxLiveContainers,
+ });
+ if (shardCount === 0) return [];
return modelIds.flatMap(model =>
Array.from({ length: repetitions }, (_, rep) =>
- chunks.map((chunkCases, chunk) => ({
- body: {
- runId,
- kind,
- model,
- chunk,
- rep,
- caseIds: chunkCases.map(c => c.id),
- } satisfies BenchmarkJobMessage,
- }))
+ Array.from({ length: shardCount }, (_, shard) => {
+ const chunkCases = chunks[shard];
+ if (!chunkCases) return [];
+ return [
+ {
+ body: {
+ runId,
+ kind,
+ model,
+ chunk: shard,
+ shard,
+ shardCount,
+ rep,
+ caseIds: chunkCases.map(c => c.id),
+ } satisfies BenchmarkJobMessage,
+ },
+ ];
+ }).flat()
).flat()
);
}
+export function getDeciderContainerInstanceName(
+ message: Pick
+): string {
+ return `${message.runId}:${message.model}:${message.rep ?? 0}:${message.shard ?? 0}`;
+}
+
export function buildClassifierMessages(
runId: string,
modelIds: string[],
@@ -202,6 +263,33 @@ export class RunAlreadyActiveError extends Error {
}
}
+// Thrown when the saved benchmark config would exceed a hard runtime limit.
+// The admin route maps it to HTTP 400 so operators can fix config instead of
+// starting a run that will immediately hit platform capacity.
+export class BenchmarkRunConfigError extends Error {
+ constructor(message: string) {
+ super(message);
+ this.name = 'BenchmarkRunConfigError';
+ }
+}
+
+function validateDeciderContainerBudget({
+ modelCount,
+ repetitions,
+ maxLiveContainers,
+}: {
+ modelCount: number;
+ repetitions: number;
+ maxLiveContainers: number;
+}): void {
+ const modelRepetitions = modelCount * repetitions;
+ if (modelRepetitions <= maxLiveContainers) return;
+
+ throw new BenchmarkRunConfigError(
+ `decider benchmark requires at least one live container lane per model repetition (${modelRepetitions}), but maxConcurrency is ${maxLiveContainers}; reduce decider models/repetitions before starting`
+ );
+}
+
export async function startRun(
env: Env,
kind: BenchmarkKind,
@@ -264,6 +352,14 @@ export async function startRun(
'benchmark user not configured: set benchmarkUserId before running the decider benchmark'
);
}
+ const maxLiveDeciderContainers = Math.min(config.maxConcurrency, DECIDER_CONTAINER_INSTANCE_CAP);
+ if (kind === 'decider') {
+ validateDeciderContainerBudget({
+ modelCount: enqueuedModelIds.length,
+ repetitions,
+ maxLiveContainers: maxLiveDeciderContainers,
+ });
+ }
const startedAt = new Date().toISOString();
const runId = `${kind}-${startedAt.replace(/[:.]/g, '-')}`;
@@ -341,10 +437,18 @@ export async function startRun(
return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
}
- // Decider: one message per (model, rep, chunk) so each queue invocation stays
- // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES × repetitions rows.
+ // Decider: seed as many shard lanes as fit under the live-container cap. Each
+ // completed chunk enqueues the next chunk for the same lane, so one stable
+ // container handles chunk N, N+shardCount, N+(2*shardCount), ...
const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
- const messages = buildDeciderMessages(runId, kind, enqueuedModelIds, repetitions, chunks);
+ const messages = buildDeciderMessages(
+ runId,
+ kind,
+ enqueuedModelIds,
+ repetitions,
+ chunks,
+ maxLiveDeciderContainers
+ );
await enqueueRunMessages(env, runId, messages);
return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
}
@@ -367,6 +471,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise {
const message = parsed.data;
const state = await getRunState(env, message.runId);
+ let shouldFinalize = true;
if (message.kind === 'classifier') {
if (!message.caseIds?.length || message.rep === undefined) {
console.warn(
@@ -400,7 +505,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise {
run_id: message.runId,
model: message.model,
case_id: benchCase.id,
- tier: null,
+ route_key: null,
score,
latency_ms: Math.round(performance.now() - startedAt),
cost_usd: result.cost,
@@ -423,10 +528,13 @@ export async function processJob(env: Env, rawMessage: unknown): Promise {
}
);
} else {
- await processDeciderJob(env, message, state);
+ const result = await processDeciderJob(env, message, state);
+ shouldFinalize = result.shouldFinalize;
}
- await finalizeRunIfComplete(env, message.runId, message.kind, state);
+ if (shouldFinalize) {
+ await finalizeRunIfComplete(env, message.runId, message.kind, state);
+ }
}
type RunState = {
@@ -461,15 +569,26 @@ async function processDeciderJob(
env: Env,
message: BenchmarkJobMessage,
state: RunState
-): Promise {
+): Promise<{ shouldFinalize: boolean }> {
// Decider messages always carry their chunk's case ids; anything else is
// malformed and dropped (same policy as unparseable messages).
if (!message.caseIds?.length) {
console.warn(JSON.stringify({ event: 'benchmark_job_missing_case_ids', runId: message.runId }));
- return;
+ return { shouldFinalize: false };
}
const caseIds = new Set(message.caseIds);
const cases = DECIDER_CASES.filter(c => caseIds.has(c.id));
+ if (cases.length === 0) {
+ console.warn(
+ JSON.stringify({
+ event: 'benchmark_job_empty_case_chunk',
+ runId: message.runId,
+ model: message.model,
+ chunk: message.chunk ?? 0,
+ })
+ );
+ return { shouldFinalize: false };
+ }
if (!state.benchmarkUserId) {
// startRun fails fast before enqueueing, so this only happens if the run
@@ -477,83 +596,163 @@ async function processDeciderJob(
throw new Error(`run ${message.runId} has no benchmarkUserId`);
}
- // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
- // queue retries the message. The token is never logged.
- const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId);
const rep = message.rep ?? 0;
- const instanceName = `${message.runId}:${message.model}:${rep}:${message.chunk ?? 0}`;
+ const chunk = message.chunk ?? 0;
+ const shard = message.shard ?? 0;
+ const shardCount = message.shardCount ?? 1;
+ const instanceName = getDeciderContainerInstanceName(message);
+
+ const existingCaseIds = await getExistingCaseResultIds(env.BENCH_DB, {
+ runId: message.runId,
+ model: message.model,
+ rep,
+ caseIds: cases.map(c => c.id),
+ });
+ const casesToRun = cases.filter(c => !existingCaseIds.has(c.id));
// Reasoning effort comes from the run snapshot (run_models row), not live config.
const modelRow = state.models.find(m => m.model === message.model);
const reasoningEffort = modelRow?.reasoning_effort ?? null;
- // Fresh container instances run the CLI's one-time sqlite migration; the
- // container owns that via its /warmup endpoint so the first real case
- // doesn't burn its timeout on it. Failures are non-fatal: the first case
- // simply absorbs whatever warmup work remains.
- await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch(() => {});
-
- // Concurrency 1: the CLI's sqlite state in the container is not safe under
- // concurrent sessions (partial-migration crashes); the container serializes
- // too, so higher concurrency here would only hold HTTP requests open.
- await runCasesWithConcurrency(cases, 1, async benchCase => {
- const startedAt = performance.now();
- try {
- let result = await runDeciderCaseViaCli(env, {
- instanceName,
- model: message.model,
- benchCase,
- kiloToken,
- reasoningEffort,
- });
- // The CLI occasionally ends a session with no assistant text at all
- // (transient empty completion: a lone step_finish with cost 0). Mirror
- // the production classifier's policy and retry once.
- let retried = false;
- if (result.exitCode === 0 && result.text.length === 0) {
- retried = true;
- const retry = await runDeciderCaseViaCli(env, {
+ if (casesToRun.length > 0) {
+ // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
+ // queue retries the message. The token is never logged.
+ const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId);
+
+ // Fresh container instances run the CLI's one-time sqlite migration; the
+ // container owns that via its /warmup endpoint so the first real case
+ // doesn't burn its timeout on it. Ordinary warmup failures are non-fatal:
+ // the first case absorbs whatever warmup work remains. Container capacity
+ // failures are infrastructure pressure, so the queue retries the message.
+ await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch(
+ error => {
+ if (isRetryableContainerAvailabilityError(error)) throw error;
+ }
+ );
+
+ // Concurrency 1: the CLI's sqlite state in the container is not safe under
+ // concurrent sessions (partial-migration crashes); the container serializes
+ // too, so higher concurrency here would only hold HTTP requests open.
+ await runCasesWithConcurrency(casesToRun, 1, async benchCase => {
+ const startedAt = performance.now();
+ try {
+ let result = await runDeciderCaseViaCli(env, {
instanceName,
model: message.model,
benchCase,
kiloToken,
reasoningEffort,
});
- retry.costUsd =
- retry.costUsd === null && result.costUsd === null
- ? null
- : (retry.costUsd ?? 0) + (result.costUsd ?? 0);
- result = retry;
+ // The CLI occasionally ends a session with no assistant text at all
+ // (transient empty completion: a lone step_finish with cost 0). Mirror
+ // the production classifier's policy and retry once.
+ let retried = false;
+ if (result.exitCode === 0 && result.text.length === 0) {
+ retried = true;
+ const retry = await runDeciderCaseViaCli(env, {
+ instanceName,
+ model: message.model,
+ benchCase,
+ kiloToken,
+ reasoningEffort,
+ });
+ retry.costUsd =
+ retry.costUsd === null && result.costUsd === null
+ ? null
+ : (retry.costUsd ?? 0) + (result.costUsd ?? 0);
+ result = retry;
+ }
+ const succeeded =
+ result.exitCode === 0 &&
+ result.text.length > 0 &&
+ runDeciderCheck(benchCase.check, result.text);
+ await upsertCaseResult(env.BENCH_DB, {
+ run_id: message.runId,
+ model: message.model,
+ case_id: benchCase.id,
+ route_key: taxonomyRouteKey(benchCase),
+ score: succeeded ? 1 : 0,
+ latency_ms: result.latencyMs,
+ cost_usd: result.costUsd,
+ error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null,
+ fallback_reason: null,
+ retried,
+ exit_code: result.exitCode,
+ output_prefix: result.text.slice(0, 200),
+ event_count: result.eventCount,
+ last_event_types: result.lastEventTypes.join(' '),
+ rep,
+ timed_out: result.timedOut ? 1 : 0,
+ });
+ } catch (error) {
+ if (isRetryableContainerAvailabilityError(error)) throw error;
+ await upsertCaseResult(
+ env.BENCH_DB,
+ failedRow(message, benchCase.id, taxonomyRouteKey(benchCase), startedAt, error, rep)
+ );
}
- const succeeded =
- result.exitCode === 0 &&
- result.text.length > 0 &&
- runDeciderCheck(benchCase.check, result.text);
- await upsertCaseResult(env.BENCH_DB, {
- run_id: message.runId,
- model: message.model,
- case_id: benchCase.id,
- tier: benchCase.tier,
- score: succeeded ? 1 : 0,
- latency_ms: result.latencyMs,
- cost_usd: result.costUsd,
- error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null,
- fallback_reason: null,
- retried,
- exit_code: result.exitCode,
- output_prefix: result.text.slice(0, 200),
- event_count: result.eventCount,
- last_event_types: result.lastEventTypes.join(' '),
- rep,
- timed_out: result.timedOut ? 1 : 0,
- });
- } catch (error) {
- await upsertCaseResult(
- env.BENCH_DB,
- failedRow(message, benchCase.id, benchCase.tier, startedAt, error, rep)
+ });
+ }
+
+ const hasNextChunk = await enqueueNextDeciderChunkIfNeeded(
+ env,
+ message,
+ rep,
+ chunk,
+ shard,
+ shardCount
+ );
+ if (!hasNextChunk) {
+ await destroyDeciderCliContainer(env, { instanceName }).catch(error => {
+ console.warn(
+ JSON.stringify({
+ event: 'benchmark_container_destroy_failed',
+ instanceName,
+ ...formatError(error),
+ })
);
- }
+ });
+ }
+ return { shouldFinalize: !hasNextChunk };
+}
+
+async function enqueueNextDeciderChunkIfNeeded(
+ env: Env,
+ message: BenchmarkJobMessage,
+ rep: number,
+ chunk: number,
+ shard: number,
+ shardCount: number
+): Promise<boolean> {
+ const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
+ const nextChunkIndex = chunk + shardCount;
+ const nextChunk = chunks[nextChunkIndex];
+ if (!nextChunk) return false;
+
+ const nextCaseIds = nextChunk.map(c => c.id);
+ const existingNextCaseIds = await getExistingCaseResultIds(env.BENCH_DB, {
+ runId: message.runId,
+ model: message.model,
+ rep,
+ caseIds: nextCaseIds,
});
+ if (existingNextCaseIds.size >= nextCaseIds.length) return true;
+
+ await env.BENCH_QUEUE.sendBatch([
+ {
+ body: {
+ runId: message.runId,
+ kind: 'decider',
+ model: message.model,
+ chunk: nextChunkIndex,
+ shard,
+ shardCount,
+ rep,
+ caseIds: nextCaseIds,
+ } satisfies BenchmarkJobMessage,
+ },
+ ]);
+ return true;
}
const TokenResponseSchema = z.object({ token: z.string().min(1), expiresAt: z.string() });
@@ -587,7 +786,7 @@ export async function fetchBenchmarkUserToken(env: Env, userId: string): Promise
function failedRow(
message: BenchmarkJobMessage,
caseId: string,
- tier: string | null,
+ routeKey: string | null,
startedAt: number,
error: unknown,
rep: number = 0
@@ -596,7 +795,7 @@ function failedRow(
run_id: message.runId,
model: message.model,
case_id: caseId,
- tier,
+ route_key: routeKey,
score: 0,
latency_ms: Math.round(performance.now() - startedAt),
cost_usd: null,
@@ -729,13 +928,12 @@ async function finalizeRunIfComplete(
}
export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] {
- // Group by "model tier-key" using a plain reduce so this works in all runtimes.
- // Classifier rows use '*' as the tier (no tiering); decider rows use the actual tier
- // (falling back to '*' when tier is null).
+ // Group by "model route-key" using a plain reduce so this works in all runtimes.
+ // Classifier rows use '*' because classification has no decider taxonomy route.
 const groups = new Map<string, CaseResultRow[]>();
for (const row of rows) {
- const tierKey = kind === 'classifier' ? '*' : (row.tier ?? '*');
- const key = `${row.model}\0${tierKey}`;
+ const routeKey = kind === 'classifier' ? '*' : (row.route_key ?? '*');
+ const key = `${row.model}\0${routeKey}`;
const existing = groups.get(key);
if (existing) {
existing.push(row);
@@ -745,7 +943,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark
}
return [...groups.entries()].map(([key, group]) => {
- const [model, tier] = key.split('\0');
+ const [model, routeKey] = key.split('\0');
const latencies = group.map(r => r.latency_ms).toSorted((a, b) => a - b);
const costs = group.filter(r => r.cost_usd !== null);
const p95LatencyMs =
@@ -755,7 +953,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark
: null;
return {
model,
- tier: tier as BenchmarkModelSummary['tier'],
+ routeKey: routeKey as BenchmarkModelSummary['routeKey'],
accuracy: Number((group.reduce((a, r) => a + r.score, 0) / group.length).toFixed(4)),
avgCostUsd: costs.length
? Number((costs.reduce((a, r) => a + (r.cost_usd ?? 0), 0) / costs.length).toFixed(8))
diff --git a/services/auto-routing-benchmark/src/winner.ts b/services/auto-routing-benchmark/src/winner.ts
index 318809c4a3..952b329bb3 100644
--- a/services/auto-routing-benchmark/src/winner.ts
+++ b/services/auto-routing-benchmark/src/winner.ts
@@ -1,6 +1,6 @@
import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
-// Picks the best classifier candidate from summaries (tier '*') applying:
+// Picks the best classifier candidate from summaries (routeKey '*') applying:
// 1. Accuracy gate: must meet minAccuracy.
// 2. Optional p95 latency gate: when maxP95LatencyMs is non-null, prefer
// candidates whose measured p95 latency is within budget.
@@ -16,7 +16,7 @@ export function pickClassifierWinner(
minAccuracy: number,
maxP95LatencyMs: number | null = null
): BenchmarkModelSummary | null {
- const graded = summaries.filter(s => s.tier === '*' && s.cases > 0);
+ const graded = summaries.filter(s => s.routeKey === '*' && s.cases > 0);
if (graded.length === 0) return null;
const cost = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY;
const p95 = (s: BenchmarkModelSummary) => s.p95LatencyMs ?? Number.POSITIVE_INFINITY;
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index 9faeb19ac4..c0433b1073 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -32,7 +32,7 @@
"class_name": "BenchRunnerContainer",
"image": "./container/Dockerfile",
"instance_type": "standard-2",
- "max_instances": 50,
+ "max_instances": 100,
},
],
"durable_objects": {
@@ -53,8 +53,9 @@
{
"queue": "auto-routing-benchmark-jobs",
"max_batch_size": 1,
- "max_retries": 2,
- "max_concurrency": 4,
+ "max_retries": 6,
+ "retry_delay": 180,
+ "max_concurrency": 100,
"dead_letter_queue": "auto-routing-benchmark-dlq",
},
],
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index fd476a5668..bd89638137 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -254,7 +254,8 @@ function recordDecision(
mode: ctx.payload.mode,
uaPrefix: ctx.payload.userAgent?.slice(0, 40) ?? null,
decidedModel: decision?.model ?? null,
- decidedTier: decision?.tier ?? null,
+ decidedTaskType: decision?.taskType ?? null,
+ decidedSubtaskType: decision?.subtaskType ?? null,
decisionSource: decision?.source ?? null,
sticky: decision?.sticky ?? null,
...summary.details,
diff --git a/services/auto-routing/src/decision-cache.ts b/services/auto-routing/src/decision-cache.ts
index ae98778688..0aed20c63e 100644
--- a/services/auto-routing/src/decision-cache.ts
+++ b/services/auto-routing/src/decision-cache.ts
@@ -82,7 +82,7 @@ function entryKey(contentHash: string, classifierModel: string): string {
// Single per-conversation slot remembering the last model the decision
// engine served, so the session can stay on it (keeping the provider's
-// prompt cache warm) instead of ping-ponging when its tier oscillates.
+// prompt cache warm) instead of ping-ponging when its route oscillates.
// Cannot collide with classification keys, which always contain a ':'.
const STICKY_DECISION_KEY = 'sticky';
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
index b10fcc2e47..ab137ccd47 100644
--- a/services/auto-routing/src/decision-engine.test.ts
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -19,8 +19,8 @@ const table: RoutingTable = {
minAccuracy: 0.7,
switchCostFactor: 3,
source: 'benchmark',
- tiers: {
- low: [
+ routes: {
+ 'implementation/code_generation': [
{
model: 'cheap/chat',
accuracy: 0.85,
@@ -47,7 +47,7 @@ const table: RoutingTable = {
meetsThreshold: false,
},
],
- medium: [
+ 'debugging/bug_fixing': [
{
model: 'mid/chat',
accuracy: 0.8,
@@ -55,7 +55,7 @@ const table: RoutingTable = {
meetsThreshold: true,
},
],
- high: [
+ 'planning_design/system_design': [
{
model: 'big/chat',
accuracy: 0.9,
@@ -67,59 +67,39 @@ const table: RoutingTable = {
};
describe('computeDecision', () => {
- it('picks the first candidate of the tier', () => {
+ it('picks the first candidate of the classifier taxonomy route', () => {
const decision = computeDecision(classification, table, null);
expect(decision).toEqual({
model: 'cheap/chat',
- tier: 'low',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
source: 'benchmark',
tableVersion: 'run-1',
reasoningEffort: null,
sticky: false,
});
});
- it('uses the tier derived from the classification', () => {
- const hard: ClassifierOutput = {
+ it('uses the classifier task type and subtype directly', () => {
+ const debugging: ClassifierOutput = {
...classification,
- reasoningComplexity: 'high',
- contextComplexity: 'large',
- executionMode: 'multi_step_project',
+ taskType: 'debugging',
+ subtaskType: 'bug_fixing',
};
- expect(computeDecision(hard, table, null)?.model).toBe('big/chat');
- });
- it('returns a decision for every tier of a valid table', () => {
- const byTier: Array<[ClassifierOutput, string]> = [
- [classification, 'cheap/chat'],
- [
- { ...classification, reasoningComplexity: 'medium', contextComplexity: 'medium' },
- 'mid/chat',
- ],
- [
- {
- ...classification,
- reasoningComplexity: 'high',
- contextComplexity: 'large',
- executionMode: 'multi_step_project',
- },
- 'big/chat',
- ],
- ];
- for (const [input, expected] of byTier) {
- expect(computeDecision(input, table, null)?.model).toBe(expected);
- }
+ expect(computeDecision(debugging, table, null)?.model).toBe('mid/chat');
});
it('returns null when there is no routing table', () => {
expect(computeDecision(classification, null, null)).toBeNull();
});
describe('session stickiness', () => {
- it('keeps the incumbent on tier de-escalation when it is within the switch-cost factor', () => {
+ it('keeps the incumbent on route changes when it is within the switch-cost factor', () => {
// Fresh pick cheap/chat at 0.002; mid/chat at 0.005 is not cheaper by
// more than 3x (0.002 * 3 = 0.006 >= 0.005), so the session stays put.
const decision = computeDecision(classification, table, 'mid/chat');
expect(decision).toEqual({
model: 'mid/chat',
- tier: 'low',
+ taskType: 'implementation',
+ subtaskType: 'code_generation',
source: 'benchmark',
tableVersion: 'run-1',
// The incumbent's benchmarked effort, not the fresh pick's.
@@ -132,11 +112,19 @@ describe('computeDecision', () => {
// Integer costs avoid float noise on the equality case (1 * 3 === 3).
const boundaryTable: RoutingTable = {
...table,
- tiers: {
- ...table.tiers,
- low: [
- { ...table.tiers.low[0]!, model: 'fresh/chat', avgCostUsd: 1 },
- { ...table.tiers.low[1]!, model: 'incumbent/chat', avgCostUsd: 3 },
+ routes: {
+ ...table.routes,
+ 'implementation/code_generation': [
+ {
+ ...table.routes['implementation/code_generation'][0]!,
+ model: 'fresh/chat',
+ avgCostUsd: 1,
+ },
+ {
+ ...table.routes['implementation/code_generation'][1]!,
+ model: 'incumbent/chat',
+ avgCostUsd: 3,
+ },
],
},
};
@@ -148,11 +136,11 @@ describe('computeDecision', () => {
const decision = computeDecision(classification, table, 'pricey/chat');
expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
});
- it('switches when the incumbent no longer meets the tier threshold', () => {
+ it('switches when the incumbent no longer meets the route threshold', () => {
const decision = computeDecision(classification, table, 'weak/chat');
expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
});
- it('serves the fresh pick when the incumbent is not in the tier', () => {
+ it('serves the fresh pick when the incumbent is not in the route', () => {
const decision = computeDecision(classification, table, 'gone/model');
expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
});
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
index 0d641e069d..aaa7aba542 100644
--- a/services/auto-routing/src/decision-engine.ts
+++ b/services/auto-routing/src/decision-engine.ts
@@ -1,5 +1,5 @@
import {
- deriveDifficultyTier,
+ taxonomyRouteKey,
type AutoRoutingDecision,
type ClassifierOutput,
type RoutingTable,
@@ -11,14 +11,13 @@ export function computeDecision(
incumbentModel: string | null
): AutoRoutingDecision | null {
if (!table) return null;
- const tier = deriveDifficultyTier(classification);
- const candidates = table.tiers[tier];
- // A parsed table guarantees a non-empty tier (schema .min(1)), so with a
- // table and a classification a decision always exists.
+ const routeKey = taxonomyRouteKey(classification);
+ const candidates = table.routes[routeKey];
+ if (!candidates?.length) return null;
const freshPick = candidates[0];
// Keep the session on its incumbent model when it is still good enough for
- // the current tier. A model switch discards the provider's prompt cache,
+ // the current taxonomy route. A model switch discards the provider's prompt cache,
// and rebuilding it costs full-price input tokens (4-10x cache-read rates)
// on a context that dominates agent-session spend — so a switch is only
// worth it when the fresh pick's recurring per-turn savings clearly exceed
@@ -33,7 +32,8 @@ export function computeDecision(
) {
return {
model: incumbent.model,
- tier,
+ taskType: classification.taskType,
+ subtaskType: classification.subtaskType,
source: table.source,
tableVersion: table.version,
reasoningEffort: incumbent.reasoningEffort ?? null,
@@ -43,7 +43,8 @@ export function computeDecision(
return {
model: freshPick.model,
- tier,
+ taskType: classification.taskType,
+ subtaskType: classification.subtaskType,
source: table.source,
tableVersion: table.version,
reasoningEffort: freshPick.reasoningEffort ?? null,
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 4519c7c310..220d443fbb 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -87,17 +87,15 @@ const benchmarkRoutingTable = {
minAccuracy: 0.7,
switchCostFactor: 3,
source: 'benchmark',
- tiers: {
- low: [
+ routes: {
+ 'implementation/feature_development': [
{
model: 'google/gemini-2.5-flash-lite',
accuracy: 0.9,
- avgCostUsd: 0.001,
+ avgCostUsd: 0.002,
meetsThreshold: true,
reasoningEffort: null,
},
- ],
- medium: [
{
model: 'google/gemini-2.5-flash',
accuracy: 0.85,
@@ -105,9 +103,9 @@ const benchmarkRoutingTable = {
meetsThreshold: true,
reasoningEffort: null,
},
- // The high-tier model also qualifies for medium, within the 3x
+ // The planning route's model also qualifies for implementation, within the 3x
// switch-cost factor of the fresh pick (0.002 * 3 >= 0.005): a session
- // de-escalating from high stays on it.
+ // moving routes stays on it.
{
model: 'anthropic/claude-sonnet-4.6',
accuracy: 0.8,
@@ -116,7 +114,7 @@ const benchmarkRoutingTable = {
reasoningEffort: null,
},
],
- high: [
+ 'planning_design/system_design': [
{
model: 'anthropic/claude-sonnet-4.6',
accuracy: 0.8,
@@ -235,7 +233,8 @@ describe('auto routing worker', () => {
cost: 0.00000123,
decision: {
model: expect.any(String),
- tier: expect.stringMatching(/^(low|medium|high)$/),
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'bench-run-1',
reasoningEffort: null,
@@ -300,7 +299,8 @@ describe('auto routing worker', () => {
cost: 0,
decision: {
model: expect.any(String),
- tier: expect.stringMatching(/^(low|medium|high)$/),
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'bench-run-1',
reasoningEffort: null,
@@ -331,7 +331,7 @@ describe('auto routing worker', () => {
);
});
- it('keeps the session on the incumbent model when the tier de-escalates', async () => {
+ it('keeps the session on the incumbent model when the taxonomy route changes', async () => {
// Back the mocked DO stub with real storage so the sticky model written
// by the first request is visible to the second.
const store = new Map();
@@ -344,19 +344,24 @@ describe('auto routing worker', () => {
...mockClassifierResult,
classification: {
...mockClassification,
- reasoningComplexity: 'high',
- contextComplexity: 'large',
- executionMode: 'multi_step_project',
+ taskType: 'planning_design',
+ subtaskType: 'system_design',
},
});
const first = await decideRequest(mirrorPayload());
expect(first.status).toBe(200);
await expect(first.json()).resolves.toMatchObject({
- decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'high', sticky: false },
+ decision: {
+ model: 'anthropic/claude-sonnet-4.6',
+ taskType: 'planning_design',
+ subtaskType: 'system_design',
+ sticky: false,
+ },
});
+ store.set('sticky', { model: 'anthropic/claude-sonnet-4.6' });
- // The second turn (different prompt, same session) classifies as medium.
- // The fresh medium pick is cheaper, but not by more than the switch-cost
+ // The second turn (different prompt, same session) classifies to a cheaper route.
+ // The fresh implementation pick is cheaper, but not by more than the switch-cost
// factor, so the session keeps its incumbent.
const second = await decideRequest(
mirrorPayload({
@@ -365,7 +370,12 @@ describe('auto routing worker', () => {
);
expect(second.status).toBe(200);
await expect(second.json()).resolves.toMatchObject({
- decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'medium', sticky: true },
+ decision: {
+ model: 'anthropic/claude-sonnet-4.6',
+ taskType: 'implementation',
+ subtaskType: 'feature_development',
+ sticky: true,
+ },
});
});
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
index be60e909ab..9b73d29235 100644
--- a/services/auto-routing/src/routing-table.test.ts
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -8,8 +8,8 @@ const SAMPLE_TABLE: RoutingTable = {
minAccuracy: 0.7,
switchCostFactor: 3,
source: 'benchmark',
- tiers: {
- low: [
+ routes: {
+ 'implementation/feature_development': [
{
model: 'google/gemini-2.5-flash-lite',
accuracy: 0.9,
@@ -18,7 +18,7 @@ const SAMPLE_TABLE: RoutingTable = {
reasoningEffort: null,
},
],
- medium: [
+ 'debugging/bug_fixing': [
{
model: 'google/gemini-2.5-flash',
accuracy: 0.85,
@@ -27,7 +27,7 @@ const SAMPLE_TABLE: RoutingTable = {
reasoningEffort: null,
},
],
- high: [
+ 'planning_design/system_design': [
{
model: 'anthropic/claude-sonnet-4.6',
accuracy: 0.8,