diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
index 11a8a6a0e3..8bc45f6019 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
@@ -63,6 +63,7 @@ describe('configToFormState', () => {
     expect(state.classifierMaxP95LatencyMs).toBe('1000');
     expect(state.classifierModels).toBe('');
     expect(state.deciderModels).toEqual([]);
+    expect(state.maxConcurrency).toBe(100);
   });
 });
 
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
index 9bdfac18ba..312e44e602 100644
--- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -126,7 +126,7 @@ export function configToFormState(config: BenchmarkConfig | null): {
       deciderModels: [],
       minAccuracy: 0.7,
       switchCostFactor: 3,
-      maxConcurrency: 4,
+      maxConcurrency: 100,
       benchmarkUserId: '',
       classifierRepetitions: 1,
       deciderRepetitions: 1,
@@ -407,13 +407,13 @@ function BenchmarkConfigEditor({
           </div>
           <div className="flex flex-col gap-1.5">
             <Label htmlFor="benchmark-max-concurrency" className="text-sm font-medium">
-              Max concurrency (1–16)
+              Max concurrency (1–100)
             </Label>
             <Input
               id="benchmark-max-concurrency"
               type="number"
               min={1}
-              max={16}
+              max={100}
               step={1}
               value={form.maxConcurrency}
               onChange={e =>
@@ -539,17 +539,13 @@ function BenchmarkConfigEditor({
 // Run summaries expandable table
 // ---------------------------------------------------------------------------
 
-const TIER_ORDER = { low: 0, medium: 1, high: 2, '*': 3 } as const;
-
 function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
   const isDecider = run.kind === 'decider';
 
   const sortedSummaries: BenchmarkModelSummary[] = isDecider
     ? [...run.summaries].sort((a, b) => {
-        const tierDiff =
-          (TIER_ORDER[a.tier as keyof typeof TIER_ORDER] ?? 3) -
-          (TIER_ORDER[b.tier as keyof typeof TIER_ORDER] ?? 3);
-        if (tierDiff !== 0) return tierDiff;
+        const routeDiff = a.routeKey.localeCompare(b.routeKey);
+        if (routeDiff !== 0) return routeDiff;
         return b.accuracy - a.accuracy;
       })
     : run.summaries;
@@ -571,7 +567,7 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
               <TableHeader>
                 <TableRow>
                   <TableHead className="text-xs">Model</TableHead>
-                  {isDecider ? <TableHead className="text-xs">Tier</TableHead> : null}
+                  {isDecider ? <TableHead className="text-xs">Route</TableHead> : null}
                   <TableHead className="text-right text-xs">Accuracy</TableHead>
                   <TableHead className="text-right text-xs">Avg cost</TableHead>
                   <TableHead className="text-right text-xs">Avg latency</TableHead>
@@ -584,10 +580,10 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
               </TableHeader>
               <TableBody>
                 {sortedSummaries.map((s, i) => (
-                  <TableRow key={`${s.model}-${s.tier}-${i}`}>
+                  <TableRow key={`${s.model}-${s.routeKey}-${i}`}>
                     <TableCell className="max-w-56 truncate font-mono text-xs">{s.model}</TableCell>
                     {isDecider ? (
-                      <TableCell className="text-xs capitalize">{s.tier}</TableCell>
+                      <TableCell className="font-mono text-xs">{s.routeKey}</TableCell>
                     ) : null}
                     <TableCell className="text-right tabular-nums text-xs">
                       {formatAccuracy(s.accuracy)}
@@ -717,11 +713,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
   }
 
   const { table } = data;
-  const tierEntries = [
-    { tier: 'low', candidates: table.tiers.low },
-    { tier: 'medium', candidates: table.tiers.medium },
-    { tier: 'high', candidates: table.tiers.high },
-  ] as const;
+  const routeEntries = Object.entries(table.routes).sort(([a], [b]) => a.localeCompare(b));
 
   return (
     <div className="flex flex-col gap-3">
@@ -736,9 +728,9 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
         </span>
       </div>
 
-      {tierEntries.map(({ tier, candidates }) => (
-        <div key={tier}>
-          <p className="text-sm font-medium capitalize mb-1.5">{tier} tier</p>
+      {routeEntries.map(([routeKey, candidates]) => (
+        <div key={routeKey}>
+          <p className="mb-1.5 font-mono text-sm font-medium">{routeKey}</p>
           <div className="overflow-x-auto rounded-md border">
             <Table className="min-w-max">
               <TableHeader>
@@ -751,7 +743,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
               </TableHeader>
               <TableBody>
                 {candidates.map((c, i) => (
-                  <TableRow key={`${tier}-${c.model}-${i}`}>
+                  <TableRow key={`${routeKey}-${c.model}-${i}`}>
                     <TableCell className="max-w-56 truncate font-mono text-xs">{c.model}</TableCell>
                     <TableCell className="text-right tabular-nums text-xs">
                       {formatAccuracy(c.accuracy)}
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
index bb7b22ded3..a82a0e8cfe 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.test.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
@@ -447,7 +447,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'v1',
         sticky: false,
@@ -481,7 +482,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark' as const,
         tableVersion: 'v1',
         sticky: false,
@@ -510,7 +512,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'v1',
         sticky: false,
@@ -560,7 +563,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'v1',
         sticky: false,
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
index f241c5f222..15235c3730 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -25,7 +25,8 @@ const zeroBalancePromise = Promise.resolve(0);
 
 const sampleDecision: AutoRoutingDecision = {
   model: 'anthropic/claude-haiku-4',
-  tier: 'low',
+  taskType: 'implementation',
+  subtaskType: 'feature_development',
   source: 'benchmark',
   tableVersion: 'v1',
   sticky: false,
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
index 70d8e7e0c6..52daf63cc8 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -47,7 +47,8 @@ const options = {
 
 const validDecision = {
   model: 'anthropic/claude-haiku-4',
-  tier: 'low' as const,
+  taskType: 'implementation' as const,
+  subtaskType: 'feature_development' as const,
   source: 'benchmark' as const,
   tableVersion: 'v1',
   sticky: false,
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
index 8409b7f743..a696ac3063 100644
--- a/packages/auto-routing-contracts/src/benchmark.ts
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -1,9 +1,10 @@
 import * as z from 'zod';
 import { RoutingTableSchema } from './routing-table';
-import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import { TaxonomyRouteKeySchema } from './taxonomy';
 
-export { ReasoningEffortSchema } from './tiers';
-export type { ReasoningEffort } from './tiers';
+export { ReasoningEffortSchema } from './reasoning';
+export type { ReasoningEffort } from './reasoning';
 
 export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
 export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
@@ -39,15 +40,16 @@ export const BenchmarkConfigSchema = z
   .object({
     classifierModels: z.array(z.string().trim().min(1)).min(1),
     deciderModels: z.array(BenchmarkDeciderModelSchema).min(1),
-    // Accuracy threshold for "gets the job done" (per tier).
+    // Accuracy threshold for "gets the job done" (per taxonomy route).
     minAccuracy: z.number().min(0).max(1),
-    // Parallel OpenRouter calls per queue message.
-    maxConcurrency: z.number().int().min(1).max(16),
+    // Benchmark-wide parallelism budget. Decider runs use it as a live
+    // container budget; classifier runs use it for parallel OpenRouter calls.
+    maxConcurrency: z.number().int().min(1).max(100),
     // The Kilo user whose identity/billing the decider CLI runs execute under.
     // Null until an admin configures it; decider runs fail fast while null.
     benchmarkUserId: z.string().trim().min(1).nullable(),
     // Session stickiness knob carried into published routing tables: a session
-    // stays on its incumbent model while it meets the tier's accuracy
+    // stays on its incumbent model while it meets the route's accuracy
     // threshold, unless the fresh pick is cheaper by more than this factor.
     // Model switches discard provider prompt caches (cache reads are far
     // cheaper than fresh input tokens), so switching only pays off when the
@@ -79,8 +81,8 @@ export type BenchmarkRunStatus = z.infer<typeof BenchmarkRunStatusSchema>;
 
 export const BenchmarkModelSummarySchema = z.object({
   model: z.string(),
-  // '*' for classifier runs (no tiering), otherwise the difficulty tier.
-  tier: z.union([DifficultyTierSchema, z.literal('*')]),
+  // '*' for classifier runs, otherwise "<taskType>/<subtaskType>".
+  routeKey: z.union([TaxonomyRouteKeySchema, z.literal('*')]),
   accuracy: z.number(),
   avgCostUsd: z.number().nullable(),
   avgLatencyMs: z.number(),
diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts
index 0c826251dc..963875f812 100644
--- a/packages/auto-routing-contracts/src/contracts.test.ts
+++ b/packages/auto-routing-contracts/src/contracts.test.ts
@@ -147,6 +147,34 @@ describe('BenchmarkConfigSchema defaults', () => {
     expect(result.deciderRepetitions).toBe(1);
     expect(result.classifierMaxP95LatencyMs).toBe(1000);
   });
+
+  // Upper boundary of maxConcurrency: 100 is the largest accepted value.
+  it('accepts the benchmark maximum concurrency cap of 100', () => {
+    const result = BenchmarkConfigSchema.safeParse({
+      classifierModels: ['model/a'],
+      deciderModels: [{ id: 'model/b' }],
+      minAccuracy: 0.8,
+      maxConcurrency: 100,
+      benchmarkUserId: null,
+      switchCostFactor: 2,
+      updatedAt: null,
+      updatedBy: null,
+    });
+    expect(result.success).toBe(true);
+  });
+
+  // Same payload one step past the cap; only maxConcurrency differs from the
+  // accepted case above, so a failure here is attributable to the cap alone.
+  it('rejects a maximum concurrency above the cap of 100', () => {
+    const result = BenchmarkConfigSchema.safeParse({
+      classifierModels: ['model/a'],
+      deciderModels: [{ id: 'model/b' }],
+      minAccuracy: 0.8,
+      maxConcurrency: 101,
+      benchmarkUserId: null,
+      switchCostFactor: 2,
+      updatedAt: null,
+      updatedBy: null,
+    });
+    expect(result.success).toBe(false);
+  });
 });
 
 describe('BenchmarkConfigSchema duplicate model ids', () => {
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index 31915439ec..aeb55bf7b2 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -1,6 +1,12 @@
 import * as z from 'zod';
 import { NormalizedClassifierInputSchema } from './input';
-import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import {
+  ClassifierSubtaskTypeSchema,
+  ClassifierTaskTypeSchema,
+  SUBTYPES_BY_TASK_TYPE,
+  type ClassifierSubtaskType,
+} from './taxonomy';
 
 export {
   NormalizedClassifierInputSchema,
@@ -29,47 +35,6 @@ export const MirrorPayloadSchema = z.object({
 });
 export type MirrorPayload = z.infer<typeof MirrorPayloadSchema>;
 
-export const ClassifierTaskTypeSchema = z.enum([
-  'implementation',
-  'debugging',
-  'refactoring',
-  'planning_design',
-  'investigation',
-  'agentic_execution',
-]);
-export type ClassifierTaskType = z.infer<typeof ClassifierTaskTypeSchema>;
-
-export const ClassifierSubtaskTypeSchema = z.enum([
-  'feature_development',
-  'code_generation',
-  'test_creation',
-  'bug_fixing',
-  'test_repair',
-  'root_cause_analysis',
-  'code_cleanup',
-  'architecture_improvement',
-  'migration',
-  'architecture_design',
-  'technical_planning',
-  'system_design',
-  'repo_exploration',
-  'codebase_understanding',
-  'external_research',
-  'tool_usage',
-  'terminal_operations',
-  'multi_step_execution',
-]);
-export type ClassifierSubtaskType = z.infer<typeof ClassifierSubtaskTypeSchema>;
-
-const subtypesByTaskType: Record<ClassifierTaskType, readonly ClassifierSubtaskType[]> = {
-  implementation: ['feature_development', 'code_generation', 'test_creation'],
-  debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'],
-  refactoring: ['code_cleanup', 'architecture_improvement', 'migration'],
-  planning_design: ['architecture_design', 'technical_planning', 'system_design'],
-  investigation: ['repo_exploration', 'codebase_understanding', 'external_research'],
-  agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'],
-};
-
 export const ClassifierOutputSchema = z
   .strictObject({
     taskType: ClassifierTaskTypeSchema,
@@ -87,7 +52,10 @@ export const ClassifierOutputSchema = z
     confidence: z.number().min(0).max(1),
   })
   .superRefine((output, ctx) => {
-    if (!subtypesByTaskType[output.taskType].includes(output.subtaskType)) {
+    const allowedSubtypes = SUBTYPES_BY_TASK_TYPE[
+      output.taskType
+    ] as readonly ClassifierSubtaskType[];
+    if (!allowedSubtypes.includes(output.subtaskType)) {
       ctx.addIssue({
         code: 'custom',
         path: ['subtaskType'],
@@ -99,7 +67,8 @@ export type ClassifierOutput = z.infer<typeof ClassifierOutputSchema>;
 
 export const AutoRoutingDecisionSchema = z.object({
   model: z.string(),
-  tier: DifficultyTierSchema,
+  taskType: ClassifierTaskTypeSchema,
+  subtaskType: ClassifierSubtaskTypeSchema,
   source: z.enum(['benchmark']),
   tableVersion: z.string(),
   // Mirrors the effort the chosen model was benchmarked with, when set.
@@ -180,6 +149,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer<
 
 export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize';
 
-export * from './tiers';
+export * from './reasoning';
+export * from './taxonomy';
 export * from './routing-table';
 export * from './benchmark';
diff --git a/packages/auto-routing-contracts/src/reasoning.ts b/packages/auto-routing-contracts/src/reasoning.ts
new file mode 100644
index 0000000000..a989853d1c
--- /dev/null
+++ b/packages/auto-routing-contracts/src/reasoning.ts
@@ -0,0 +1,5 @@
+import * as z from 'zod';
+
+// Allowed reasoning-effort levels, with the matching inferred union type.
+export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
+export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts
index edcd573b44..a4830ce117 100644
--- a/packages/auto-routing-contracts/src/routing-table.test.ts
+++ b/packages/auto-routing-contracts/src/routing-table.test.ts
@@ -9,12 +9,16 @@ const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({
 });
 
 describe('rankCandidates', () => {
-  it('puts the cheapest above-threshold candidate first', () => {
+  it('puts the lowest cost-per-accuracy above-threshold candidate first', () => {
     const ranked = rankCandidates(
-      [candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)],
+      [
+        candidate('lower-raw-cost', 0.7, 0.007),
+        candidate('better-value', 0.9, 0.008),
+        candidate('weak', 0.5, 0.001),
+      ],
       0.7
     );
-    expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']);
+    expect(ranked.map(c => c.model)).toEqual(['better-value', 'lower-raw-cost', 'weak']);
     expect(ranked[0].meetsThreshold).toBe(true);
     expect(ranked[2].meetsThreshold).toBe(false);
   });
@@ -29,15 +33,35 @@ describe('rankCandidates', () => {
 });
 
 describe('RoutingTableSchema', () => {
-  it('requires at least one candidate per tier', () => {
+  it('requires at least one candidate per taxonomy route', () => {
     expect(
       RoutingTableSchema.safeParse({
         version: 'v',
         generatedAt: new Date(0).toISOString(),
         minAccuracy: 0.7,
+        switchCostFactor: 3,
         source: 'benchmark',
-        tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] },
+        routes: {
+          'implementation/code_generation': [],
+          'debugging/bug_fixing': [candidate('m', 1, 1)],
+        },
       }).success
     ).toBe(false);
   });
+
+  it('accepts a table routed by classifier taxonomy pair', () => {
+    const parsed = RoutingTableSchema.parse({
+      version: 'v',
+      generatedAt: new Date(0).toISOString(),
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      source: 'benchmark',
+      routes: {
+        'implementation/code_generation': [candidate('impl', 0.9, 1)],
+        'debugging/bug_fixing': [candidate('debug', 0.9, 1)],
+      },
+    });
+
+    expect(parsed.routes['implementation/code_generation']?.[0]?.model).toBe('impl');
+  });
 });
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
index ff49e81578..0a1db0c0a5 100644
--- a/packages/auto-routing-contracts/src/routing-table.ts
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -1,9 +1,10 @@
 import * as z from 'zod';
-import { ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import { TaxonomyRouteKeySchema } from './taxonomy';
 
 export const RankedCandidateSchema = z.object({
   model: z.string().trim().min(1),
-  // Benchmark accuracy in [0, 1] for this tier.
+  // Benchmark accuracy in [0, 1] for this taxonomy route.
   accuracy: z.number().min(0).max(1),
   // Average observed OpenRouter cost per benchmark case, in USD credits.
   avgCostUsd: z.number().nonnegative(),
@@ -23,19 +24,25 @@ export const RoutingTableSchema = z.object({
   // more than this factor (see BenchmarkConfigSchema.switchCostFactor).
   switchCostFactor: z.number().min(1),
   source: z.enum(['benchmark']),
-  tiers: z.object({
-    low: z.array(RankedCandidateSchema).min(1),
-    medium: z.array(RankedCandidateSchema).min(1),
-    high: z.array(RankedCandidateSchema).min(1),
+  routes: z.record(z.string(), z.array(RankedCandidateSchema).min(1)).superRefine((routes, ctx) => {
+    for (const key of Object.keys(routes)) {
+      if (!TaxonomyRouteKeySchema.safeParse(key).success) {
+        ctx.addIssue({
+          code: 'custom',
+          path: [key],
+          message: `Unknown taxonomy route ${key}`,
+        });
+      }
+    }
   }),
 });
 export type RoutingTable = z.infer<typeof RoutingTableSchema>;
 
 export const ROUTING_TABLE_KV_KEY = 'routing_table_v1';
 
-// "Best bang for buck": candidates meeting the accuracy threshold come
-// first, cheapest first (accuracy breaks ties); below-threshold candidates
-// follow ordered by accuracy so a degenerate table still routes sensibly.
+// "Best bang for buck": candidates meeting the accuracy threshold come first,
+// lowest cost per unit of accuracy first; below-threshold candidates follow
+// ordered by accuracy so a degenerate table still routes sensibly.
 export function rankCandidates(
   candidates: ReadonlyArray<Omit<RankedCandidate, 'meetsThreshold'> & { meetsThreshold?: boolean }>,
   minAccuracy: number
@@ -44,7 +51,7 @@ export function rankCandidates(
   return flagged.toSorted((a, b) => {
     if (a.meetsThreshold !== b.meetsThreshold) return a.meetsThreshold ? -1 : 1;
     if (a.meetsThreshold) {
-      return a.avgCostUsd - b.avgCostUsd || b.accuracy - a.accuracy;
+      return a.avgCostUsd / a.accuracy - b.avgCostUsd / b.accuracy || b.accuracy - a.accuracy;
     }
     return b.accuracy - a.accuracy || a.avgCostUsd - b.avgCostUsd;
   });
diff --git a/packages/auto-routing-contracts/src/taxonomy.ts b/packages/auto-routing-contracts/src/taxonomy.ts
new file mode 100644
index 0000000000..bb5fa70c62
--- /dev/null
+++ b/packages/auto-routing-contracts/src/taxonomy.ts
@@ -0,0 +1,106 @@
+import * as z from 'zod';
+
+/** Task types the classifier can assign; the values behind `ClassifierTaskTypeSchema`. */
+export const CLASSIFIER_TASK_TYPES = [
+  'implementation',
+  'debugging',
+  'refactoring',
+  'planning_design',
+  'investigation',
+  'agentic_execution',
+] as const;
+
+/** Subtask types across all task types; the values behind `ClassifierSubtaskTypeSchema`. */
+export const CLASSIFIER_SUBTASK_TYPES = [
+  'feature_development',
+  'code_generation',
+  'test_creation',
+  'bug_fixing',
+  'test_repair',
+  'root_cause_analysis',
+  'code_cleanup',
+  'architecture_improvement',
+  'migration',
+  'architecture_design',
+  'technical_planning',
+  'system_design',
+  'repo_exploration',
+  'codebase_understanding',
+  'external_research',
+  'tool_usage',
+  'terminal_operations',
+  'multi_step_execution',
+] as const;
+
+/**
+ * Subtask types that are valid under each task type.
+ *
+ * `satisfies` keeps the literal tuple types produced by `as const` while making
+ * the compiler reject a missing task type, an unknown task type, or a subtask
+ * that is not in CLASSIFIER_SUBTASK_TYPES.
+ */
+export const SUBTYPES_BY_TASK_TYPE = {
+  implementation: ['feature_development', 'code_generation', 'test_creation'],
+  debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'],
+  refactoring: ['code_cleanup', 'architecture_improvement', 'migration'],
+  planning_design: ['architecture_design', 'technical_planning', 'system_design'],
+  investigation: ['repo_exploration', 'codebase_understanding', 'external_research'],
+  agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'],
+} as const satisfies Record<
+  (typeof CLASSIFIER_TASK_TYPES)[number],
+  readonly (typeof CLASSIFIER_SUBTASK_TYPES)[number][]
+>;
+
+// Every "<taskType>/<subtaskType>" pair permitted by SUBTYPES_BY_TASK_TYPE.
+type AllowedRouteKey = {
+  [T in keyof typeof SUBTYPES_BY_TASK_TYPE]: `${T}/${(typeof SUBTYPES_BY_TASK_TYPE)[T][number]}`;
+}[keyof typeof SUBTYPES_BY_TASK_TYPE];
+
+/**
+ * Route keys in "<taskType>/<subtaskType>" form; the values behind `TaxonomyRouteKeySchema`.
+ * The `satisfies` clause rejects any entry that is not a pair allowed by
+ * SUBTYPES_BY_TASK_TYPE. It does not detect a pair that is missing from this list.
+ */
+export const TAXONOMY_ROUTE_KEYS = [
+  'implementation/feature_development',
+  'implementation/code_generation',
+  'implementation/test_creation',
+  'debugging/bug_fixing',
+  'debugging/test_repair',
+  'debugging/root_cause_analysis',
+  'refactoring/code_cleanup',
+  'refactoring/architecture_improvement',
+  'refactoring/migration',
+  'planning_design/architecture_design',
+  'planning_design/technical_planning',
+  'planning_design/system_design',
+  'investigation/repo_exploration',
+  'investigation/codebase_understanding',
+  'investigation/external_research',
+  'agentic_execution/tool_usage',
+  'agentic_execution/terminal_operations',
+  'agentic_execution/multi_step_execution',
+] as const satisfies readonly AllowedRouteKey[];
+
+export const ClassifierTaskTypeSchema = z.enum(CLASSIFIER_TASK_TYPES);
+export type ClassifierTaskType = z.infer<typeof ClassifierTaskTypeSchema>;
+
+export const ClassifierSubtaskTypeSchema = z.enum(CLASSIFIER_SUBTASK_TYPES);
+export type ClassifierSubtaskType = z.infer<typeof ClassifierSubtaskTypeSchema>;
+
+export const TaxonomyRouteKeySchema = z.enum(TAXONOMY_ROUTE_KEYS);
+export type TaxonomyRouteKey = z.infer<typeof TaxonomyRouteKeySchema>;
+
+/**
+ * Builds the "<taskType>/<subtaskType>" key for a task/subtask pair.
+ *
+ * The two fields are typed independently and the result is cast, so a pair that
+ * SUBTYPES_BY_TASK_TYPE does not allow still yields a string typed as
+ * TaxonomyRouteKey even though it is absent from TAXONOMY_ROUTE_KEYS.
+ */
+export function taxonomyRouteKey(params: {
+  taskType: ClassifierTaskType;
+  subtaskType: ClassifierSubtaskType;
+}): TaxonomyRouteKey {
+  return `${params.taskType}/${params.subtaskType}` as TaxonomyRouteKey;
+}
diff --git a/packages/auto-routing-contracts/src/tiers.test.ts b/packages/auto-routing-contracts/src/tiers.test.ts
deleted file mode 100644
index 5d62f7259f..0000000000
--- a/packages/auto-routing-contracts/src/tiers.test.ts
+++ /dev/null
@@ -1,79 +0,0 @@
-import { describe, expect, it } from 'vitest';
-import { deriveDifficultyTier } from './tiers';
-import type { ClassifierOutput } from './index';
-
-function classification(overrides: Partial<ClassifierOutput>): ClassifierOutput {
-  return {
-    taskType: 'implementation',
-    subtaskType: 'code_generation',
-    contextComplexity: 'small',
-    reasoningComplexity: 'low',
-    riskLevel: 'low',
-    executionMode: 'answer_only',
-    requiresTools: false,
-    confidence: 0.9,
-    ...overrides,
-  };
-}
-
-describe('deriveDifficultyTier', () => {
-  it('classifies trivial answer-only requests as low', () => {
-    expect(deriveDifficultyTier(classification({}))).toBe('low');
-  });
-  it('classifies mid-size code changes as medium', () => {
-    expect(
-      deriveDifficultyTier(
-        classification({
-          contextComplexity: 'medium',
-          reasoningComplexity: 'medium',
-          executionMode: 'code_change',
-        })
-      )
-    ).toBe('medium');
-  });
-  it('classifies high-reasoning multi-step work as high', () => {
-    expect(
-      deriveDifficultyTier(
-        classification({
-          contextComplexity: 'large',
-          reasoningComplexity: 'high',
-          executionMode: 'multi_step_project',
-          riskLevel: 'high',
-        })
-      )
-    ).toBe('high');
-  });
-  it('high risk tips an otherwise-low request to medium', () => {
-    expect(
-      deriveDifficultyTier(
-        classification({ executionMode: 'multi_step_project', riskLevel: 'high' })
-      )
-    ).toBe('medium');
-  });
-  it('high risk tips an otherwise-medium request to high', () => {
-    expect(
-      deriveDifficultyTier(
-        classification({
-          reasoningComplexity: 'medium',
-          contextComplexity: 'large',
-          executionMode: 'code_change',
-          riskLevel: 'high',
-        })
-      )
-    ).toBe('high');
-  });
-  it('is monotonic: bumping reasoning complexity never lowers the tier', () => {
-    const tiers = ['low', 'medium', 'high'] as const;
-    for (const ctx of ['small', 'medium', 'large'] as const) {
-      let prev = 0;
-      for (const reasoning of ['low', 'medium', 'high'] as const) {
-        const tier = deriveDifficultyTier(
-          classification({ contextComplexity: ctx, reasoningComplexity: reasoning })
-        );
-        const idx = tiers.indexOf(tier);
-        expect(idx).toBeGreaterThanOrEqual(prev);
-        prev = idx;
-      }
-    }
-  });
-});
diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts
deleted file mode 100644
index 8358c5e3bf..0000000000
--- a/packages/auto-routing-contracts/src/tiers.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-import * as z from 'zod';
-
-export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']);
-
-export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
-export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
-export type DifficultyTier = z.infer<typeof DifficultyTierSchema>;
-
-export const DIFFICULTY_TIERS: readonly DifficultyTier[] = ['low', 'medium', 'high'];
-
-const REASONING_POINTS = { low: 0, medium: 2, high: 4 } as const;
-const CONTEXT_POINTS = { small: 0, medium: 1, large: 2 } as const;
-const EXECUTION_POINTS = {
-  answer_only: 0,
-  code_change: 1,
-  command_execution: 1,
-  multi_step_project: 2,
-} as const;
-const RISK_POINTS = { low: 0, medium: 0, high: 1 } as const;
-
-// Deterministic mapping from the classifier taxonomy to a difficulty tier.
-// Reasoning complexity dominates (weight 2x) because it is the strongest
-// signal for whether a cheap model can complete the task; context size,
-// execution mode and blast radius nudge borderline cases up.
-// Structural subset of ClassifierOutput: importing the full type from
-// ./index would create a module cycle (index re-exports this file).
-export type DifficultyTierSignal = {
-  reasoningComplexity: 'low' | 'medium' | 'high';
-  contextComplexity: 'small' | 'medium' | 'large';
-  executionMode: 'answer_only' | 'code_change' | 'command_execution' | 'multi_step_project';
-  riskLevel: 'low' | 'medium' | 'high';
-};
-
-export function deriveDifficultyTier(classification: DifficultyTierSignal): DifficultyTier {
-  const score =
-    REASONING_POINTS[classification.reasoningComplexity] +
-    CONTEXT_POINTS[classification.contextComplexity] +
-    EXECUTION_POINTS[classification.executionMode] +
-    RISK_POINTS[classification.riskLevel];
-  if (score <= 2) return 'low';
-  if (score <= 5) return 'medium';
-  return 'high';
-}
diff --git a/services/auto-routing-benchmark/README.md b/services/auto-routing-benchmark/README.md
index cd5a226bf6..6573a38ce4 100644
--- a/services/auto-routing-benchmark/README.md
+++ b/services/auto-routing-benchmark/README.md
@@ -12,9 +12,9 @@ design, invariants, and rollout/rollback.
   OpenRouter using the exact production classifier code
   (`@kilocode/auto-routing-contracts/classifier`), grades per-field, and derives
   the cheapest above-threshold model as the classifier winner.
-- **Decider benchmark** — runs 76 golden tasks per candidate through the real
+- **Decider benchmark** — runs 180 golden tasks per candidate through the real
   `kilo` CLI inside a Cloudflare Container, grades mechanically, and publishes a
-  per-difficulty-tier routing table.
+  per-taxonomy-route routing table.
 - Normalized results live in D1 (`BENCH_DB`); published artifacts are cached in
   the shared `AUTO_ROUTING_CONFIG` KV namespace (publish = delete the keys so the
   next read repopulates from D1).
@@ -92,10 +92,12 @@ sqlite3 /tmp/<file>.sqlite 'select id, kind, status from benchmark_runs;'
 
 ## Debugging container (decider) failures
 
-- Each (model, 10-case chunk) gets its own container instance
-  (`runId:model:chunk`); CLI runs are serialized per instance (its sqlite state
-  is not safe under concurrent first runs). A `/warmup` call absorbs the one-time
-  sqlite migration before the case loop.
+- Each decider run seeds bounded shard lanes across the configured models and
+  repetitions. A lane uses one stable container instance
+  (`runId:model:rep:shard`) and processes chunk `N`, then `N+shardCount`, and
+  so on. CLI runs are serialized per instance because its sqlite state is not
+  safe under concurrent first runs. A `/warmup` call absorbs the one-time sqlite
+  migration before the case loop.
 - `case_results` rows carry diagnostics: CLI exit code, output prefix, and an
   event tail — start there for a failing case.
 - `POST /admin/debug-cli {model, prompt}` runs one prompt through the container
@@ -109,16 +111,16 @@ sqlite3 /tmp/<file>.sqlite 'select id, kind, status from benchmark_runs;'
 ## Debugging the DLQ
 
 Failed queue messages land in `auto-routing-benchmark-dlq` after `max_retries`
-(2) on `auto-routing-benchmark-jobs`. A message is one (model, chunk) job, so a
-DLQ'd message means that chunk never produced results; its model's summaries for
-the affected tier(s) will be missing or incomplete and `finalizeRunIfComplete`
-will mark the run accordingly.
+(6) on `auto-routing-benchmark-jobs`. A decider message is one
+(model, repetition, shard, chunk) job, so a DLQ'd message means that chunk never
+produced results; its model's summaries for the affected route(s) will be
+missing or incomplete and `finalizeRunIfComplete` will mark the run accordingly.
 
 To inspect / handle:
 
 - **Prod**: read the DLQ from the Cloudflare dashboard (Workers → Queues →
   `auto-routing-benchmark-dlq`) or `wrangler queues` tooling; the message body is
-  the JSON job (`runId`, `model`, `chunk`, case ids).
+  the JSON job (`runId`, `model`, `rep`, `shard`, `shardCount`, `chunk`, case ids).
 - **Replay**: re-run the affected model with the admin `force` toggle once the
   underlying cause (OpenRouter outage, container image, bad case) is fixed —
   carried summaries mean only the re-triggered model is re-benchmarked.
diff --git a/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
index 3db1df3b2b..d2d038e3f1 100644
--- a/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
+++ b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
@@ -32,7 +32,7 @@ CREATE TABLE `case_results` (
 	`run_id` text NOT NULL,
 	`model` text NOT NULL,
 	`case_id` text NOT NULL,
-	`tier` text,
+	`route_key` text,
 	`score` real NOT NULL,
 	`latency_ms` integer NOT NULL,
 	`cost_usd` real,
@@ -60,7 +60,7 @@ CREATE TABLE `config_decider_models` (
 CREATE TABLE `model_summaries` (
 	`run_id` text NOT NULL,
 	`model` text NOT NULL,
-	`tier` text NOT NULL,
+	`route_key` text NOT NULL,
 	`accuracy` real NOT NULL,
 	`avg_cost_usd` real,
 	`avg_latency_ms` real NOT NULL,
@@ -70,19 +70,19 @@ CREATE TABLE `model_summaries` (
 	`p95_latency_ms` real,
 	`timeouts` integer DEFAULT 0 NOT NULL,
 	`carried` integer DEFAULT false NOT NULL,
-	PRIMARY KEY(`run_id`, `model`, `tier`)
+	PRIMARY KEY(`run_id`, `model`, `route_key`)
 );
 --> statement-breakpoint
 CREATE TABLE `routing_table_candidates` (
 	`run_id` text NOT NULL,
-	`tier` text NOT NULL,
+	`route_key` text NOT NULL,
 	`rank` integer NOT NULL,
 	`model` text NOT NULL,
 	`accuracy` real NOT NULL,
 	`avg_cost_usd` real NOT NULL,
 	`meets_threshold` integer NOT NULL,
 	`reasoning_effort` text,
-	PRIMARY KEY(`run_id`, `tier`, `rank`)
+	PRIMARY KEY(`run_id`, `route_key`, `rank`)
 );
 --> statement-breakpoint
 CREATE TABLE `routing_tables` (
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
index 35ce39e53e..b5614567dc 100644
--- a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -1,7 +1,7 @@
 {
   "version": "6",
   "dialect": "sqlite",
-  "id": "ba559fc8-fdd3-4c96-b116-53573fb79c74",
+  "id": "fa33fcda-13d6-4952-84d7-0ad12cd02fea",
   "prevId": "00000000-0000-0000-0000-000000000000",
   "tables": {
     "benchmark_config": {
@@ -222,8 +222,8 @@
           "notNull": true,
           "autoincrement": false
         },
-        "tier": {
-          "name": "tier",
+        "route_key": {
+          "name": "route_key",
           "type": "text",
           "primaryKey": false,
           "notNull": false,
@@ -390,8 +390,8 @@
           "notNull": true,
           "autoincrement": false
         },
-        "tier": {
-          "name": "tier",
+        "route_key": {
+          "name": "route_key",
           "type": "text",
           "primaryKey": false,
           "notNull": true,
@@ -466,13 +466,13 @@
       "indexes": {},
       "foreignKeys": {},
       "compositePrimaryKeys": {
-        "model_summaries_run_id_model_tier_pk": {
+        "model_summaries_run_id_model_route_key_pk": {
           "columns": [
             "run_id",
             "model",
-            "tier"
+            "route_key"
           ],
-          "name": "model_summaries_run_id_model_tier_pk"
+          "name": "model_summaries_run_id_model_route_key_pk"
         }
       },
       "uniqueConstraints": {},
@@ -488,8 +488,8 @@
           "notNull": true,
           "autoincrement": false
         },
-        "tier": {
-          "name": "tier",
+        "route_key": {
+          "name": "route_key",
           "type": "text",
           "primaryKey": false,
           "notNull": true,
@@ -541,13 +541,13 @@
       "indexes": {},
       "foreignKeys": {},
       "compositePrimaryKeys": {
-        "routing_table_candidates_run_id_tier_rank_pk": {
+        "routing_table_candidates_run_id_route_key_rank_pk": {
           "columns": [
             "run_id",
-            "tier",
+            "route_key",
             "rank"
           ],
-          "name": "routing_table_candidates_run_id_tier_rank_pk"
+          "name": "routing_table_candidates_run_id_route_key_rank_pk"
         }
       },
       "uniqueConstraints": {},
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
index 7ee67d2c06..aa20472e95 100644
--- a/services/auto-routing-benchmark/migrations/meta/_journal.json
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -5,7 +5,7 @@
     {
       "idx": 0,
       "version": "6",
-      "when": 1781523205381,
+      "when": 1781688875647,
       "tag": "0000_absent_wallow",
       "breakpoints": true
     }
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
index 77391db7b2..8bd7a4ba15 100644
--- a/services/auto-routing-benchmark/src/admin.test.ts
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -12,7 +12,7 @@ import { CLASSIFIER_CASES } from './datasets/classifier-cases';
 function makeSummary(model: string): BenchmarkModelSummary {
   return {
     model,
-    tier: 'low',
+    routeKey: 'implementation/code_generation',
     accuracy: 0.9,
     avgCostUsd: 0.001,
     avgLatencyMs: 100,
@@ -32,7 +32,7 @@ const TEST_CONFIG: BenchmarkConfig = {
   ],
   minAccuracy: 0.7,
   switchCostFactor: 3,
-  maxConcurrency: 4,
+  maxConcurrency: 100,
   benchmarkUserId: null,
   classifierRepetitions: 1,
   deciderRepetitions: 1,
@@ -471,9 +471,10 @@ describe('POST /admin/runs', () => {
     expect(body.enqueuedModels).toBe(1);
   });
 
-  it('slices a >100-message decider fan-out into sendBatch-sized batches', async () => {
-    // 7 decider models × 1 rep × ceil(76/5)=16 chunks = 112 messages, which
-    // exceeds Cloudflare Queues' 100-per-sendBatch cap and must be sliced.
+  it('seeds sharded decider lanes bounded by the container cap', async () => {
+    // Later chunks are chained by processJob within each shard lane. startRun
+    // seeds as many lanes as fit under the 100-container cap so the benchmark
+    // runs much faster without creating one live container per chunk.
     const manyModels = Array.from({ length: 7 }, (_, i) => ({
       id: `vendor/model-${i}`,
       reasoningEffort: null,
@@ -487,11 +488,74 @@ describe('POST /admin/runs', () => {
     const res = await authedPost('/admin/runs', { kind: 'decider' });
     expect(res.status).toBe(200);
 
-    // 112 messages → two batches (100 + 12), neither over the limit.
-    expect(queueSendBatch).toHaveBeenCalledTimes(2);
+    expect(queueSendBatch).toHaveBeenCalledTimes(1);
     const batchSizes = queueSendBatch.mock.calls.map(([batch]) => (batch as unknown[]).length);
-    expect(batchSizes).toEqual([100, 12]);
+    expect(batchSizes).toEqual([98]);
     for (const size of batchSizes) expect(size).toBeLessThanOrEqual(100);
+    const queuedMessages = queueSendBatch.mock.calls.flatMap(([batch]) => batch as unknown[]);
+    for (const message of queuedMessages) {
+      expect(message).toMatchObject({
+        body: {
+          kind: 'decider',
+          shardCount: 14,
+        },
+      });
+    }
+  });
+
+  it('keeps 10 decider models with 3 repetitions under the 100-container cap', async () => {
+    const manyModels = Array.from({ length: 10 }, (_, i) => ({
+      id: `vendor/model-${i}`,
+      reasoningEffort: null,
+    }));
+    vi.mocked(getConfigRows).mockResolvedValue({
+      ...TEST_CONFIG_ROWS,
+      config: {
+        ...TEST_CONFIG_ROWS.config,
+        benchmark_user_id: 'user-123',
+        decider_repetitions: 3,
+      },
+      deciderModels: manyModels.map(m => ({ model: m.id, reasoning_effort: null })),
+    });
+
+    const res = await authedPost('/admin/runs', { kind: 'decider' });
+    expect(res.status).toBe(200);
+
+    expect(queueSendBatch).toHaveBeenCalledTimes(1);
+    const queuedMessages = queueSendBatch.mock.calls.flatMap(([batch]) => batch as unknown[]);
+    expect(queuedMessages).toHaveLength(90);
+    for (const message of queuedMessages) {
+      expect(message).toMatchObject({
+        body: {
+          kind: 'decider',
+          shardCount: 3,
+        },
+      });
+    }
+  });
+
+  it('rejects decider starts when model repetitions alone exceed the container cap', async () => {
+    const tooManyModels = Array.from({ length: 21 }, (_, i) => ({
+      id: `vendor/model-${i}`,
+      reasoningEffort: null,
+    }));
+    vi.mocked(getConfigRows).mockResolvedValue({
+      ...TEST_CONFIG_ROWS,
+      config: {
+        ...TEST_CONFIG_ROWS.config,
+        benchmark_user_id: 'user-123',
+        decider_repetitions: 5,
+      },
+      deciderModels: tooManyModels.map(m => ({ model: m.id, reasoning_effort: null })),
+    });
+
+    const res = await authedPost('/admin/runs', { kind: 'decider' });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toMatchObject({
+      error: expect.stringContaining('requires at least one live container lane'),
+    });
+    expect(insertRun).not.toHaveBeenCalled();
+    expect(queueSendBatch).not.toHaveBeenCalled();
   });
 });
 
@@ -519,7 +583,7 @@ describe('GET /admin/routing-table', () => {
       minAccuracy: 0.7,
       switchCostFactor: 3,
       source: 'benchmark',
-      tiers: { low: [candidate], medium: [candidate], high: [candidate] },
+      routes: { 'implementation/code_generation': [candidate] },
     };
     vi.mocked(getLatestRoutingTable).mockResolvedValueOnce({
       table: tableData as RoutingTable,
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
index 0b95cd3a94..e266eea567 100644
--- a/services/auto-routing-benchmark/src/admin.ts
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -8,7 +8,13 @@ import { zodJsonValidator } from '@kilocode/worker-utils';
 import type { Hono } from 'hono';
 import { getBenchmarkConfig, saveBenchmarkConfig } from './config';
 import { debugRunCli } from './cli-runner';
-import { fetchBenchmarkUserToken, RunAlreadyActiveError, startRun, sweepStaleRuns } from './run';
+import {
+  BenchmarkRunConfigError,
+  fetchBenchmarkUserToken,
+  RunAlreadyActiveError,
+  startRun,
+  sweepStaleRuns,
+} from './run';
 import { getClassifierWinner, getLatestRoutingTable, listRuns } from './db';
 import type { HonoEnv } from './hono-env';
 
@@ -59,6 +65,9 @@ export function registerAdminRoutes(app: Hono<HonoEnv>): void {
         if (error instanceof RunAlreadyActiveError) {
           return c.json({ error: error.message }, 409);
         }
+        if (error instanceof BenchmarkRunConfigError) {
+          return c.json({ error: error.message }, 400);
+        }
         throw error;
       }
     }
diff --git a/services/auto-routing-benchmark/src/bench-runner-container.ts b/services/auto-routing-benchmark/src/bench-runner-container.ts
index a3c712c4c7..105e36ce52 100644
--- a/services/auto-routing-benchmark/src/bench-runner-container.ts
+++ b/services/auto-routing-benchmark/src/bench-runner-container.ts
@@ -3,7 +3,8 @@ import { Container } from '@cloudflare/containers';
 // Cloudflare Container that runs the stable `kilo` CLI for decider benchmark
 // cases. The worker proxies POST /run to the container's HTTP server (see
 // container/server.mjs) via this DO. One instance is keyed per
-// (runId, model, chunk) so concurrent chunks/models don't share state.
+// (runId, model, rep, shard) so chunks in the same shard lane reuse CLI state
+// without creating one live container per chunk.
 export class BenchRunnerContainer extends Container<Env> {
   defaultPort = 3000;
   sleepAfter = '2m';
@@ -11,4 +12,13 @@ export class BenchRunnerContainer extends Container<Env> {
   // points at the real gateway; local dev overrides it via .dev.vars so the
   // benchmark runs against the local apps/web instance.
   envVars = { KILO_API_URL: this.env.KILO_CLI_API_URL };
+
+  /** On `POST /admin/destroy`, calls `destroy()` and replies 'destroyed'; else `super.fetch`. */
+  override async fetch(request: Request): Promise<Response> {
+    const { pathname } = new URL(request.url);
+    const isDestroyCall = request.method === 'POST' && pathname === '/admin/destroy';
+    if (!isDestroyCall) return super.fetch(request);
+    await this.destroy();
+    return new Response('destroyed');
+  }
 }
diff --git a/services/auto-routing-benchmark/src/cli-runner.test.ts b/services/auto-routing-benchmark/src/cli-runner.test.ts
new file mode 100644
index 0000000000..c8966203e2
--- /dev/null
+++ b/services/auto-routing-benchmark/src/cli-runner.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it, vi } from 'vitest';
+import { destroyDeciderCliContainer } from './cli-runner';
+
+describe('destroyDeciderCliContainer', () => {
+  it('calls the container admin destroy endpoint for the instance name', async () => {
+    // Stub the namespace so each hop (idFromName -> get -> stub.fetch) is observable.
+    const stubFetch = vi.fn(async () => new Response('destroyed', { status: 200 }));
+    const idFromName = vi.fn((name: string) => `id:${name}`);
+    const get = vi.fn(() => ({ fetch: stubFetch }));
+    const env = { BENCH_RUNNER: { idFromName, get } } as unknown as Env;
+    await destroyDeciderCliContainer(env, { instanceName: 'run:model:2' });
+
+    expect(idFromName).toHaveBeenCalledWith('run:model:2');
+    expect(get).toHaveBeenCalledWith('id:run:model:2');
+    const expectedRequest = expect.objectContaining({
+      method: 'POST',
+      url: 'http://container/admin/destroy',
+    });
+    expect(stubFetch).toHaveBeenCalledWith(expectedRequest);
+  });
+
+  it('throws when the container destroy endpoint fails', async () => {
+    // A 500 reply must surface both the status and the body text in the message.
+    const failingFetch = vi.fn(async () => new Response('nope', { status: 500 }));
+    const benchRunner = {
+      idFromName: (name: string) => `id:${name}`,
+      get: () => ({ fetch: failingFetch }),
+    };
+    const env = { BENCH_RUNNER: benchRunner } as unknown as Env;
+
+    await expect(destroyDeciderCliContainer(env, { instanceName: 'run:model:2' })).rejects.toThrow(
+      'container /admin/destroy failed: HTTP 500 nope'
+    );
+  });
+});
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
index 9f22cb3695..de826b3a97 100644
--- a/services/auto-routing-benchmark/src/cli-runner.ts
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -20,6 +20,19 @@ const DECIDER_CLI_TIMEOUT_MS = 180_000;
 const FINAL_ANSWER_SUFFIX =
   '\n\nIMPORTANT: Your final message must contain ONLY the answer in the exact requested format - no explanations, no preamble, no extra words.';
 
+/** Case-insensitively checks the error text for known container-availability failure phrases. */
+export function isRetryableContainerAvailabilityError(error: unknown): boolean {
+  const text = (error instanceof Error ? error.message : String(error)).toLowerCase();
+  return [
+    'container /run failed: http 503',
+    'container /warmup failed: http 503',
+    'no container instance available',
+    'no container instance that can be provided',
+    'max concurrent instance count',
+    'maximum number of running container instances exceeded',
+  ].some(marker => text.includes(marker));
+}
+
 type ContainerRunResponse = {
   exitCode: number;
   durationMs: number;
@@ -31,10 +44,10 @@ type ContainerRunResponse = {
 /**
  * Run one decider case through the `kilo` CLI inside a Cloudflare Container.
  *
- * `instanceName` is the precomputed DO instance name (e.g.
- * `${runId}:${model}:${chunk}`); the caller owns the keying so chunks/models
- * map to stable instances. The CLI has no system-prompt flag, so we fold the
- * system prompt into the user prompt.
+ * `instanceName` is the precomputed DO instance name; the caller owns the
+ * keying so chunks in the same model/repetition/shard lane share a stable
+ * instance. The CLI has no system-prompt flag, so we fold the system prompt
+ * into the user prompt.
  */
 export async function runDeciderCaseViaCli(
   env: Env,
@@ -141,6 +154,23 @@ export async function warmUpCliContainer(
     })
   );
   if (!response.ok) {
-    throw new Error(`container /warmup failed: HTTP ${response.status}`);
+    const detail = (await response.text().catch(() => '')).slice(0, 500);
+    throw new Error(`container /warmup failed: HTTP ${response.status} ${detail}`);
+  }
+}
+
+/** POSTs `/admin/destroy` to the named container instance; throws on a non-OK response. */
+export async function destroyDeciderCliContainer(
+  env: Env,
+  params: { instanceName: string }
+): Promise<void> {
+  const containerId = env.BENCH_RUNNER.idFromName(params.instanceName);
+  const destroyRequest = new Request('http://container/admin/destroy', { method: 'POST' });
+  const reply = await env.BENCH_RUNNER.get(containerId).fetch(destroyRequest);
+  if (!reply.ok) {
+    // Include at most 500 characters of the body; an unreadable body becomes ''.
+    const bodyText = await reply.text().catch(() => '');
+    const detail = bodyText.slice(0, 500);
+    throw new Error(`container /admin/destroy failed: HTTP ${reply.status} ${detail}`);
   }
 }
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
index 1fb02e8de4..10e8aade79 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
@@ -18,18 +18,18 @@ describe('DECIDER_CASES', () => {
     expect(TAXONOMY_PAIRS.length).toBe(18);
   });
 
-  it('has exactly 76 cases with unique ids', () => {
-    expect(DECIDER_CASES.length).toBe(76);
+  it('has exactly 180 cases with unique ids', () => {
+    expect(DECIDER_CASES.length).toBe(180);
     const ids = new Set(DECIDER_CASES.map(c => c.id));
     expect(ids.size).toBe(DECIDER_CASES.length);
   });
 
-  it('has at least 4 cases per (taskType, subtaskType) pair', () => {
+  it('has at least 10 cases per (taskType, subtaskType) pair', () => {
     for (const pair of TAXONOMY_PAIRS) {
       const count = DECIDER_CASES.filter(
         c => c.taskType === pair.taskType && c.subtaskType === pair.subtaskType
       ).length;
-      expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(4);
+      expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(10);
     }
   });
 
@@ -44,19 +44,6 @@ describe('DECIDER_CASES', () => {
     }
   });
 
-  it('has at least 20 cases per tier', () => {
-    for (const tier of ['low', 'medium', 'high'] as const) {
-      expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBeGreaterThanOrEqual(20);
-    }
-  });
-
-  it('covers at least 4 distinct task types per tier', () => {
-    for (const tier of ['low', 'medium', 'high'] as const) {
-      const taskTypes = new Set(DECIDER_CASES.filter(c => c.tier === tier).map(c => c.taskType));
-      expect(taskTypes.size, tier).toBeGreaterThanOrEqual(4);
-    }
-  });
-
   it('has compilable regex patterns', () => {
     for (const c of DECIDER_CASES) {
       const check = c.check;
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
index fcb82a223f..3760bc1624 100644
--- a/services/auto-routing-benchmark/src/datasets/decider-cases.ts
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -1,13 +1,8 @@
-import type {
-  ClassifierSubtaskType,
-  ClassifierTaskType,
-  DifficultyTier,
-} from '@kilocode/auto-routing-contracts';
+import type { ClassifierSubtaskType, ClassifierTaskType } from '@kilocode/auto-routing-contracts';
 import type { DeciderCheck } from '../grading';
 
 export type DeciderCase = {
   id: string; // stable slug, e.g. 'impl-gen-squares-array' (<taskType>-<subtype>-<topic>)
-  tier: DifficultyTier;
   taskType: ClassifierTaskType;
   subtaskType: ClassifierSubtaskType;
   systemPrompt: string;
@@ -28,19 +23,15 @@ const AGENT_SYS =
 // noise (fences/case/whitespace) but never wrong values. For json_equal cases
 // the prompt pins the exact key set in the same order as the expected value
-// (the comparison is JSON.stringify-based and order-sensitive). Each case
-// carries exactly one difficulty tier: low = mechanical lookups / trivial
-// evaluation, medium = multi-step reasoning / off-by-one traps / spec
-// application, high = deep tracing / multi-constraint puzzles / subtle
-// semantics. agentic_execution cases are self-contained tasks performed with
-// file/terminal tools inside the benchmark container (node:22-slim, no repo,
-// no network) and every command involved is deterministic there.
+// (the comparison is JSON.stringify-based and order-sensitive).
+// agentic_execution cases are self-contained tasks performed with file/terminal
+// tools inside the benchmark container (node:22-slim, no repo, no network) and
+// every command involved is deterministic there.
 export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   // implementation / feature_development
   // ---------------------------------------------------------------------------
   {
     id: 'impl-feat-ternary-parity',
-    tier: 'low',
     taskType: 'implementation',
     subtaskType: 'feature_development',
     systemPrompt: CODE_SYS,
@@ -50,7 +41,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-feat-array-pipeline',
-    tier: 'low',
     taskType: 'implementation',
     subtaskType: 'feature_development',
     systemPrompt: CODE_SYS,
@@ -60,7 +50,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-feat-closure-counter',
-    tier: 'medium',
     taskType: 'implementation',
     subtaskType: 'feature_development',
     systemPrompt: CODE_SYS,
@@ -70,7 +59,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-feat-recursion-fib',
-    tier: 'medium',
     taskType: 'implementation',
     subtaskType: 'feature_development',
     systemPrompt: CODE_SYS,
@@ -80,7 +68,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-feat-this-binding',
-    tier: 'high',
     taskType: 'implementation',
     subtaskType: 'feature_development',
     systemPrompt: CODE_SYS,
@@ -94,7 +81,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'impl-gen-package-manifest',
-    tier: 'low',
     taskType: 'implementation',
     subtaskType: 'code_generation',
     systemPrompt: CODE_SYS,
@@ -104,7 +90,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-gen-squares-array',
-    tier: 'low',
     taskType: 'implementation',
     subtaskType: 'code_generation',
     systemPrompt: CODE_SYS,
@@ -114,7 +99,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-gen-no-consecutive-ones',
-    tier: 'medium',
     taskType: 'implementation',
     subtaskType: 'code_generation',
     systemPrompt: CODE_SYS,
@@ -124,7 +108,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-gen-two-ones-strings',
-    tier: 'high',
     taskType: 'implementation',
     subtaskType: 'code_generation',
     systemPrompt: CODE_SYS,
@@ -141,7 +124,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'impl-test-sort-expectation',
-    tier: 'low',
     taskType: 'implementation',
     subtaskType: 'test_creation',
     systemPrompt: CODE_SYS,
@@ -151,7 +133,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-test-upper-expectation',
-    tier: 'low',
     taskType: 'implementation',
     subtaskType: 'test_creation',
     systemPrompt: CODE_SYS,
@@ -161,7 +142,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-test-mock-call-count',
-    tier: 'medium',
     taskType: 'implementation',
     subtaskType: 'test_creation',
     systemPrompt: CODE_SYS,
@@ -171,7 +151,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'impl-test-trailing-zeros',
-    tier: 'high',
     taskType: 'implementation',
     subtaskType: 'test_creation',
     systemPrompt: CODE_SYS,
@@ -185,7 +164,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'debug-fix-parseint-suffix',
-    tier: 'low',
     taskType: 'debugging',
     subtaskType: 'bug_fixing',
     systemPrompt: CODE_SYS,
@@ -195,7 +173,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-fix-binary-search',
-    tier: 'medium',
     taskType: 'debugging',
     subtaskType: 'bug_fixing',
     systemPrompt: CODE_SYS,
@@ -207,7 +184,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
     // 'pages' rather than 'pagination' so the id never collides with the
     // classifier dataset's debug-fix-pagination-slice in shared telemetry.
     id: 'debug-fix-pages-slice',
-    tier: 'medium',
     taskType: 'debugging',
     subtaskType: 'bug_fixing',
     systemPrompt: CODE_SYS,
@@ -217,7 +193,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-fix-regex-lastindex',
-    tier: 'high',
     taskType: 'debugging',
     subtaskType: 'bug_fixing',
     systemPrompt: CODE_SYS,
@@ -231,7 +206,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'debug-repair-compound-assign',
-    tier: 'low',
     taskType: 'debugging',
     subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
@@ -241,7 +215,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-repair-date-format',
-    tier: 'medium',
     taskType: 'debugging',
     subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
@@ -251,7 +224,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-repair-entries-shape',
-    tier: 'medium',
     taskType: 'debugging',
     subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
@@ -267,7 +239,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-repair-float-sum',
-    tier: 'high',
     taskType: 'debugging',
     subtaskType: 'test_repair',
     systemPrompt: CODE_SYS,
@@ -281,7 +252,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'debug-rca-async-order',
-    tier: 'medium',
     taskType: 'debugging',
     subtaskType: 'root_cause_analysis',
     systemPrompt: CODE_SYS,
@@ -291,7 +261,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-rca-shared-ref',
-    tier: 'medium',
     taskType: 'debugging',
     subtaskType: 'root_cause_analysis',
     systemPrompt: CODE_SYS,
@@ -301,7 +270,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-rca-closure-loop-var',
-    tier: 'high',
     taskType: 'debugging',
     subtaskType: 'root_cause_analysis',
     systemPrompt: CODE_SYS,
@@ -311,7 +279,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'debug-rca-float-equality',
-    tier: 'high',
     taskType: 'debugging',
     subtaskType: 'root_cause_analysis',
     systemPrompt: CODE_SYS,
@@ -325,7 +292,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'refactor-cleanup-loop-to-reduce',
-    tier: 'low',
     taskType: 'refactoring',
     subtaskType: 'code_cleanup',
     systemPrompt: CODE_SYS,
@@ -335,7 +301,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-cleanup-extract-helper',
-    tier: 'low',
     taskType: 'refactoring',
     subtaskType: 'code_cleanup',
     systemPrompt: CODE_SYS,
@@ -345,7 +310,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-cleanup-map-equivalent',
-    tier: 'medium',
     taskType: 'refactoring',
     subtaskType: 'code_cleanup',
     systemPrompt: CODE_SYS,
@@ -355,7 +319,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-cleanup-short-circuit',
-    tier: 'high',
     taskType: 'refactoring',
     subtaskType: 'code_cleanup',
     systemPrompt: CODE_SYS,
@@ -369,7 +332,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'refactor-arch-import-updates',
-    tier: 'low',
     taskType: 'refactoring',
     subtaskType: 'architecture_improvement',
     systemPrompt: CODE_SYS,
@@ -379,7 +341,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-arch-layer-depth',
-    tier: 'medium',
     taskType: 'refactoring',
     subtaskType: 'architecture_improvement',
     systemPrompt: CODE_SYS,
@@ -389,7 +350,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-arch-interface-edges',
-    tier: 'medium',
     taskType: 'refactoring',
     subtaskType: 'architecture_improvement',
     systemPrompt: CODE_SYS,
@@ -399,7 +359,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-arch-cycle-cut',
-    tier: 'high',
     taskType: 'refactoring',
     subtaskType: 'architecture_improvement',
     systemPrompt: CODE_SYS,
@@ -413,7 +372,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'refactor-migrate-substr-slice',
-    tier: 'low',
     taskType: 'refactoring',
     subtaskType: 'migration',
     systemPrompt: CODE_SYS,
@@ -423,7 +381,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-migrate-promise-chain',
-    tier: 'medium',
     taskType: 'refactoring',
     subtaskType: 'migration',
     systemPrompt: CODE_SYS,
@@ -433,7 +390,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-migrate-strict-equality',
-    tier: 'medium',
     taskType: 'refactoring',
     subtaskType: 'migration',
     systemPrompt: CODE_SYS,
@@ -443,7 +399,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'refactor-migrate-var-to-let',
-    tier: 'high',
     taskType: 'refactoring',
     subtaskType: 'migration',
     systemPrompt: CODE_SYS,
@@ -456,18 +411,16 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // planning_design / architecture_design
   // ---------------------------------------------------------------------------
   {
-    id: 'plan-arch-three-tier',
-    tier: 'low',
+    id: 'plan-arch-three-layer',
     taskType: 'planning_design',
     subtaskType: 'architecture_design',
     systemPrompt: SYS_SYS,
     userPrompt:
-      'In a classic three-tier architecture with presentation, business, and data tiers, which tier should contain the SQL queries? Answer with only one word: presentation, business, or data.',
+      'In a classic three-layer architecture with presentation, business, and data layers, which layer should contain the SQL queries? Answer with only one word: presentation, business, or data.',
     check: { kind: 'exact', value: 'data' },
   },
   {
     id: 'plan-arch-call-chain',
-    tier: 'medium',
     taskType: 'planning_design',
     subtaskType: 'architecture_design',
     systemPrompt: SYS_SYS,
@@ -477,7 +430,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-arch-dependency-rules',
-    tier: 'medium',
     taskType: 'planning_design',
     subtaskType: 'architecture_design',
     systemPrompt: SYS_SYS,
@@ -487,7 +439,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-arch-latency-budget',
-    tier: 'high',
     taskType: 'planning_design',
     subtaskType: 'architecture_design',
     systemPrompt: SYS_SYS,
@@ -501,7 +452,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'plan-steps-rollout-order',
-    tier: 'low',
     taskType: 'planning_design',
     subtaskType: 'technical_planning',
     systemPrompt: SYS_SYS,
@@ -511,7 +461,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-steps-batch-count',
-    tier: 'medium',
     taskType: 'planning_design',
     subtaskType: 'technical_planning',
     systemPrompt: SYS_SYS,
@@ -521,7 +470,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-steps-deploy-waves',
-    tier: 'medium',
     taskType: 'planning_design',
     subtaskType: 'technical_planning',
     systemPrompt: SYS_SYS,
@@ -531,7 +479,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-steps-critical-path',
-    tier: 'high',
     taskType: 'planning_design',
     subtaskType: 'technical_planning',
     systemPrompt: SYS_SYS,
@@ -545,7 +492,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'plan-system-write-quorum',
-    tier: 'low',
     taskType: 'planning_design',
     subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
@@ -555,7 +501,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-system-rate-limit-window',
-    tier: 'medium',
     taskType: 'planning_design',
     subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
@@ -565,7 +510,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-system-replica-availability',
-    tier: 'medium',
     taskType: 'planning_design',
     subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
@@ -575,7 +519,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-system-cache-staleness',
-    tier: 'high',
     taskType: 'planning_design',
     subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
@@ -585,7 +528,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-system-queue-trace',
-    tier: 'high',
     taskType: 'planning_design',
     subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
@@ -595,7 +537,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-system-deadlock-order',
-    tier: 'high',
     taskType: 'planning_design',
     subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
@@ -605,7 +546,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'plan-system-txn-isolation',
-    tier: 'high',
     taskType: 'planning_design',
     subtaskType: 'system_design',
     systemPrompt: SYS_SYS,
@@ -619,7 +559,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'invest-repo-test-file-count',
-    tier: 'low',
     taskType: 'investigation',
     subtaskType: 'repo_exploration',
     systemPrompt: CODE_SYS,
@@ -629,7 +568,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-repo-glob-match',
-    tier: 'medium',
     taskType: 'investigation',
     subtaskType: 'repo_exploration',
     systemPrompt: CODE_SYS,
@@ -639,7 +577,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-repo-grep-case',
-    tier: 'medium',
     taskType: 'investigation',
     subtaskType: 'repo_exploration',
     systemPrompt: CODE_SYS,
@@ -649,7 +586,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-repo-gitignore',
-    tier: 'high',
     taskType: 'investigation',
     subtaskType: 'repo_exploration',
     systemPrompt: CODE_SYS,
@@ -663,7 +599,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'invest-code-char-count',
-    tier: 'low',
     taskType: 'investigation',
     subtaskType: 'codebase_understanding',
     systemPrompt: CODE_SYS,
@@ -673,7 +608,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-code-object-keys',
-    tier: 'low',
     taskType: 'investigation',
     subtaskType: 'codebase_understanding',
     systemPrompt: CODE_SYS,
@@ -683,7 +617,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-code-regex-groups',
-    tier: 'medium',
     taskType: 'investigation',
     subtaskType: 'codebase_understanding',
     systemPrompt: CODE_SYS,
@@ -693,7 +626,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-code-collatz-depth',
-    tier: 'high',
     taskType: 'investigation',
     subtaskType: 'codebase_understanding',
     systemPrompt: CODE_SYS,
@@ -707,7 +639,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'invest-ext-http-created',
-    tier: 'low',
     taskType: 'investigation',
     subtaskType: 'external_research',
     systemPrompt:
@@ -718,7 +649,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-ext-utf8-euro',
-    tier: 'medium',
     taskType: 'investigation',
     subtaskType: 'external_research',
     systemPrompt: SYS_SYS,
@@ -728,7 +658,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-ext-semver-caret',
-    tier: 'medium',
     taskType: 'investigation',
     subtaskType: 'external_research',
     systemPrompt: CODE_SYS,
@@ -738,7 +667,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'invest-ext-json-spec',
-    tier: 'high',
     taskType: 'investigation',
     subtaskType: 'external_research',
     systemPrompt: CODE_SYS,
@@ -752,7 +680,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'agentic-tool-json-read',
-    tier: 'low',
     taskType: 'agentic_execution',
     subtaskType: 'tool_usage',
     systemPrompt: AGENT_SYS,
@@ -762,7 +689,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-tool-notes-count',
-    tier: 'low',
     taskType: 'agentic_execution',
     subtaskType: 'tool_usage',
     systemPrompt: AGENT_SYS,
@@ -772,7 +698,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-tool-log-grep',
-    tier: 'medium',
     taskType: 'agentic_execution',
     subtaskType: 'tool_usage',
     systemPrompt: AGENT_SYS,
@@ -782,7 +707,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-tool-csv-filter-sum',
-    tier: 'high',
     taskType: 'agentic_execution',
     subtaskType: 'tool_usage',
     systemPrompt: AGENT_SYS,
@@ -796,7 +720,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'agentic-term-node-major',
-    tier: 'low',
     taskType: 'agentic_execution',
     subtaskType: 'terminal_operations',
     systemPrompt: AGENT_SYS,
@@ -806,7 +729,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-term-wc-lines',
-    tier: 'low',
     taskType: 'agentic_execution',
     subtaskType: 'terminal_operations',
     systemPrompt: AGENT_SYS,
@@ -816,7 +738,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-term-sort-pipeline',
-    tier: 'medium',
     taskType: 'agentic_execution',
     subtaskType: 'terminal_operations',
     systemPrompt: AGENT_SYS,
@@ -826,7 +747,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-term-sha256-prefix',
-    tier: 'high',
     taskType: 'agentic_execution',
     subtaskType: 'terminal_operations',
     systemPrompt: AGENT_SYS,
@@ -840,7 +760,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   // ---------------------------------------------------------------------------
   {
     id: 'agentic-multi-seq-sum',
-    tier: 'medium',
     taskType: 'agentic_execution',
     subtaskType: 'multi_step_execution',
     systemPrompt: AGENT_SYS,
@@ -850,7 +769,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-multi-node-script',
-    tier: 'medium',
     taskType: 'agentic_execution',
     subtaskType: 'multi_step_execution',
     systemPrompt: AGENT_SYS,
@@ -860,7 +778,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-multi-find-count',
-    tier: 'medium',
     taskType: 'agentic_execution',
     subtaskType: 'multi_step_execution',
     systemPrompt: AGENT_SYS,
@@ -870,7 +787,6 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
   },
   {
     id: 'agentic-multi-json-transform',
-    tier: 'high',
     taskType: 'agentic_execution',
     subtaskType: 'multi_step_execution',
     systemPrompt: AGENT_SYS,
@@ -878,4 +794,943 @@ export const DECIDER_CASES: readonly DeciderCase[] = [
       'Create a file /tmp/bench-in.json containing exactly this JSON array: [3, 1, 4, 1, 5, 9, 2, 6, 5, 3]. Then write and run a Node.js script that reads the file, computes the sum of the distinct values in the array, and prints it. Answer with only the number.',
     check: { kind: 'exact', value: '30' },
   },
+  // ---------------------------------------------------------------------------
+  // Supplemental taxonomy-route coverage
+  // ---------------------------------------------------------------------------
+  {
+    id: 'supp-impl-feat-clamp',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Implement mentally: clamp(14, 3, 9) returns min when low, max when high, otherwise value. Answer with only the returned number.',
+    check: { kind: 'exact', value: '9' },
+  },
+  {
+    id: 'supp-impl-feat-join-slugs',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What should slug(["Kilo", "Code", "Cloud"]) return if it lowercases words and joins them with hyphens? Answer only the return value.',
+    check: { kind: 'exact', value: 'kilo-code-cloud' },
+  },
+  {
+    id: 'supp-impl-code-nullish',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the output.\n\nconst x = null ?? "fallback";\nconsole.log(x);',
+    check: { kind: 'exact', value: 'fallback' },
+  },
+  {
+    id: 'supp-impl-code-set-size',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer only the number.\n\nconst s = new Set(["a", "b", "a", "c"]);\nconsole.log(s.size);',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'supp-impl-test-boundary-count',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A clamp(value, min, max) function needs tests for below min, at min, inside range, at max, and above max. How many cases is that? Answer only the number.',
+    check: { kind: 'exact', value: '5' },
+  },
+  {
+    id: 'supp-impl-test-error-case',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'For parsePort(input), which invalid input should a test include: "3000", "0", or "abc"? Answer only the invalid value.',
+    check: { kind: 'exact', value: 'abc' },
+  },
+  {
+    id: 'supp-debug-bug-off-by-one',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A loop uses i <= items.length and reads items[i]. What operator should replace <= to avoid reading past the end? Answer only the operator.',
+    check: { kind: 'exact', value: '<' },
+  },
+  {
+    id: 'supp-debug-bug-json-parse',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'JSON.parse("{bad}") throws. Should the fix catch SyntaxError or TypeError? Answer only the error class.',
+    check: { kind: 'exact', value: 'SyntaxError' },
+  },
+  {
+    id: 'supp-debug-test-expected',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A function returns ["a", "b"]. The failing test expects ["b", "a"] but order is part of the contract. Which expected array is correct? Answer JSON only.',
+    check: { kind: 'json_equal', value: ['a', 'b'] },
+  },
+  {
+    id: 'supp-debug-test-timeout',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A test waits for text that appears after clicking Save, but it never clicks Save. What single action is missing? Answer only the verb.',
+    check: { kind: 'exact', value: 'click' },
+  },
+  {
+    id: 'supp-debug-root-cause-cache',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A value updates in the database but the page shows the old value until cache expiry. Which layer is the likely root cause: database, cache, or compiler? Answer one word.',
+    check: { kind: 'exact', value: 'cache' },
+  },
+  {
+    id: 'supp-debug-root-cause-env',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Local requests hit port 8810 but the worker config says the target service runs on 8814. What kind of mismatch is this? Answer one word.',
+    check: { kind: 'exact', value: 'port' },
+  },
+  {
+    id: 'supp-refactor-cleanup-dead-branch',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A condition checks if status === "done" inside a branch where status is already known to be "pending". What should happen to that inner branch? Answer one word.',
+    check: { kind: 'exact', value: 'remove' },
+  },
+  {
+    id: 'supp-refactor-cleanup-name',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Which name is clearer for a boolean: data, flag, or hasErrors? Answer only the best name.',
+    check: { kind: 'exact', value: 'hasErrors' },
+  },
+  {
+    id: 'supp-refactor-arch-shared-helper',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Three modules duplicate the same pure validation logic. Should the shared code be a pure helper, global mutable state, or copied again? Answer two words.',
+    check: { kind: 'exact', value: 'pure helper' },
+  },
+  {
+    id: 'supp-refactor-arch-boundary',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A UI component directly opens database connections. Which boundary should own the database call: UI, server, or CSS? Answer one word.',
+    check: { kind: 'exact', value: 'server' },
+  },
+  {
+    id: 'supp-refactor-migration-column',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A migration renames user_name to display_name without changing values. What SQL operation is this: INSERT, RENAME COLUMN, or DROP TABLE? Answer only the operation.',
+    check: { kind: 'exact', value: 'RENAME COLUMN' },
+  },
+  {
+    id: 'supp-refactor-migration-backfill',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'After adding a non-null slug column to existing rows, what data operation fills slug for old rows? Answer one word.',
+    check: { kind: 'exact', value: 'backfill' },
+  },
+  {
+    id: 'supp-plan-arch-cache-layer',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'For read-heavy config that changes rarely, should the hot path read every request from origin storage or use a short cache? Answer two words.',
+    check: { kind: 'exact', value: 'short cache' },
+  },
+  {
+    id: 'supp-plan-arch-queue',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A long-running benchmark exceeds request time limits. Which primitive should carry the work asynchronously: queue, cookie, or CSS? Answer one word.',
+    check: { kind: 'exact', value: 'queue' },
+  },
+  {
+    id: 'supp-plan-technical-rollout',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Order these rollout steps: deploy code, run migration, monitor logs. Which step should be last? Answer two words.',
+    check: { kind: 'exact', value: 'monitor logs' },
+  },
+  {
+    id: 'supp-plan-technical-risk',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A plan changes a shared API contract. Should verification focus on one file only or all direct consumers? Answer three words.',
+    check: { kind: 'exact', value: 'all direct consumers' },
+  },
+  {
+    id: 'supp-plan-system-slo',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A service retries failed jobs and eventually sends hopeless jobs to a separate queue. What is that queue commonly called? Answer only the abbreviation.',
+    check: { kind: 'exact', value: 'DLQ' },
+  },
+  {
+    id: 'supp-plan-system-idempotency',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'If the same queue message may be delivered twice, should writes be idempotent or random? Answer one word.',
+    check: { kind: 'exact', value: 'idempotent' },
+  },
+  {
+    id: 'supp-invest-repo-rg',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Which command is the fastest common choice to search a repository for the string saveRoutingTable: rg, cat, or date? Answer one word.',
+    check: { kind: 'exact', value: 'rg' },
+  },
+  {
+    id: 'supp-invest-repo-package',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'In a pnpm monorepo, which file usually names a package and its scripts: package.json or README.md? Answer only the file name.',
+    check: { kind: 'exact', value: 'package.json' },
+  },
+  {
+    id: 'supp-invest-code-flow',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A handler calls validateInput, then saveRow, then enqueueJob. Which function creates the async follow-up? Answer only the function name.',
+    check: { kind: 'exact', value: 'enqueueJob' },
+  },
+  {
+    id: 'supp-invest-code-owner',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'If a type is imported from @kilocode/auto-routing-contracts, which package owns that type? Answer only the package name.',
+    check: { kind: 'exact', value: '@kilocode/auto-routing-contracts' },
+  },
+  {
+    id: 'supp-invest-research-source',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'For a question about current Cloudflare Workers limits, should you prefer official docs or an old blog post? Answer two words.',
+    check: { kind: 'exact', value: 'official docs' },
+  },
+  {
+    id: 'supp-invest-research-date',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'When comparing two search results for current pricing, which field matters most: publish date, font size, or title length? Answer two words.',
+    check: { kind: 'exact', value: 'publish date' },
+  },
+  {
+    id: 'supp-agent-tool-json-file',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-tool.json containing exactly {"a":2,"b":5}. Then read it and answer with only the sum of a and b.',
+    check: { kind: 'exact', value: '7' },
+  },
+  {
+    id: 'supp-agent-tool-grep-count',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-tool.txt with lines alpha, beta, alphabet, gamma. Count lines containing alpha and answer only the number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'supp-agent-term-node-eval',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt: 'Run node -e "console.log(6*7)" in the terminal and answer with only the output.',
+    check: { kind: 'exact', value: '42' },
+  },
+  {
+    id: 'supp-agent-term-pwd-base',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Run pwd in the terminal. If it ends with /app, answer app; otherwise answer other. Answer one word.',
+    check: { kind: 'regex', pattern: '^(app|other)$' },
+  },
+  {
+    id: 'supp-agent-multi-script',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Write /tmp/bench-multi.js that prints ["k","i","l","o"].join(""). Run it with node and answer with only what it prints.',
+    check: { kind: 'exact', value: 'kilo' },
+  },
+  {
+    id: 'supp-agent-multi-files',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-a.txt containing 11 and /tmp/bench-b.txt containing 31. Read both files, add the numbers, and answer only the sum.',
+    check: { kind: 'exact', value: '42' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // Additional taxonomy-route coverage to keep every pair at 10+ cases
+  // ---------------------------------------------------------------------------
+  {
+    id: 'supp2-impl-feat-nullish-total',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with only the number.\n\nconst input = { count: null };\nconst total = (input.count ?? 4) + 6;\nconsole.log(total);',
+    check: { kind: 'exact', value: '10' },
+  },
+  {
+    id: 'supp2-impl-feat-spread-merge',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconst base = { a: 1, b: 2 };\nconst next = { ...base, b: 5, c: 8 };\nconsole.log(Object.keys(next).join(","));',
+    check: { kind: 'exact', value: 'a,b,c' },
+  },
+  {
+    id: 'supp2-impl-feat-set-size',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with only the number.\n\nconst tags = new Set(["api", "web", "api", "cli"]);\nconsole.log(tags.size);',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'supp2-impl-gen-config-object',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a config fixture. Reply with only a JSON object with exactly the keys "enabled" and "retries" in that order, where enabled is true and retries is 3.',
+    check: { kind: 'json_equal', value: { enabled: true, retries: 3 } },
+  },
+  {
+    id: 'supp2-impl-gen-primes-array',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a test fixture: a JSON array containing the prime numbers less than 12, in increasing order. Reply with only the JSON array.',
+    check: { kind: 'json_equal', value: [2, 3, 5, 7, 11] },
+  },
+  {
+    id: 'supp2-impl-gen-user-slug',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a slug for the title "Ship Fast, Stay Safe!". Reply with only the lowercase slug.',
+    check: { kind: 'exact', value: 'ship-fast-stay-safe' },
+  },
+  {
+    id: 'supp2-impl-gen-initials-object',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a fixture. Reply with only a JSON object with exactly the keys "name" and "initials" in that order, where name is "Ada Lovelace" and initials is "AL".',
+    check: { kind: 'json_equal', value: { name: 'Ada Lovelace', initials: 'AL' } },
+  },
+  {
+    id: 'supp2-impl-test-array-length',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test. What number makes this assertion pass? Answer with only the number.\n\nexpect(["red", "blue", "green"].length).toBe(?)',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'supp2-impl-test-trim-expectation',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test. What exact string makes this assertion pass? Answer with only the string.\n\nexpect("  done\\n".trim()).toBe(?)',
+    check: { kind: 'exact', value: 'done' },
+  },
+  {
+    id: 'supp2-impl-test-map-output',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test. What JSON array should be expected?\n\n[2, 4, 6].map(n => n / 2)',
+    check: { kind: 'json_equal', value: [1, 2, 3] },
+  },
+  {
+    id: 'supp2-impl-test-url-search-param',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test. What value should this assertion expect? Answer with the exact string only.\n\nnew URL("https://example.test/path?mode=fast").searchParams.get("mode")',
+    check: { kind: 'exact', value: 'fast' },
+  },
+  {
+    id: 'supp2-debug-bug-loop-bound',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A loop should visit indexes 0, 1, and 2 of a 3-item array. Which comparison operator should the loop use with i and length: < or <=? Answer only the operator.',
+    check: { kind: 'exact', value: '<' },
+  },
+  {
+    id: 'supp2-debug-bug-negated-guard',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A guard should return early when user is missing. Complete the condition: if (___user) return "anonymous"; Answer with only the missing operator.',
+    check: { kind: 'exact', value: '!' },
+  },
+  {
+    id: 'supp2-debug-bug-assignment-condition',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A condition accidentally uses = instead of comparing status to "ready". Which operator should replace = for strict comparison? Answer only the operator.',
+    check: { kind: 'exact', value: '===' },
+  },
+  {
+    id: 'supp2-debug-bug-missing-await',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'An async function returns Promise { <pending> } where the resolved value was expected. What keyword is missing before the promise call? Answer one word.',
+    check: { kind: 'exact', value: 'await' },
+  },
+  {
+    id: 'supp2-debug-test-boolean-expect',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A test expected isAdmin("owner") to be false, but the fixed function correctly returns true. What boolean should the test expect? Answer one word.',
+    check: { kind: 'exact', value: 'true' },
+  },
+  {
+    id: 'supp2-debug-test-error-message',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A validation test expected "bad input"; the implementation now intentionally throws "missing email". What exact message should the repaired test expect?',
+    check: { kind: 'exact', value: 'missing email' },
+  },
+  {
+    id: 'supp2-debug-test-json-shape',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A response fixture changed from {ok:true} to {status:"ok"}. Reply with only the new expected JSON object.',
+    check: { kind: 'json_equal', value: { status: 'ok' } },
+  },
+  {
+    id: 'supp2-debug-test-async-resolve',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A test should assert that fetchName() resolves to "Kilo". Which matcher should be used before toBe("Kilo"): resolves or rejects? Answer one word.',
+    check: { kind: 'exact', value: 'resolves' },
+  },
+  {
+    id: 'supp2-debug-rca-unset-secret',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A deploy works locally but production calls fail with "missing OPENROUTER_API_KEY". Which category is the root cause: secret, schema, or css? Answer one word.',
+    check: { kind: 'exact', value: 'secret' },
+  },
+  {
+    id: 'supp2-debug-rca-race-condition',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Two workers update the same counter concurrently and one increment disappears. What kind of bug is this? Answer two words.',
+    check: { kind: 'exact', value: 'race condition' },
+  },
+  {
+    id: 'supp2-debug-rca-cache-key',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Two users see each other cached results because the cache key omits userId. Which part is wrong: cache key, database type, or font? Answer two words.',
+    check: { kind: 'exact', value: 'cache key' },
+  },
+  {
+    id: 'supp2-debug-rca-timeout',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A request always fails after exactly 30 seconds while the downstream job completes at 45 seconds. What limit is most likely being hit? Answer one word.',
+    check: { kind: 'exact', value: 'timeout' },
+  },
+  {
+    id: 'supp2-refactor-cleanup-unused-import',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A file imports formatDate but never uses it. What should happen to that import? Answer one word.',
+    check: { kind: 'exact', value: 'remove' },
+  },
+  {
+    id: 'supp2-refactor-cleanup-nested-if',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Replacing nested if statements with early returns primarily reduces what? Answer one word.',
+    check: { kind: 'exact', value: 'nesting' },
+  },
+  {
+    id: 'supp2-refactor-cleanup-magic-number',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'The number 86400000 appears repeatedly to mean milliseconds per day. What should it become: named constant, random value, or inline comment only? Answer two words.',
+    check: { kind: 'exact', value: 'named constant' },
+  },
+  {
+    id: 'supp2-refactor-cleanup-duplicate-branch',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Two switch cases have identical bodies. What refactor can combine them: fallthrough, mutation, or sleep? Answer one word.',
+    check: { kind: 'exact', value: 'fallthrough' },
+  },
+  {
+    id: 'supp2-refactor-arch-adapter',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'To isolate provider-specific API calls behind a common interface, what pattern is commonly used? Answer one word.',
+    check: { kind: 'exact', value: 'adapter' },
+  },
+  {
+    id: 'supp2-refactor-arch-pure-core',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Moving business rules out of HTTP handlers into pure functions mainly improves what? Answer one word.',
+    check: { kind: 'exact', value: 'testability' },
+  },
+  {
+    id: 'supp2-refactor-arch-layering',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A router imports a React component to reuse validation logic. Should validation move to shared domain code or stay in the component? Answer three words.',
+    check: { kind: 'exact', value: 'shared domain code' },
+  },
+  {
+    id: 'supp2-refactor-arch-contract-package',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Two services duplicate the same Zod request schema. Where should that schema live: shared contracts package, CSS file, or log line? Answer three words.',
+    check: { kind: 'exact', value: 'shared contracts package' },
+  },
+  {
+    id: 'supp2-refactor-migration-add-index',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A frequent lookup filters by run_id and model. Which database object usually speeds that lookup? Answer one word.',
+    check: { kind: 'exact', value: 'index' },
+  },
+  {
+    id: 'supp2-refactor-migration-nullable-first',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'For a large table, adding a new column before backfilling is usually safer if it starts nullable or non-null with no default? Answer one word.',
+    check: { kind: 'exact', value: 'nullable' },
+  },
+  {
+    id: 'supp2-refactor-migration-drop-column',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Removing an obsolete database column is which SQL operation: DROP COLUMN, SELECT, or COMMIT? Answer only the operation.',
+    check: { kind: 'exact', value: 'DROP COLUMN' },
+  },
+  {
+    id: 'supp2-refactor-migration-rename-table',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A migration changes table name old_events to events while preserving rows. What operation is this? Answer two words.',
+    check: { kind: 'exact', value: 'rename table' },
+  },
+  {
+    id: 'supp2-plan-arch-separate-writer',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'If one service should own writes to a shared routing table and others only read, what role does that service have? Answer two words.',
+    check: { kind: 'exact', value: 'sole writer' },
+  },
+  {
+    id: 'supp2-plan-arch-event-queue',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A user request should return quickly while heavy work continues later. Which architecture primitive usually decouples the work? Answer one word.',
+    check: { kind: 'exact', value: 'queue' },
+  },
+  {
+    id: 'supp2-plan-arch-cache-invalidation',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'After publishing a new config, should readers keep the old KV cache forever or invalidate it? Answer two words.',
+    check: { kind: 'exact', value: 'invalidate it' },
+  },
+  {
+    id: 'supp2-plan-arch-idempotent-writes',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'If a queue retries messages, should database writes be idempotent or time-randomized? Answer one word.',
+    check: { kind: 'exact', value: 'idempotent' },
+  },
+  {
+    id: 'supp2-plan-technical-order',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'For a schema-breaking rollout, which should be planned before deploy: migration or celebration? Answer one word.',
+    check: { kind: 'exact', value: 'migration' },
+  },
+  {
+    id: 'supp2-plan-technical-rollback',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A rollout plan should include how to return to the previous version. What is that called? Answer one word.',
+    check: { kind: 'exact', value: 'rollback' },
+  },
+  {
+    id: 'supp2-plan-technical-verification',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A plan touches a worker and a web consumer. Should verification include both surfaces or only the worker? Answer two words.',
+    check: { kind: 'exact', value: 'both surfaces' },
+  },
+  {
+    id: 'supp2-plan-technical-owner',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'When a launch depends on CI deploy finishing, what should the plan wait for before starting a new benchmark? Answer two words.',
+    check: { kind: 'exact', value: 'deploy completion' },
+  },
+  {
+    id: 'supp2-plan-system-backpressure',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Limiting how many jobs run at once to protect downstream capacity is called what? Answer one word.',
+    check: { kind: 'exact', value: 'backpressure' },
+  },
+  {
+    id: 'supp2-invest-repo-find-schema',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'To find where benchmark_runs is defined in a repo, which command should you use first: rg, sleep, or curl? Answer one word.',
+    check: { kind: 'exact', value: 'rg' },
+  },
+  {
+    id: 'supp2-invest-repo-list-files',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Which command lists tracked and untracked file changes in a git worktree: git status or npm version? Answer two words.',
+    check: { kind: 'exact', value: 'git status' },
+  },
+  {
+    id: 'supp2-invest-repo-find-tests',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: SYS_SYS,
+    userPrompt: 'Files ending in .test.ts usually contain what? Answer one word.',
+    check: { kind: 'exact', value: 'tests' },
+  },
+  {
+    id: 'supp2-invest-repo-read-config',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'In a Cloudflare Worker service, which config file commonly defines bindings: wrangler.jsonc or tsconfig.tsbuildinfo? Answer only the file name.',
+    check: { kind: 'exact', value: 'wrangler.jsonc' },
+  },
+  {
+    id: 'supp2-invest-code-call-chain',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Given the call chain handleRequest -> classify -> computeDecision, which function chooses the model? Answer only the function name.',
+    check: { kind: 'exact', value: 'computeDecision' },
+  },
+  {
+    id: 'supp2-invest-code-schema-owner',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'If RoutingTableSchema parses published artifacts, is it a runtime schema or CSS class? Answer two words.',
+    check: { kind: 'exact', value: 'runtime schema' },
+  },
+  {
+    id: 'supp2-invest-code-field-rename',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A database row field route_key maps to API field routeKey. What naming conversion is this: snake to camel, camel to snake, or uppercase? Answer three words.',
+    check: { kind: 'exact', value: 'snake to camel' },
+  },
+  {
+    id: 'supp2-invest-code-consumer',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A type change in @kilocode/auto-routing-contracts breaks services/auto-routing and apps/web. What are those packages called relative to the type? Answer one word.',
+    check: { kind: 'exact', value: 'consumers' },
+  },
+  {
+    id: 'supp2-invest-research-primary-source',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'For library API behavior, should you prefer official docs or a random forum answer? Answer two words.',
+    check: { kind: 'exact', value: 'official docs' },
+  },
+  {
+    id: 'supp2-invest-research-cross-check',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'If two current sources disagree, should you cross-check or guess? Answer one word.',
+    check: { kind: 'exact', value: 'cross-check' },
+  },
+  {
+    id: 'supp2-invest-research-version',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'When reading framework docs, which detail matters for compatibility: version or logo color? Answer one word.',
+    check: { kind: 'exact', value: 'version' },
+  },
+  {
+    id: 'supp2-invest-research-quote-limit',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'When using a source, should long copyrighted passages be quoted in full or summarized? Answer one word.',
+    check: { kind: 'exact', value: 'summarized' },
+  },
+  {
+    id: 'supp2-agent-tool-sort-file',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-sort.txt with lines delta, alpha, charlie. Sort the lines alphabetically and answer with the first line only.',
+    check: { kind: 'exact', value: 'alpha' },
+  },
+  {
+    id: 'supp2-agent-tool-json-length',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-items.json containing ["a","b","c","d"]. Read it and answer only the array length.',
+    check: { kind: 'exact', value: '4' },
+  },
+  {
+    id: 'supp2-agent-tool-word-count',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-words.txt containing exactly "one two three". Count the words and answer only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'supp2-agent-tool-file-exists',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-exists.txt containing ok. Then check that the file exists and answer only yes or no.',
+    check: { kind: 'exact', value: 'yes' },
+  },
+  {
+    id: 'supp2-agent-term-node-json',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Run node -e "console.log(JSON.stringify([1,2,3].reduce((a,b)=>a+b,0)))" in the terminal and answer with only the output.',
+    check: { kind: 'exact', value: '6' },
+  },
+  {
+    id: 'supp2-agent-term-printf',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt: 'Run printf kilo in the terminal and answer with only the output.',
+    check: { kind: 'exact', value: 'kilo' },
+  },
+  {
+    id: 'supp2-agent-term-sort',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Run a shell command that sorts the words "zeta alpha" alphabetically one per line. Answer with only the first sorted word.',
+    check: { kind: 'exact', value: 'alpha' },
+  },
+  {
+    id: 'supp2-agent-term-expr',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt: 'Run a terminal calculation for 9 + 8 + 7 and answer with only the result.',
+    check: { kind: 'exact', value: '24' },
+  },
+  {
+    id: 'supp2-agent-multi-generate-run',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Write /tmp/bench-sum.js that prints 14 + 28. Run it with node and answer with only what it prints.',
+    check: { kind: 'exact', value: '42' },
+  },
+  {
+    id: 'supp2-agent-multi-read-transform',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-name.txt containing kilo. Read it, uppercase it, and answer only the uppercase text.',
+    check: { kind: 'exact', value: 'KILO' },
+  },
+  {
+    id: 'supp2-agent-multi-two-files-join',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-left.txt containing auto and /tmp/bench-right.txt containing route. Read both and answer with the two words joined by a hyphen.',
+    check: { kind: 'exact', value: 'auto-route' },
+  },
+  {
+    id: 'supp2-agent-multi-json-sum',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create /tmp/bench-numbers.json containing [5,10,15]. Read it, sum the numbers, and answer only the sum.',
+    check: { kind: 'exact', value: '30' },
+  },
 ];
diff --git a/services/auto-routing-benchmark/src/db-replace-summaries.test.ts b/services/auto-routing-benchmark/src/db-replace-summaries.test.ts
index 16b81c8212..d77974387a 100644
--- a/services/auto-routing-benchmark/src/db-replace-summaries.test.ts
+++ b/services/auto-routing-benchmark/src/db-replace-summaries.test.ts
@@ -25,7 +25,7 @@ import { replaceModelSummaries } from './db';
 function makeSummary(model: string): BenchmarkModelSummary {
   return {
     model,
-    tier: '*',
+    routeKey: '*',
     accuracy: 0.9,
     avgCostUsd: 0.001,
     avgLatencyMs: 100,
diff --git a/services/auto-routing-benchmark/src/db-save-routing-table.test.ts b/services/auto-routing-benchmark/src/db-save-routing-table.test.ts
new file mode 100644
index 0000000000..7cbc1048d4
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db-save-routing-table.test.ts
@@ -0,0 +1,66 @@
+import { describe, expect, it, vi } from 'vitest';
+import type { RankedCandidate, RoutingTable } from '@kilocode/auto-routing-contracts';
+
+const mockState = vi.hoisted(() => ({
+  batchCalls: [] as Array<Array<{ kind: string; values?: unknown }>>,
+}));
+
+vi.mock('drizzle-orm/d1', () => ({
+  drizzle: vi.fn(() => ({
+    delete: vi.fn(() => ({
+      where: vi.fn(() => ({ kind: 'delete' })),
+    })),
+    insert: vi.fn(() => ({
+      values: vi.fn((values: unknown) => ({
+        kind: 'insert',
+        values,
+        onConflictDoUpdate: vi.fn(() => ({ kind: 'upsert', values })),
+      })),
+    })),
+    batch: vi.fn(async (stmts: Array<{ kind: string; values?: unknown }>) => {
+      mockState.batchCalls.push(stmts);
+    }),
+  })),
+}));
+
+const candidate = (model: string): RankedCandidate => ({
+  model,
+  accuracy: 0.9,
+  avgCostUsd: 0.001,
+  meetsThreshold: true,
+  reasoningEffort: null,
+});
+
+describe('saveRoutingTable', () => {
+  it('chunks routing candidate inserts to stay under D1 variable limits', async () => {
+    const { saveRoutingTable } = await import('./db');
+
+    const table: RoutingTable = {
+      version: 'run-large-routing-table',
+      generatedAt: '2026-06-16T18:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      source: 'benchmark',
+      routes: {
+        'implementation/code_generation': Array.from({ length: 23 }, (_, index) =>
+          candidate(`impl-model-${index}`)
+        ),
+        'debugging/bug_fixing': [candidate('debug-model')],
+        'planning_design/system_design': [candidate('plan-model')],
+      },
+    };
+
+    await saveRoutingTable({} as D1Database, table, '2026-06-16T18:01:00.000Z');
+
+    const [batch] = mockState.batchCalls;
+    expect(batch).toBeDefined();
+    const candidateInsertSizes = batch
+      .filter(stmt => stmt.kind === 'insert')
+      .map(stmt => {
+        expect(Array.isArray(stmt.values)).toBe(true);
+        return (stmt.values as unknown[]).length;
+      });
+
+    expect(candidateInsertSizes).toEqual([10, 10, 5]);
+  });
+});
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
index 2a4c88035c..c241939a89 100644
--- a/services/auto-routing-benchmark/src/db-schema.ts
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -77,7 +77,7 @@ export const modelSummaries = sqliteTable(
   {
     run_id: text('run_id').notNull(),
     model: text('model').notNull(),
-    tier: text('tier').notNull(),
+    route_key: text('route_key').notNull(),
     accuracy: real('accuracy').notNull(),
     avg_cost_usd: real('avg_cost_usd'),
     avg_latency_ms: real('avg_latency_ms').notNull(),
@@ -89,7 +89,7 @@ export const modelSummaries = sqliteTable(
     // carried=true rows are prior-run summaries copied in at startRun for skipped models.
     carried: integer('carried', { mode: 'boolean' }).notNull().default(false),
   },
-  table => [primaryKey({ columns: [table.run_id, table.model, table.tier] })]
+  table => [primaryKey({ columns: [table.run_id, table.model, table.route_key] })]
 );
 
 export const caseResults = sqliteTable(
@@ -98,7 +98,7 @@ export const caseResults = sqliteTable(
     run_id: text('run_id').notNull(),
     model: text('model').notNull(),
     case_id: text('case_id').notNull(),
-    tier: text('tier'),
+    route_key: text('route_key'),
     score: real('score').notNull(),
     latency_ms: integer('latency_ms').notNull(),
     cost_usd: real('cost_usd'),
@@ -134,7 +134,7 @@ export const routingTableCandidates = sqliteTable(
   'routing_table_candidates',
   {
     run_id: text('run_id').notNull(),
-    tier: text('tier').notNull(),
+    route_key: text('route_key').notNull(),
     rank: integer('rank').notNull(),
     model: text('model').notNull(),
     accuracy: real('accuracy').notNull(),
@@ -145,5 +145,5 @@ export const routingTableCandidates = sqliteTable(
     meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(),
     reasoning_effort: text('reasoning_effort'),
   },
-  table => [primaryKey({ columns: [table.run_id, table.tier, table.rank] })]
+  table => [primaryKey({ columns: [table.run_id, table.route_key, table.rank] })]
 );
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
index 103482e00d..5ba9b0b853 100644
--- a/services/auto-routing-benchmark/src/db.test.ts
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -13,7 +13,7 @@ describe('mapSummaryRow', () => {
     const row = {
       run_id: 'run-1',
       model: 'openai/gpt-4o',
-      tier: 'high',
+      route_key: 'implementation/code_generation',
       accuracy: 0.92,
       avg_cost_usd: 0.0015,
       avg_latency_ms: 320.5,
@@ -27,7 +27,7 @@ describe('mapSummaryRow', () => {
     const result = mapSummaryRow(row);
     expect(result).toEqual<BenchmarkModelSummary>({
       model: 'openai/gpt-4o',
-      tier: 'high',
+      routeKey: 'implementation/code_generation',
       accuracy: 0.92,
       avgCostUsd: 0.0015,
       avgLatencyMs: 320.5,
@@ -43,7 +43,7 @@ describe('mapSummaryRow', () => {
     const row = {
       run_id: 'run-2',
       model: 'anthropic/claude-3-haiku',
-      tier: '*',
+      route_key: '*',
       accuracy: 0.85,
       avg_cost_usd: null,
       avg_latency_ms: 150.0,
@@ -58,7 +58,7 @@ describe('mapSummaryRow', () => {
     expect(result.avgCostUsd).toBeNull();
     expect(result.p50LatencyMs).toBeNull();
     expect(result.p95LatencyMs).toBeNull();
-    expect(result.tier).toBe('*');
+    expect(result.routeKey).toBe('*');
     expect(result.errors).toBe(0);
     expect(result.timeouts).toBe(0);
   });
@@ -88,7 +88,7 @@ describe('mapRunRow', () => {
     const summaries: BenchmarkModelSummary[] = [
       {
         model: 'openai/gpt-4o-mini',
-        tier: '*',
+        routeKey: '*',
         accuracy: 0.78,
         avgCostUsd: 0.0002,
         avgLatencyMs: 120,
@@ -150,10 +150,9 @@ const sampleTable: RoutingTable = {
   minAccuracy: 0.7,
   switchCostFactor: 3,
   source: 'benchmark',
-  tiers: {
-    low: [candidate('model-a'), candidate('model-b')],
-    medium: [candidate('model-c')],
-    high: [candidate('model-a')],
+  routes: {
+    'implementation/code_generation': [candidate('model-a'), candidate('model-b')],
+    'debugging/bug_fixing': [candidate('model-c')],
   },
 };
 
@@ -168,14 +167,16 @@ describe('routingTableToRows', () => {
     expect(tableRow.source).toBe('benchmark');
   });
 
-  it('assigns rank 0,1 for the two low-tier candidates', () => {
+  it('assigns rank 0,1 for the two implementation/code_generation candidates', () => {
     const { candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
-    const lowRows = candidateRows.filter(r => r.tier === 'low').sort((a, b) => a.rank - b.rank);
-    expect(lowRows).toHaveLength(2);
-    expect(lowRows[0].model).toBe('model-a');
-    expect(lowRows[0].rank).toBe(0);
-    expect(lowRows[1].model).toBe('model-b');
-    expect(lowRows[1].rank).toBe(1);
+    const routeRows = candidateRows
+      .filter(r => r.route_key === 'implementation/code_generation')
+      .sort((a, b) => a.rank - b.rank);
+    expect(routeRows).toHaveLength(2);
+    expect(routeRows[0].model).toBe('model-a');
+    expect(routeRows[0].rank).toBe(0);
+    expect(routeRows[1].model).toBe('model-b');
+    expect(routeRows[1].rank).toBe(1);
   });
 });
 
@@ -188,12 +189,12 @@ describe('rowsToRoutingTable', () => {
     expect(RoutingTableSchema.parse(reassembled)).toEqual(sampleTable);
   });
 
-  it('preserves candidate order within each tier', () => {
+  it('preserves candidate order within each route', () => {
     const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
     // Shuffle candidateRows to verify rank-based sorting.
     const shuffled = [...candidateRows].reverse();
     const reassembled = rowsToRoutingTable(tableRow, shuffled);
-    expect(reassembled.tiers.low[0].model).toBe('model-a');
-    expect(reassembled.tiers.low[1].model).toBe('model-b');
+    expect(reassembled.routes['implementation/code_generation']?.[0]?.model).toBe('model-a');
+    expect(reassembled.routes['implementation/code_generation']?.[1]?.model).toBe('model-b');
   });
 });
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
index 8ed87649fa..744adb57f5 100644
--- a/services/auto-routing-benchmark/src/db.ts
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -34,6 +34,11 @@ type ModelSummaryRow = typeof modelSummaries.$inferSelect;
 // ceiling while still batching the delete plus inserts together.
 const MODEL_SUMMARY_INSERT_BATCH_SIZE = 8;
 
+// Routing table candidates bind 8 values per row, so a 10-row INSERT binds 80:
+// comfortably under D1's 100-variable ceiling. Publishing is infrequent, so
+// smaller statements are preferable to risking a skipped routing-table update.
+const ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE = 10;
+
 // ---------------------------------------------------------------------------
 // Row mapping helpers
 // ---------------------------------------------------------------------------
@@ -41,7 +46,7 @@ const MODEL_SUMMARY_INSERT_BATCH_SIZE = 8;
 export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary {
   return {
     model: row.model,
-    tier: row.tier as BenchmarkModelSummary['tier'],
+    routeKey: row.route_key as BenchmarkModelSummary['routeKey'],
     accuracy: row.accuracy,
     avgCostUsd: row.avg_cost_usd,
     avgLatencyMs: row.avg_latency_ms,
@@ -179,7 +184,7 @@ export async function insertRun(
         carriedSummaries.map(s => ({
           run_id: run.id,
           model: s.model,
-          tier: s.tier,
+          route_key: s.routeKey,
           accuracy: s.accuracy,
           avg_cost_usd: s.avgCostUsd,
           avg_latency_ms: s.avgLatencyMs,
@@ -221,7 +226,7 @@ export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Prom
     .onConflictDoUpdate({
       target: [caseResults.run_id, caseResults.model, caseResults.case_id, caseResults.rep],
       set: {
-        tier: row.tier,
+        route_key: row.route_key,
         score: row.score,
         latency_ms: row.latency_ms,
         cost_usd: row.cost_usd,
@@ -251,6 +256,25 @@ export async function getCaseResults(db: D1Database, runId: string): Promise<Cas
   return drizzle(db).select().from(caseResults).where(eq(caseResults.run_id, runId));
 }
 
+export async function getExistingCaseResultIds(
+  db: D1Database,
+  params: { runId: string; model: string; rep: number; caseIds: string[] }
+): Promise<Set<string>> {
+  if (params.caseIds.length === 0) return new Set();
+  const rows = await drizzle(db)
+    .select({ case_id: caseResults.case_id })
+    .from(caseResults)
+    .where(
+      and(
+        eq(caseResults.run_id, params.runId),
+        eq(caseResults.model, params.model),
+        eq(caseResults.rep, params.rep),
+        inArray(caseResults.case_id, params.caseIds)
+      )
+    );
+  return new Set(rows.map(row => row.case_id));
+}
+
 // ---------------------------------------------------------------------------
 // Model summaries
 // ---------------------------------------------------------------------------
@@ -279,7 +303,7 @@ export async function replaceModelSummaries(
         summaryChunk.map(s => ({
           run_id: runId,
           model: s.model,
-          tier: s.tier,
+          route_key: s.routeKey,
           accuracy: s.accuracy,
           avg_cost_usd: s.avgCostUsd,
           avg_latency_ms: s.avgLatencyMs,
@@ -415,8 +439,8 @@ export type PriorModelResult = {
   summaries: BenchmarkModelSummary[];
 };
 
-// Latest summaries per model for a benchmark kind: for each model, all tiers
-// from the most recent COMPLETED run that included it (mixing tiers across
+// Latest summaries per model for a benchmark kind: for each model, all routes
+// from the most recent COMPLETED run that included it (mixing routes across
 // runs would pair incomparable numbers).
 export async function getLatestSummariesByModel(
   db: D1Database,
@@ -426,7 +450,7 @@ export async function getLatestSummariesByModel(
     .select({
       run_id: modelSummaries.run_id,
       model: modelSummaries.model,
-      tier: modelSummaries.tier,
+      route_key: modelSummaries.route_key,
       accuracy: modelSummaries.accuracy,
       avg_cost_usd: modelSummaries.avg_cost_usd,
       avg_latency_ms: modelSummaries.avg_latency_ms,
@@ -492,11 +516,11 @@ export function routingTableToRows(
   };
 
   const candidateRows: RoutingTableCandidateRow[] = [];
-  for (const [tier, candidates] of Object.entries(table.tiers)) {
+  for (const [routeKey, candidates] of Object.entries(table.routes)) {
     candidates.forEach((c, rank) => {
       candidateRows.push({
         run_id: table.version,
-        tier,
+        route_key: routeKey,
         rank,
         model: c.model,
         accuracy: c.accuracy,
@@ -514,14 +538,14 @@ export function rowsToRoutingTable(
   tableRow: RoutingTableRow,
   candidateRows: RoutingTableCandidateRow[]
 ): RoutingTable {
-  const tierMap: Record<string, RankedCandidate[]> = { low: [], medium: [], high: [] };
+  const routeMap: Record<string, RankedCandidate[]> = {};
   const sorted = [...candidateRows].sort((a, b) => {
-    if (a.tier !== b.tier) return a.tier.localeCompare(b.tier);
+    if (a.route_key !== b.route_key) return a.route_key.localeCompare(b.route_key);
     return a.rank - b.rank;
   });
   for (const row of sorted) {
-    if (!(row.tier in tierMap)) tierMap[row.tier] = [];
-    tierMap[row.tier].push({
+    routeMap[row.route_key] ??= [];
+    routeMap[row.route_key].push({
       model: row.model,
       accuracy: row.accuracy,
       avgCostUsd: row.avg_cost_usd,
@@ -535,11 +559,7 @@ export function rowsToRoutingTable(
     minAccuracy: tableRow.min_accuracy,
     switchCostFactor: tableRow.switch_cost_factor,
     source: tableRow.source as RoutingTable['source'],
-    tiers: {
-      low: tierMap.low ?? [],
-      medium: tierMap.medium ?? [],
-      high: tierMap.high ?? [],
-    },
+    routes: routeMap,
   };
 }
 
@@ -568,8 +588,12 @@ export async function saveRoutingTable(
       }),
   ];
 
-  if (candidateRows.length > 0) {
-    stmts.push(orm.insert(routingTableCandidates).values(candidateRows));
+  for (let i = 0; i < candidateRows.length; i += ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE) {
+    stmts.push(
+      orm
+        .insert(routingTableCandidates)
+        .values(candidateRows.slice(i, i + ROUTING_TABLE_CANDIDATE_INSERT_BATCH_SIZE))
+    );
   }
 
   await orm.batch(stmts);
@@ -592,7 +616,7 @@ export async function getLatestRoutingTable(
     .select()
     .from(routingTableCandidates)
     .where(eq(routingTableCandidates.run_id, tableRow.run_id))
-    .orderBy(routingTableCandidates.tier, routingTableCandidates.rank);
+    .orderBy(routingTableCandidates.route_key, routingTableCandidates.rank);
 
   const assembled = rowsToRoutingTable(tableRow, candidateRows);
   const parsed = RoutingTableSchema.safeParse(assembled);
@@ -627,11 +651,11 @@ export async function getClassifierWinner(db: D1Database): Promise<ClassifierWin
 
   if (!runRow) return null;
 
-  // Get the tier='*' summaries for this run (classifier uses '*' tier).
+  // Get the routeKey='*' summaries for this run (classifier has no taxonomy route).
   const summaryRows = await orm
     .select()
     .from(modelSummaries)
-    .where(and(eq(modelSummaries.run_id, runRow.id), eq(modelSummaries.tier, '*')));
+    .where(and(eq(modelSummaries.run_id, runRow.id), eq(modelSummaries.route_key, '*')));
 
   const summaries = summaryRows.map(mapSummaryRow);
   const winner = pickClassifierWinner(
diff --git a/services/auto-routing-benchmark/src/grading.ts b/services/auto-routing-benchmark/src/grading.ts
index 0661e3ac4b..92b8a3ec65 100644
--- a/services/auto-routing-benchmark/src/grading.ts
+++ b/services/auto-routing-benchmark/src/grading.ts
@@ -2,8 +2,7 @@ import type { ClassifierOutput } from '@kilocode/auto-routing-contracts';
 
 // Golden labels grade every classifier field except confidence. subtaskType
 // is worth less than taskType: a wrong subtype under the right type is a near
-// miss. riskLevel gets a small weight matching its small influence on tier
-// derivation.
+// miss. riskLevel gets a small weight because it is a secondary routing signal.
 export type ClassifierExpectation = {
   taskType: ClassifierOutput['taskType'];
   subtaskType: ClassifierOutput['subtaskType'];
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
index dd431b5ce8..75cacb902c 100644
--- a/services/auto-routing-benchmark/src/index.ts
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -23,7 +23,7 @@ export default {
     for (const message of batch.messages) {
       // Deliberately no try/catch: a throw from processJob (transient token,
       // D1 or container failures) must skip the ack so the queue retries the
-      // whole (run, model, chunk) unit, dead-lettering after max_retries.
+      // whole (run, model, rep, chunk) unit, dead-lettering after max_retries.
       // Case-level failures are recorded as failed rows inside processJob and
       // do not throw. Swallowing the throw here would silently drop chunks.
       await processJob(env, message.body);
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
index 8c124ee496..f9a7747f0c 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.test.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -2,24 +2,26 @@ import { describe, expect, it } from 'vitest';
 import type {
   BenchmarkDeciderModel,
   BenchmarkModelSummary,
+  TaxonomyRouteKey,
 } from '@kilocode/auto-routing-contracts';
+import { TAXONOMY_ROUTE_KEYS } from '@kilocode/auto-routing-contracts';
 import { buildRoutingTable } from './routing-table-builder';
 
 const DECIDER_MODELS: BenchmarkDeciderModel[] = [
   { id: 'model/cheap', reasoningEffort: null },
-  { id: 'model/expensive', reasoningEffort: 'medium' },
-  { id: 'model/mid', reasoningEffort: null },
+  { id: 'model/value', reasoningEffort: 'medium' },
+  { id: 'model/weak', reasoningEffort: null },
 ];
 
 function summary(
   model: string,
-  tier: BenchmarkModelSummary['tier'],
+  routeKey: TaxonomyRouteKey | '*',
   accuracy: number,
   avgCostUsd: number | null = 0.001
 ): BenchmarkModelSummary {
   return {
     model,
-    tier,
+    routeKey,
     accuracy,
     avgCostUsd,
     avgLatencyMs: 500,
@@ -31,106 +33,54 @@ function summary(
   };
 }
 
-const ALL_TIERS_SUMMARIES: BenchmarkModelSummary[] = [
-  summary('model/cheap', 'low', 0.9, 0.001),
-  summary('model/expensive', 'low', 0.95, 0.01),
-  summary('model/mid', 'low', 0.8, 0.005),
-  summary('model/cheap', 'medium', 0.75, 0.001),
-  summary('model/expensive', 'medium', 0.85, 0.01),
-  summary('model/mid', 'medium', 0.72, 0.005),
-  summary('model/cheap', 'high', 0.6, 0.001),
-  summary('model/expensive', 'high', 0.9, 0.01),
-  summary('model/mid', 'high', 0.75, 0.005),
-];
+function summariesForEveryRoute(
+  overrides: Partial<Record<TaxonomyRouteKey, BenchmarkModelSummary[]>> = {}
+): BenchmarkModelSummary[] {
+  return TAXONOMY_ROUTE_KEYS.flatMap(
+    routeKey =>
+      overrides[routeKey] ?? [
+        summary('model/cheap', routeKey, 0.7, 0.007),
+        summary('model/value', routeKey, 0.9, 0.008),
+        summary('model/weak', routeKey, 0.5, 0.001),
+      ]
+  );
+}
 
 describe('buildRoutingTable', () => {
-  it('cheapest above-threshold model comes first per tier', () => {
+  it('ranks candidates by lowest cost per accuracy for each taxonomy route', () => {
     const table = buildRoutingTable({
       runId: 'test-run-1',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
       switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
-      summaries: ALL_TIERS_SUMMARIES,
+      summaries: summariesForEveryRoute(),
     });
 
-    // low tier: cheap (0.001) and mid (0.005) and expensive (0.01) all meet threshold (0.7)
-    // cheapest first
-    expect(table.tiers.low[0].model).toBe('model/cheap');
-    expect(table.tiers.low[1].model).toBe('model/mid');
-    expect(table.tiers.low[2].model).toBe('model/expensive');
-
-    // medium tier: all meet threshold, cheapest first
-    expect(table.tiers.medium[0].model).toBe('model/cheap');
-    expect(table.tiers.medium[1].model).toBe('model/mid');
-    expect(table.tiers.medium[2].model).toBe('model/expensive');
-
-    // high tier: expensive (0.9) and mid (0.75) meet threshold; cheap (0.6) does not
-    // meeting threshold first, then by cost; cheap last (below threshold)
-    expect(table.tiers.high[0].model).toBe('model/mid'); // meets threshold, cheaper
-    expect(table.tiers.high[1].model).toBe('model/expensive'); // meets threshold, more expensive
-    expect(table.tiers.high[2].model).toBe('model/cheap'); // below threshold
+    expect(table.routes['implementation/code_generation']?.map(c => c.model)).toEqual([
+      'model/value',
+      'model/cheap',
+      'model/weak',
+    ]);
   });
 
-  it('excludes a model whose tier summary has no cost signal', () => {
+  it('excludes a model whose route summary has no cost signal', () => {
+    const routeKey = 'implementation/code_generation';
     const table = buildRoutingTable({
       runId: 'test-run-nocost',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
       switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
-      summaries: ALL_TIERS_SUMMARIES.map(s =>
-        s.model === 'model/cheap' && s.tier === 'low' ? { ...s, avgCostUsd: null } : s
-      ),
-    });
-
-    // model/cheap would have won 'low' as cheapest; without a cost signal it
-    // must not be ranked (unknown cost is not zero cost).
-    expect(table.tiers.low.map(c => c.model)).toEqual(['model/mid', 'model/expensive']);
-  });
-
-  it('marks meetsThreshold correctly', () => {
-    const table = buildRoutingTable({
-      runId: 'test-run-2',
-      generatedAt: '2026-01-01T00:00:00.000Z',
-      minAccuracy: 0.7,
-      switchCostFactor: 3,
-      deciderModels: DECIDER_MODELS,
-      summaries: ALL_TIERS_SUMMARIES,
+      summaries: summariesForEveryRoute({
+        [routeKey]: [
+          summary('model/cheap', routeKey, 0.7, null),
+          summary('model/value', routeKey, 0.9, 0.008),
+        ],
+      }),
     });
 
-    for (const candidate of table.tiers.low) {
-      expect(candidate.meetsThreshold).toBe(candidate.accuracy >= 0.7);
-    }
-  });
-
-  it('excludes a model absent from a tier summaries', () => {
-    // model/cheap has no 'high' summary entry
-    const summaries: BenchmarkModelSummary[] = [
-      summary('model/cheap', 'low', 0.9),
-      summary('model/cheap', 'medium', 0.8),
-      // no 'high' entry for model/cheap
-      summary('model/expensive', 'low', 0.9),
-      summary('model/expensive', 'medium', 0.8),
-      summary('model/expensive', 'high', 0.9),
-      summary('model/mid', 'low', 0.8),
-      summary('model/mid', 'medium', 0.75),
-      summary('model/mid', 'high', 0.75),
-    ];
-
-    const table = buildRoutingTable({
-      runId: 'test-run-3',
-      generatedAt: '2026-01-01T00:00:00.000Z',
-      minAccuracy: 0.7,
-      switchCostFactor: 3,
-      deciderModels: DECIDER_MODELS,
-      summaries,
-    });
-
-    const highModels = table.tiers.high.map(c => c.model);
-    expect(highModels).not.toContain('model/cheap');
-    expect(highModels).toContain('model/expensive');
-    expect(highModels).toContain('model/mid');
+    expect(table.routes[routeKey]?.map(c => c.model)).toEqual(['model/value']);
   });
 
   it('carries reasoningEffort from the run snapshot', () => {
@@ -140,119 +90,43 @@ describe('buildRoutingTable', () => {
       minAccuracy: 0.7,
       switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
-      summaries: ALL_TIERS_SUMMARIES,
-    });
-
-    const expensiveInLow = table.tiers.low.find(c => c.model === 'model/expensive');
-    expect(expensiveInLow?.reasoningEffort).toBe('medium');
-
-    const midInLow = table.tiers.low.find(c => c.model === 'model/mid');
-    expect(midInLow?.reasoningEffort).toBeNull();
-  });
-
-  it('defaults reasoningEffort to null when model missing from the snapshot', () => {
-    const summaries: BenchmarkModelSummary[] = [
-      summary('model/unknown', 'low', 0.9),
-      summary('model/cheap', 'low', 0.8),
-      summary('model/cheap', 'medium', 0.8),
-      summary('model/cheap', 'high', 0.8),
-      summary('model/unknown', 'medium', 0.9),
-      summary('model/unknown', 'high', 0.9),
-    ];
-
-    const table = buildRoutingTable({
-      runId: 'test-run-5',
-      generatedAt: '2026-01-01T00:00:00.000Z',
-      minAccuracy: 0.7,
-      switchCostFactor: 3,
-      deciderModels: DECIDER_MODELS,
-      summaries,
+      summaries: summariesForEveryRoute(),
     });
 
-    const unknown = table.tiers.low.find(c => c.model === 'model/unknown');
-    expect(unknown?.reasoningEffort).toBeNull();
-  });
-
-  it('throws when a tier has no candidates', () => {
-    // Only low and medium summaries — high is missing entirely
-    const summaries: BenchmarkModelSummary[] = [
-      summary('model/cheap', 'low', 0.9),
-      summary('model/expensive', 'low', 0.9),
-      summary('model/mid', 'low', 0.9),
-      summary('model/cheap', 'medium', 0.9),
-      summary('model/expensive', 'medium', 0.9),
-      summary('model/mid', 'medium', 0.9),
-    ];
+    const value = table.routes['implementation/code_generation']?.find(
+      c => c.model === 'model/value'
+    );
+    expect(value?.reasoningEffort).toBe('medium');
 
-    expect(() =>
-      buildRoutingTable({
-        runId: 'test-run-6',
-        generatedAt: '2026-01-01T00:00:00.000Z',
-        minAccuracy: 0.7,
-        switchCostFactor: 3,
-        deciderModels: DECIDER_MODELS,
-        summaries,
-      })
-    ).toThrow();
+    const cheap = table.routes['implementation/code_generation']?.find(
+      c => c.model === 'model/cheap'
+    );
+    expect(cheap?.reasoningEffort).toBeNull();
   });
 
-  it('throws when a tier has only zero-case entries', () => {
-    const summaries: BenchmarkModelSummary[] = [
-      ...ALL_TIERS_SUMMARIES.filter(s => s.tier !== 'high'),
-      // high tier entries with 0 cases — should be excluded
-      { ...summary('model/cheap', 'high', 0.9), cases: 0 },
-      { ...summary('model/expensive', 'high', 0.9), cases: 0 },
-      { ...summary('model/mid', 'high', 0.9), cases: 0 },
-    ];
-
+  it('throws when any taxonomy route has no candidates', () => {
     expect(() =>
       buildRoutingTable({
-        runId: 'test-run-7',
+        runId: 'test-run-missing-route',
         generatedAt: '2026-01-01T00:00:00.000Z',
         minAccuracy: 0.7,
         switchCostFactor: 3,
         deciderModels: DECIDER_MODELS,
-        summaries,
+        summaries: summariesForEveryRoute({ 'implementation/code_generation': [] }),
       })
     ).toThrow();
   });
 
-  it('ignores classifier-style * tier summaries', () => {
-    const summaries: BenchmarkModelSummary[] = [
-      ...ALL_TIERS_SUMMARIES,
-      // classifier summaries with '*' tier — should be ignored
-      summary('model/cheap', '*', 0.95),
-      summary('model/expensive', '*', 0.95),
-    ];
-
-    // Should not throw and * tier entries should not affect output
+  it('ignores classifier-style * route summaries', () => {
     const table = buildRoutingTable({
-      runId: 'test-run-8',
+      runId: 'test-run-classifier-summary',
       generatedAt: '2026-01-01T00:00:00.000Z',
       minAccuracy: 0.7,
       switchCostFactor: 3,
       deciderModels: DECIDER_MODELS,
-      summaries,
-    });
-
-    expect(table.tiers.low.length).toBe(3);
-    expect(table.tiers.medium.length).toBe(3);
-  });
-
-  it('sets version and generatedAt from params', () => {
-    const table = buildRoutingTable({
-      runId: 'decider-2026-01-01',
-      generatedAt: '2026-01-01T12:00:00.000Z',
-      minAccuracy: 0.7,
-      switchCostFactor: 3,
-      deciderModels: DECIDER_MODELS,
-      summaries: ALL_TIERS_SUMMARIES,
+      summaries: [...summariesForEveryRoute(), summary('model/value', '*', 1, 0.0001)],
     });
 
-    expect(table.version).toBe('decider-2026-01-01');
-    expect(table.generatedAt).toBe('2026-01-01T12:00:00.000Z');
-    expect(table.source).toBe('benchmark');
-    expect(table.minAccuracy).toBe(0.7);
-    expect(table.switchCostFactor).toBe(3);
+    expect(table.routes['implementation/code_generation']).toHaveLength(3);
   });
 });
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
index 222f19436f..27e09a177b 100644
--- a/services/auto-routing-benchmark/src/routing-table-builder.ts
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -1,17 +1,18 @@
 import {
   rankCandidates,
   RoutingTableSchema,
+  TAXONOMY_ROUTE_KEYS,
   type BenchmarkDeciderModel,
   type BenchmarkModelSummary,
-  type DifficultyTier,
   type RoutingTable,
+  type TaxonomyRouteKey,
 } from '@kilocode/auto-routing-contracts';
 
-// Builds the routing table from per-(model, tier) decider summaries. Models
-// with zero graded cases in a tier are excluded from that tier, as are
+// Builds the routing table from per-(model, taxonomy-route) decider summaries. Models
+// with zero graded cases in a route are excluded from that route, as are
 // models with no cost signal at all (avgCostUsd null means every case failed
-// to report cost; ranking such a model as cheapest would hand it the tier).
-// Throws when any tier ends up empty so the caller keeps the previous
+// to report cost; ranking such a model as cheapest would hand it the route).
+// Throws when any route ends up empty so the caller keeps the previous
 // published table. deciderModels/minAccuracy/switchCostFactor come from the
 // run's snapshot, not live config.
 export function buildRoutingTable(params: {
@@ -25,10 +26,10 @@ export function buildRoutingTable(params: {
   const { runId, generatedAt, minAccuracy, switchCostFactor, deciderModels, summaries } = params;
   const modelConfigById = new Map(deciderModels.map(m => [m.id, m] as const));
 
-  const tierCandidates = (t: DifficultyTier) =>
+  const routeCandidates = (routeKey: TaxonomyRouteKey) =>
     rankCandidates(
       summaries
-        .filter(s => s.tier === t && s.cases > 0 && s.avgCostUsd !== null)
+        .filter(s => s.routeKey === routeKey && s.cases > 0 && s.avgCostUsd !== null)
         .map(s => ({
           model: s.model,
           accuracy: s.accuracy,
@@ -38,21 +39,21 @@ export function buildRoutingTable(params: {
       minAccuracy
     );
 
+  const routes = Object.fromEntries(
+    TAXONOMY_ROUTE_KEYS.map(routeKey => [routeKey, routeCandidates(routeKey)] as const)
+  );
+
   const table: RoutingTable = {
     version: runId,
     generatedAt,
     minAccuracy,
     switchCostFactor,
     source: 'benchmark',
-    tiers: {
-      low: tierCandidates('low'),
-      medium: tierCandidates('medium'),
-      high: tierCandidates('high'),
-    },
+    routes,
   };
 
-  // RoutingTableSchema enforces .min(1) on each tier array; throws ZodError
-  // when a tier is empty — caller logs and skips publish, keeping the previous
+  // RoutingTableSchema enforces .min(1) on each route array; .parse() throws ZodError
+  // when a route is empty — caller logs and skips publish, keeping the previous
   // live table intact.
   return RoutingTableSchema.parse(table);
 }
diff --git a/services/auto-routing-benchmark/src/run-process-job.test.ts b/services/auto-routing-benchmark/src/run-process-job.test.ts
new file mode 100644
index 0000000000..955820cc92
--- /dev/null
+++ b/services/auto-routing-benchmark/src/run-process-job.test.ts
@@ -0,0 +1,302 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+import type * as CliRunnerModule from './cli-runner';
+import type * as DbModule from './db';
+import { DECIDER_CASES } from './datasets/decider-cases';
+
+vi.mock('./db', async importOriginal => {
+  const actual = await importOriginal<typeof DbModule>();
+  return {
+    ...actual,
+    countCaseResults: vi.fn(),
+    existsNewerCompletedRun: vi.fn(),
+    getCaseResults: vi.fn(),
+    getExistingCaseResultIds: vi.fn(),
+    getRunWithModels: vi.fn(),
+    getSummaries: vi.fn(),
+    markRunCompleted: vi.fn(),
+    replaceModelSummaries: vi.fn(),
+    saveRoutingTable: vi.fn(),
+    upsertCaseResult: vi.fn(),
+  };
+});
+
+vi.mock('./cli-runner', async importOriginal => {
+  const actual = await importOriginal<typeof CliRunnerModule>();
+  return {
+    ...actual,
+    destroyDeciderCliContainer: vi.fn(),
+    runDeciderCaseViaCli: vi.fn(),
+    warmUpCliContainer: vi.fn(),
+  };
+});
+
+import {
+  destroyDeciderCliContainer,
+  runDeciderCaseViaCli,
+  warmUpCliContainer,
+  type CliRunResult,
+} from './cli-runner';
+import {
+  countCaseResults,
+  getExistingCaseResultIds,
+  getRunWithModels,
+  upsertCaseResult,
+} from './db';
+import { processJob } from './run';
+
+const tokenGet = vi.fn<() => Promise<string>>();
+const queueSendBatch = vi.fn<(messages: unknown[]) => Promise<void>>();
+const model = 'qwen/qwen3-coder-next';
+const runId = 'decider-test-run';
+const [benchCase] = DECIDER_CASES;
+
+const successfulCliResult = {
+  text: 'not the expected answer',
+  costUsd: null,
+  latencyMs: 25,
+  exitCode: 0,
+  stderrTail: '',
+  eventCount: 1,
+  lastEventTypes: ['session.created'],
+  timedOut: false,
+} satisfies CliRunResult;
+
+const env = {
+  INTERNAL_API_SECRET_PROD: { get: tokenGet },
+  BENCH_DB: {} as D1Database,
+  BENCH_QUEUE: { sendBatch: queueSendBatch },
+  AUTO_ROUTING_CONFIG: { delete: vi.fn() },
+} as unknown as Env;
+
+function mockRunSnapshot(): void {
+  vi.mocked(getRunWithModels).mockResolvedValue({
+    run: {
+      max_concurrency: 4,
+      min_accuracy: 0.7,
+      switch_cost_factor: 3,
+      benchmark_user_id: 'benchmark-user',
+      repetitions: 1,
+      classifier_max_p95_latency_ms: null,
+      started_at: '2026-06-16T00:00:00.000Z',
+    },
+    models: [{ model, enqueued: true, reasoning_effort: null }],
+  } as never);
+}
+
+function deciderMessage() {
+  return {
+    runId,
+    kind: 'decider',
+    model,
+    caseIds: [benchCase.id],
+    chunk: 0,
+    rep: 0,
+  };
+}
+
+beforeEach(() => {
+  vi.clearAllMocks();
+  tokenGet.mockResolvedValue('internal-secret');
+  queueSendBatch.mockResolvedValue(undefined);
+  vi.stubGlobal(
+    'fetch',
+    vi.fn(async () =>
+      Response.json({ token: 'kilo-user-token', expiresAt: '2026-06-16T01:00:00.000Z' })
+    )
+  );
+  mockRunSnapshot();
+  vi.mocked(countCaseResults).mockResolvedValue(0);
+  vi.mocked(getExistingCaseResultIds).mockResolvedValue(new Set());
+  vi.mocked(destroyDeciderCliContainer).mockResolvedValue(undefined);
+  vi.mocked(warmUpCliContainer).mockResolvedValue(undefined);
+  vi.mocked(runDeciderCaseViaCli).mockResolvedValue(successfulCliResult);
+});
+
+afterEach(() => {
+  vi.unstubAllGlobals();
+});
+
+describe('processJob — decider container availability failures', () => {
+  it.each([
+    'container /run failed: HTTP 503 There is no Container instance available at this time. This is likely because you have reached your max concurrent instance count.',
+    'container /run failed: HTTP 503 Maximum number of running container instances exceeded',
+    'container /run failed: HTTP 503 There is no container instance that can be provided to this Durable Object, try again later',
+  ])('lets the queue retry %s', async message => {
+    vi.mocked(runDeciderCaseViaCli).mockRejectedValueOnce(new Error(message));
+
+    await expect(processJob(env, deciderMessage())).rejects.toThrow(message);
+
+    expect(upsertCaseResult).not.toHaveBeenCalled();
+    expect(countCaseResults).not.toHaveBeenCalled();
+  });
+
+  it('lets the queue retry warmup capacity failures before running cases', async () => {
+    const message =
+      'container /warmup failed: HTTP 503 There is no Container instance available at this time';
+    vi.mocked(warmUpCliContainer).mockRejectedValueOnce(new Error(message));
+
+    await expect(processJob(env, deciderMessage())).rejects.toThrow(message);
+
+    expect(runDeciderCaseViaCli).not.toHaveBeenCalled();
+    expect(upsertCaseResult).not.toHaveBeenCalled();
+    expect(countCaseResults).not.toHaveBeenCalled();
+  });
+});
+
+describe('processJob — decider chunk chaining', () => {
+  it('runs a chunk on the model-repetition shard container and enqueues the next chunk', async () => {
+    const message = {
+      ...deciderMessage(),
+      caseIds: DECIDER_CASES.slice(0, 5).map(c => c.id),
+    };
+
+    await processJob(env, message);
+
+    expect(warmUpCliContainer).toHaveBeenCalledWith(
+      env,
+      expect.objectContaining({ instanceName: `${runId}:${model}:0:0` })
+    );
+    expect(runDeciderCaseViaCli).toHaveBeenCalledWith(
+      env,
+      expect.objectContaining({ instanceName: `${runId}:${model}:0:0` })
+    );
+    expect(queueSendBatch).toHaveBeenCalledWith([
+      {
+        body: {
+          runId,
+          kind: 'decider',
+          model,
+          chunk: 1,
+          shard: 0,
+          shardCount: 1,
+          rep: 0,
+          caseIds: DECIDER_CASES.slice(5, 10).map(c => c.id),
+        },
+      },
+    ]);
+    expect(countCaseResults).not.toHaveBeenCalled();
+  });
+
+  it('enqueues the next chunk assigned to the same shard lane', async () => {
+    const chunk = 2;
+    const shard = 2;
+    const shardCount = 8;
+    const currentCaseIds = DECIDER_CASES.slice(chunk * 5, chunk * 5 + 5).map(c => c.id);
+    const nextChunk = chunk + shardCount;
+    const nextCaseIds = DECIDER_CASES.slice(nextChunk * 5, nextChunk * 5 + 5).map(c => c.id);
+
+    await processJob(env, {
+      ...deciderMessage(),
+      chunk,
+      shard,
+      shardCount,
+      caseIds: currentCaseIds,
+    });
+
+    expect(warmUpCliContainer).toHaveBeenCalledWith(
+      env,
+      expect.objectContaining({ instanceName: `${runId}:${model}:0:2` })
+    );
+    expect(queueSendBatch).toHaveBeenCalledWith([
+      {
+        body: {
+          runId,
+          kind: 'decider',
+          model,
+          chunk: nextChunk,
+          shard,
+          shardCount,
+          rep: 0,
+          caseIds: nextCaseIds,
+        },
+      },
+    ]);
+    expect(countCaseResults).not.toHaveBeenCalled();
+  });
+
+  it('does not rerun completed chunk cases or enqueue a fully completed next chunk', async () => {
+    const currentCaseIds = DECIDER_CASES.slice(0, 5).map(c => c.id);
+    const nextCaseIds = DECIDER_CASES.slice(5, 10).map(c => c.id);
+    vi.mocked(getExistingCaseResultIds)
+      .mockResolvedValueOnce(new Set(currentCaseIds))
+      .mockResolvedValueOnce(new Set(nextCaseIds));
+
+    await processJob(env, { ...deciderMessage(), caseIds: currentCaseIds });
+
+    expect(warmUpCliContainer).not.toHaveBeenCalled();
+    expect(runDeciderCaseViaCli).not.toHaveBeenCalled();
+    expect(upsertCaseResult).not.toHaveBeenCalled();
+    expect(queueSendBatch).not.toHaveBeenCalled();
+  });
+
+  it('re-enqueues a partially completed next chunk so DLQ leftovers cannot strand a run', async () => {
+    const currentCaseIds = DECIDER_CASES.slice(0, 5).map(c => c.id);
+    const nextCaseIds = DECIDER_CASES.slice(5, 10).map(c => c.id);
+    vi.mocked(getExistingCaseResultIds)
+      .mockResolvedValueOnce(new Set(currentCaseIds))
+      .mockResolvedValueOnce(new Set([nextCaseIds[0]]));
+
+    await processJob(env, { ...deciderMessage(), caseIds: currentCaseIds });
+
+    expect(warmUpCliContainer).not.toHaveBeenCalled();
+    expect(runDeciderCaseViaCli).not.toHaveBeenCalled();
+    expect(upsertCaseResult).not.toHaveBeenCalled();
+    expect(queueSendBatch).toHaveBeenCalledWith([
+      {
+        body: {
+          runId,
+          kind: 'decider',
+          model,
+          chunk: 1,
+          shard: 0,
+          shardCount: 1,
+          rep: 0,
+          caseIds: nextCaseIds,
+        },
+      },
+    ]);
+  });
+
+  it('destroys the model-repetition shard container after the terminal chunk', async () => {
+    const terminalChunk = Math.floor((DECIDER_CASES.length - 1) / 5);
+    const terminalCaseIds = DECIDER_CASES.slice(terminalChunk * 5).map(c => c.id);
+
+    await processJob(env, {
+      ...deciderMessage(),
+      chunk: terminalChunk,
+      shard: 3,
+      shardCount: 4,
+      caseIds: terminalCaseIds,
+    });
+
+    expect(queueSendBatch).not.toHaveBeenCalled();
+    expect(destroyDeciderCliContainer).toHaveBeenCalledWith(env, {
+      instanceName: `${runId}:${model}:0:3`,
+    });
+    expect(countCaseResults).toHaveBeenCalled();
+  });
+
+  it('finalizes terminal chunks even when best-effort container destroy fails', async () => {
+    const terminalChunk = Math.floor((DECIDER_CASES.length - 1) / 5);
+    const terminalCaseIds = DECIDER_CASES.slice(terminalChunk * 5).map(c => c.id);
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    vi.mocked(destroyDeciderCliContainer).mockRejectedValueOnce(new Error('already stopped'));
+
+    await processJob(env, {
+      ...deciderMessage(),
+      chunk: terminalChunk,
+      shard: 3,
+      shardCount: 4,
+      caseIds: terminalCaseIds,
+    });
+
+    expect(destroyDeciderCliContainer).toHaveBeenCalledWith(env, {
+      instanceName: `${runId}:${model}:0:3`,
+    });
+    expect(warn).toHaveBeenCalledWith(
+      expect.stringContaining('benchmark_container_destroy_failed')
+    );
+    expect(countCaseResults).toHaveBeenCalled();
+    warn.mockRestore();
+  });
+});
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
index 9d40b883e0..4c38613658 100644
--- a/services/auto-routing-benchmark/src/run.test.ts
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -5,7 +5,9 @@ import {
   buildClassifierMessages,
   buildDeciderMessages,
   chunkArray,
+  computeDeciderShardCount,
   computeEngineIdentity,
+  getDeciderContainerInstanceName,
   runCasesWithConcurrency,
   summarize,
 } from './run';
@@ -16,7 +18,7 @@ function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
     run_id: 'run-1',
     model: 'model/a',
     case_id: 'case-1',
-    tier: null,
+    route_key: null,
     score: 1,
     latency_ms: 100,
     cost_usd: 0.001,
@@ -34,12 +36,12 @@ function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
 }
 
 describe('summarize — classifier kind', () => {
-  it('groups all classifier rows under * tier', () => {
+  it('groups all classifier rows under * route key', () => {
     const rows: CaseResultRow[] = [
       makeRow({
         model: 'model/a',
         case_id: 'c1',
-        tier: null,
+        route_key: null,
         score: 1,
         latency_ms: 100,
         cost_usd: 0.001,
@@ -47,7 +49,7 @@ describe('summarize — classifier kind', () => {
       makeRow({
         model: 'model/a',
         case_id: 'c2',
-        tier: null,
+        route_key: null,
         score: 0.5,
         latency_ms: 200,
         cost_usd: 0.002,
@@ -58,7 +60,7 @@ describe('summarize — classifier kind', () => {
     expect(summaries).toHaveLength(1);
     const [s] = summaries;
     expect(s.model).toBe('model/a');
-    expect(s.tier).toBe('*');
+    expect(s.routeKey).toBe('*');
     expect(s.cases).toBe(2);
   });
 
@@ -123,39 +125,65 @@ describe('summarize — classifier kind', () => {
 });
 
 describe('summarize — decider kind', () => {
-  it('groups by tier', () => {
+  it('groups by taxonomy route key', () => {
     const rows: CaseResultRow[] = [
-      makeRow({ model: 'model/a', case_id: 'low-1', tier: 'low', score: 1 }),
-      makeRow({ model: 'model/a', case_id: 'low-2', tier: 'low', score: 0 }),
-      makeRow({ model: 'model/a', case_id: 'med-1', tier: 'medium', score: 1 }),
-      makeRow({ model: 'model/b', case_id: 'low-3', tier: 'low', score: 1 }),
+      makeRow({
+        model: 'model/a',
+        case_id: 'impl-1',
+        route_key: 'implementation/code_generation',
+        score: 1,
+      }),
+      makeRow({
+        model: 'model/a',
+        case_id: 'impl-2',
+        route_key: 'implementation/code_generation',
+        score: 0,
+      }),
+      makeRow({
+        model: 'model/a',
+        case_id: 'debug-1',
+        route_key: 'debugging/bug_fixing',
+        score: 1,
+      }),
+      makeRow({
+        model: 'model/b',
+        case_id: 'impl-3',
+        route_key: 'implementation/code_generation',
+        score: 1,
+      }),
     ];
 
     const summaries = summarize(rows, 'decider');
     expect(summaries).toHaveLength(3);
 
-    const aLow = summaries.find(s => s.model === 'model/a' && s.tier === 'low');
-    expect(aLow?.cases).toBe(2);
-    expect(aLow?.accuracy).toBe(0.5);
+    const aImpl = summaries.find(
+      s => s.model === 'model/a' && s.routeKey === 'implementation/code_generation'
+    );
+    expect(aImpl?.cases).toBe(2);
+    expect(aImpl?.accuracy).toBe(0.5);
 
-    const aMed = summaries.find(s => s.model === 'model/a' && s.tier === 'medium');
-    expect(aMed?.cases).toBe(1);
-    expect(aMed?.accuracy).toBe(1);
+    const aDebug = summaries.find(
+      s => s.model === 'model/a' && s.routeKey === 'debugging/bug_fixing'
+    );
+    expect(aDebug?.cases).toBe(1);
+    expect(aDebug?.accuracy).toBe(1);
 
-    const bLow = summaries.find(s => s.model === 'model/b' && s.tier === 'low');
-    expect(bLow?.cases).toBe(1);
+    const bImpl = summaries.find(
+      s => s.model === 'model/b' && s.routeKey === 'implementation/code_generation'
+    );
+    expect(bImpl?.cases).toBe(1);
   });
 
-  it('uses * fallback when tier is null', () => {
-    const rows: CaseResultRow[] = [makeRow({ tier: null, score: 1 })];
+  it('uses * fallback when route key is null', () => {
+    const rows: CaseResultRow[] = [makeRow({ route_key: null, score: 1 })];
     const [s] = summarize(rows, 'decider');
-    expect(s.tier).toBe('*');
+    expect(s.routeKey).toBe('*');
   });
 
   it('computes avgLatencyMs as rounded mean', () => {
     const rows: CaseResultRow[] = [
-      makeRow({ case_id: 'c1', tier: 'low', latency_ms: 100 }),
-      makeRow({ case_id: 'c2', tier: 'low', latency_ms: 301 }),
+      makeRow({ case_id: 'c1', route_key: 'implementation/code_generation', latency_ms: 100 }),
+      makeRow({ case_id: 'c2', route_key: 'implementation/code_generation', latency_ms: 301 }),
     ];
 
     const [s] = summarize(rows, 'decider');
@@ -163,7 +191,9 @@ describe('summarize — decider kind', () => {
   });
 
   it('handles single-element groups for p50', () => {
-    const rows: CaseResultRow[] = [makeRow({ tier: 'high', latency_ms: 500 })];
+    const rows: CaseResultRow[] = [
+      makeRow({ route_key: 'implementation/code_generation', latency_ms: 500 }),
+    ];
     const [s] = summarize(rows, 'decider');
     expect(s.p50LatencyMs).toBe(500);
   });
@@ -266,7 +296,7 @@ describe('chunkArray', () => {
 describe('pickClassifierWinner', () => {
   const summary = (model: string, accuracy: number, avgCostUsd: number | null) => ({
     model,
-    tier: '*' as const,
+    routeKey: '*' as const,
     accuracy,
     avgCostUsd,
     avgLatencyMs: 100,
@@ -298,9 +328,12 @@ describe('pickClassifierWinner', () => {
     expect(winner?.model).toBe('cheap');
   });
 
-  it('ignores decider-tier summaries and returns null when nothing is graded', () => {
+  it('ignores decider route summaries and returns null when nothing is graded', () => {
     expect(
-      pickClassifierWinner([{ ...summary('m', 1, 0.001), tier: 'low' as const }], 0.7)
+      pickClassifierWinner(
+        [{ ...summary('m', 1, 0.001), routeKey: 'implementation/code_generation' as const }],
+        0.7
+      )
     ).toBeNull();
     expect(pickClassifierWinner([], 0.7)).toBeNull();
   });
@@ -313,7 +346,7 @@ describe('pickClassifierWinner', () => {
     p95: number | null = 90
   ) => ({
     model,
-    tier: '*' as const,
+    routeKey: '*' as const,
     accuracy,
     avgCostUsd,
     avgLatencyMs: 100,
@@ -412,8 +445,7 @@ describe('summarize — p95 and timeouts', () => {
 });
 
 describe('decider message fan-out', () => {
-  it('DECIDER_CHUNK_SIZE is 5 (chunk count for 76 cases)', () => {
-    // DECIDER_CASES = 76, chunk size 5 → ceil(76/5) = 16 chunks
+  it('DECIDER_CHUNK_SIZE is 5', () => {
     const chunks = chunkArray(
       Array.from({ length: 76 }, (_, i) => String(i)),
       5
@@ -429,45 +461,95 @@ describe('decider message fan-out', () => {
       kind: 'decider',
       model: 'm1',
       rep: 2,
+      shard: 1,
+      shardCount: 4,
       caseIds: ['a'],
       chunk: 0,
     });
     expect(withRep.rep).toBe(2);
+    expect(withRep.shard).toBe(1);
+    expect(withRep.shardCount).toBe(4);
   });
 
-  it('buildDeciderMessages: produces models × reps × ceil(76/5) messages with correct rep', () => {
-    // 76 cases, chunk size 5 → 16 chunks
-    const cases76 = Array.from({ length: 76 }, (_, i) => ({ id: `case-${i}` }));
-    const chunks = chunkArray(cases76, 5);
-    expect(chunks).toHaveLength(16);
+  it('computeDeciderShardCount maximizes shard lanes under the live container cap', () => {
+    expect(computeDeciderShardCount({ modelCount: 2, repetitions: 3, chunkCount: 36 })).toBe(16);
+    expect(
+      computeDeciderShardCount({
+        modelCount: 7,
+        repetitions: 1,
+        chunkCount: 36,
+        maxLiveContainers: 100,
+      })
+    ).toBe(14);
+    expect(
+      computeDeciderShardCount({
+        modelCount: 25,
+        repetitions: 1,
+        chunkCount: 36,
+        maxLiveContainers: 100,
+      })
+    ).toBe(4);
+    expect(
+      computeDeciderShardCount({
+        modelCount: 10,
+        repetitions: 3,
+        chunkCount: 36,
+        maxLiveContainers: 100,
+      })
+    ).toBe(3);
+    expect(
+      computeDeciderShardCount({
+        modelCount: 101,
+        repetitions: 1,
+        chunkCount: 36,
+        maxLiveContainers: 100,
+      })
+    ).toBe(0);
+  });
+
+  it('buildDeciderMessages: seeds sharded chunk lanes under the container cap', () => {
+    const cases180 = Array.from({ length: 180 }, (_, i) => ({ id: `case-${i}` }));
+    const chunks = chunkArray(cases180, 5);
+    expect(chunks).toHaveLength(36);
 
     const models = ['model/a', 'model/b'];
     const repetitions = 3;
     const messages = buildDeciderMessages('run-test', 'decider', models, repetitions, chunks);
+    const expectedShardCount = 16;
 
-    // Total: 2 models × 3 reps × 16 chunks = 96 messages
-    expect(messages).toHaveLength(models.length * repetitions * chunks.length);
+    // Initial fan-out is bounded by the 100-container budget while running
+    // multiple independent chunk lanes per model/repetition.
+    expect(messages).toHaveLength(models.length * repetitions * expectedShardCount);
+    expect(messages.length).toBeLessThanOrEqual(100);
 
-    // Each rep index (0..2) should appear exactly models.length × chunks.length times
     for (let rep = 0; rep < repetitions; rep++) {
       const forRep = messages.filter(m => m.body.rep === rep);
-      expect(forRep).toHaveLength(models.length * chunks.length);
+      expect(forRep).toHaveLength(models.length * expectedShardCount);
     }
 
-    // Every message carries the correct rep in its body
     for (const { body } of messages) {
       expect(typeof body.rep).toBe('number');
       expect(body.rep).toBeGreaterThanOrEqual(0);
       expect(body.rep).toBeLessThan(repetitions);
+      expect(body.shardCount).toBe(expectedShardCount);
+      expect(body.shard).toBeGreaterThanOrEqual(0);
+      expect(body.shard).toBeLessThan(expectedShardCount);
+      expect(body.chunk).toBe(body.shard);
+      expect(body.caseIds).toEqual(chunks[body.shard!]?.map(c => c.id));
     }
+  });
 
-    // caseIds on each message match the chunk
-    for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
-      const forChunk = messages.filter(m => m.body.chunk === chunkIdx);
-      for (const { body } of forChunk) {
-        expect(body.caseIds).toEqual(chunks[chunkIdx].map(c => c.id));
-      }
-    }
+  it('getDeciderContainerInstanceName reuses one container per model repetition shard', () => {
+    const base = { runId: 'run-test', kind: 'decider' as const, model: 'model/a', rep: 2 };
+    expect(getDeciderContainerInstanceName({ ...base, chunk: 0, shard: 0 })).toBe(
+      'run-test:model/a:2:0'
+    );
+    expect(getDeciderContainerInstanceName({ ...base, chunk: 16, shard: 0 })).toBe(
+      'run-test:model/a:2:0'
+    );
+    expect(getDeciderContainerInstanceName({ ...base, chunk: 1, shard: 1 })).toBe(
+      'run-test:model/a:2:1'
+    );
   });
 });
 
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
index 326134cdd7..f4aa1dbd26 100644
--- a/services/auto-routing-benchmark/src/run.ts
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -5,6 +5,7 @@ import {
   type BenchmarkDeciderModel,
   type BenchmarkKind,
   type BenchmarkModelSummary,
+  taxonomyRouteKey,
 } from '@kilocode/auto-routing-contracts';
 import { formatError } from '@kilocode/worker-utils';
 import * as z from 'zod';
@@ -16,6 +17,7 @@ import {
   countCaseResults,
   existsNewerCompletedRun,
   getCaseResults,
+  getExistingCaseResultIds,
   getLatestSummariesByModel,
   getRunningRun,
   getRunWithModels,
@@ -33,7 +35,12 @@ import {
 import { gradeClassifierOutput, runDeciderCheck } from './grading';
 import { createOpenRouterClient } from './openrouter';
 import { buildRoutingTable } from './routing-table-builder';
-import { runDeciderCaseViaCli, warmUpCliContainer } from './cli-runner';
+import {
+  destroyDeciderCliContainer,
+  isRetryableContainerAvailabilityError,
+  runDeciderCaseViaCli,
+  warmUpCliContainer,
+} from './cli-runner';
 import { pickClassifierWinner } from './winner';
 
 export type BenchmarkJobMessage = {
@@ -41,9 +48,11 @@ export type BenchmarkJobMessage = {
   kind: BenchmarkKind;
   model: string;
   // The case ids this message is responsible for, plus the chunk index. Decider
-  // chunks also use this index to key their container instance.
+  // chunks are split across shard lanes; each lane has one stable container.
   caseIds?: string[];
   chunk?: number;
+  shard?: number;
+  shardCount?: number;
   // Repetition index (0-based).
   rep?: number;
 };
@@ -54,6 +63,8 @@ export const BenchmarkJobMessageSchema = z.object({
   model: z.string().min(1),
   caseIds: z.array(z.string().min(1)).optional(),
   chunk: z.number().int().min(0).optional(),
+  shard: z.number().int().min(0).optional(),
+  shardCount: z.number().int().min(1).optional(),
   rep: z.number().int().min(0).optional(),
 });
 
@@ -67,9 +78,13 @@ const DECIDER_CHUNK_SIZE = 5;
 // keep it below Cloudflare Queues' 15-minute wall-clock limit.
 const CLASSIFIER_CHUNK_SIZE = 1;
 
-// Cloudflare Queues caps a single sendBatch at 100 messages. A decider fan-out
-// is models × reps × ceil(76 / 5) messages, which clears 100 with as few as two
-// models, so the dispatch must be sliced.
+// Upper bound on live benchmark-runner containers (mirrors `max_instances` in
+// wrangler.jsonc). Sharded decider fan-out uses this as its container budget.
+export const DECIDER_CONTAINER_INSTANCE_CAP = 100;
+
+// Cloudflare Queues caps a single sendBatch at 100 messages. Classifier fan-out
+// can exceed that because each classifier case is its own message, so dispatch
+// must be sliced.
 const QUEUE_SEND_BATCH_LIMIT = 100;
 
 export function chunkArray<T>(items: readonly T[], size: number): T[][] {
@@ -80,6 +95,24 @@ export function chunkArray<T>(items: readonly T[], size: number): T[][] {
   return chunks;
 }
 
+export function computeDeciderShardCount({
+  modelCount,
+  repetitions,
+  chunkCount,
+  maxLiveContainers = DECIDER_CONTAINER_INSTANCE_CAP,
+}: {
+  modelCount: number;
+  repetitions: number;
+  chunkCount: number;
+  maxLiveContainers?: number;
+}): number {
+  if (modelCount <= 0 || repetitions <= 0 || chunkCount <= 0) return 0;
+  const modelRepetitions = modelCount * repetitions;
+  const shardsPerModelRepetition = Math.floor(maxLiveContainers / modelRepetitions);
+  if (shardsPerModelRepetition <= 0) return 0;
+  return Math.min(chunkCount, shardsPerModelRepetition);
+}
+
 // Enqueues messages in sendBatch-sized slices. A mid-dispatch failure leaves a
 // partially-enqueued run that can never reach its expected result count, so the
 // run is marked failed (surfacing in the admin panel) before the throw
@@ -138,36 +171,64 @@ export function computeEngineIdentity(kind: BenchmarkKind): string {
   const datasetSignature =
     kind === 'classifier'
       ? CLASSIFIER_CASES.map(c => ({ id: c.id, expected: c.expected }))
-      : DECIDER_CASES.map(c => ({ id: c.id, tier: c.tier, check: c.check }));
+      : DECIDER_CASES.map(c => ({
+          id: c.id,
+          taskType: c.taskType,
+          subtaskType: c.subtaskType,
+          check: c.check,
+        }));
   return `v${BENCHMARK_ENGINE_VERSION}:${fnv1aHex(JSON.stringify(datasetSignature))}`;
 }
 
-/** Pure helper: produces the sendBatch bodies for a decider run fan-out.
- * Extracted for unit-testability; the shape is models × reps × chunks messages.
+/** Pure helper: produces the initial sendBatch bodies for a decider run.
+ * Extracted for unit-testability; the shape is models × reps × shardCount
+ * messages. Later chunks are chained by processDeciderJob as each one completes.
  */
 export function buildDeciderMessages(
   runId: string,
   kind: BenchmarkKind,
   modelIds: string[],
   repetitions: number,
-  chunks: readonly (readonly { id: string }[])[]
+  chunks: readonly (readonly { id: string }[])[],
+  maxLiveContainers: number = DECIDER_CONTAINER_INSTANCE_CAP
 ): { body: BenchmarkJobMessage }[] {
+  const shardCount = computeDeciderShardCount({
+    modelCount: modelIds.length,
+    repetitions,
+    chunkCount: chunks.length,
+    maxLiveContainers,
+  });
+  if (shardCount === 0) return [];
   return modelIds.flatMap(model =>
     Array.from({ length: repetitions }, (_, rep) =>
-      chunks.map((chunkCases, chunk) => ({
-        body: {
-          runId,
-          kind,
-          model,
-          chunk,
-          rep,
-          caseIds: chunkCases.map(c => c.id),
-        } satisfies BenchmarkJobMessage,
-      }))
+      Array.from({ length: shardCount }, (_, shard) => {
+        const chunkCases = chunks[shard];
+        if (!chunkCases) return [];
+        return [
+          {
+            body: {
+              runId,
+              kind,
+              model,
+              chunk: shard,
+              shard,
+              shardCount,
+              rep,
+              caseIds: chunkCases.map(c => c.id),
+            } satisfies BenchmarkJobMessage,
+          },
+        ];
+      }).flat()
     ).flat()
   );
 }
 
+export function getDeciderContainerInstanceName(
+  message: Pick<BenchmarkJobMessage, 'runId' | 'model' | 'rep' | 'chunk' | 'shard'>
+): string {
+  return `${message.runId}:${message.model}:${message.rep ?? 0}:${message.shard ?? 0}`;
+}
+
 export function buildClassifierMessages(
   runId: string,
   modelIds: string[],
@@ -202,6 +263,33 @@ export class RunAlreadyActiveError extends Error {
   }
 }
 
+// Thrown when the saved benchmark config would exceed a hard runtime limit.
+// The admin route maps it to HTTP 400 so operators can fix config instead of
+// starting a run that will immediately hit platform capacity.
+export class BenchmarkRunConfigError extends Error {
+  constructor(message: string) {
+    super(message);
+    this.name = 'BenchmarkRunConfigError';
+  }
+}
+
+function validateDeciderContainerBudget({
+  modelCount,
+  repetitions,
+  maxLiveContainers,
+}: {
+  modelCount: number;
+  repetitions: number;
+  maxLiveContainers: number;
+}): void {
+  const modelRepetitions = modelCount * repetitions;
+  if (modelRepetitions <= maxLiveContainers) return;
+
+  throw new BenchmarkRunConfigError(
+    `decider benchmark requires at least one live container lane per model repetition (${modelRepetitions}), but maxConcurrency is ${maxLiveContainers}; reduce decider models/repetitions before starting`
+  );
+}
+
 export async function startRun(
   env: Env,
   kind: BenchmarkKind,
@@ -264,6 +352,14 @@ export async function startRun(
       'benchmark user not configured: set benchmarkUserId before running the decider benchmark'
     );
   }
+  const maxLiveDeciderContainers = Math.min(config.maxConcurrency, DECIDER_CONTAINER_INSTANCE_CAP);
+  if (kind === 'decider') {
+    validateDeciderContainerBudget({
+      modelCount: enqueuedModelIds.length,
+      repetitions,
+      maxLiveContainers: maxLiveDeciderContainers,
+    });
+  }
 
   const startedAt = new Date().toISOString();
   const runId = `${kind}-${startedAt.replace(/[:.]/g, '-')}`;
@@ -341,10 +437,18 @@ export async function startRun(
     return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
   }
 
-  // Decider: one message per (model, rep, chunk) so each queue invocation stays
-  // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES × repetitions rows.
+  // Decider: seed as many shard lanes as fit under the live-container cap. Each
+  // completed chunk enqueues the next chunk for the same lane, so one stable
+  // container handles chunk N, N+shardCount, N+(2*shardCount), ...
   const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
-  const messages = buildDeciderMessages(runId, kind, enqueuedModelIds, repetitions, chunks);
+  const messages = buildDeciderMessages(
+    runId,
+    kind,
+    enqueuedModelIds,
+    repetitions,
+    chunks,
+    maxLiveDeciderContainers
+  );
   await enqueueRunMessages(env, runId, messages);
   return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
 }
@@ -367,6 +471,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
   const message = parsed.data;
   const state = await getRunState(env, message.runId);
 
+  let shouldFinalize = true;
   if (message.kind === 'classifier') {
     if (!message.caseIds?.length || message.rep === undefined) {
       console.warn(
@@ -400,7 +505,7 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
             run_id: message.runId,
             model: message.model,
             case_id: benchCase.id,
-            tier: null,
+            route_key: null,
             score,
             latency_ms: Math.round(performance.now() - startedAt),
             cost_usd: result.cost,
@@ -423,10 +528,13 @@ export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
       }
     );
   } else {
-    await processDeciderJob(env, message, state);
+    const result = await processDeciderJob(env, message, state);
+    shouldFinalize = result.shouldFinalize;
   }
 
-  await finalizeRunIfComplete(env, message.runId, message.kind, state);
+  if (shouldFinalize) {
+    await finalizeRunIfComplete(env, message.runId, message.kind, state);
+  }
 }
 
 type RunState = {
@@ -461,15 +569,26 @@ async function processDeciderJob(
   env: Env,
   message: BenchmarkJobMessage,
   state: RunState
-): Promise<void> {
+): Promise<{ shouldFinalize: boolean }> {
   // Decider messages always carry their chunk's case ids; anything else is
   // malformed and dropped (same policy as unparseable messages).
   if (!message.caseIds?.length) {
     console.warn(JSON.stringify({ event: 'benchmark_job_missing_case_ids', runId: message.runId }));
-    return;
+    return { shouldFinalize: false };
   }
   const caseIds = new Set(message.caseIds);
   const cases = DECIDER_CASES.filter(c => caseIds.has(c.id));
+  if (cases.length === 0) {
+    console.warn(
+      JSON.stringify({
+        event: 'benchmark_job_empty_case_chunk',
+        runId: message.runId,
+        model: message.model,
+        chunk: message.chunk ?? 0,
+      })
+    );
+    return { shouldFinalize: false };
+  }
 
   if (!state.benchmarkUserId) {
     // startRun fails fast before enqueueing, so this only happens if the run
@@ -477,83 +596,163 @@ async function processDeciderJob(
     throw new Error(`run ${message.runId} has no benchmarkUserId`);
   }
 
-  // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
-  // queue retries the message. The token is never logged.
-  const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId);
   const rep = message.rep ?? 0;
-  const instanceName = `${message.runId}:${message.model}:${rep}:${message.chunk ?? 0}`;
+  const chunk = message.chunk ?? 0;
+  const shard = message.shard ?? 0;
+  const shardCount = message.shardCount ?? 1;
+  const instanceName = getDeciderContainerInstanceName(message);
+
+  const existingCaseIds = await getExistingCaseResultIds(env.BENCH_DB, {
+    runId: message.runId,
+    model: message.model,
+    rep,
+    caseIds: cases.map(c => c.id),
+  });
+  const casesToRun = cases.filter(c => !existingCaseIds.has(c.id));
 
   // Reasoning effort comes from the run snapshot (run_models row), not live config.
   const modelRow = state.models.find(m => m.model === message.model);
   const reasoningEffort = modelRow?.reasoning_effort ?? null;
 
-  // Fresh container instances run the CLI's one-time sqlite migration; the
-  // container owns that via its /warmup endpoint so the first real case
-  // doesn't burn its timeout on it. Failures are non-fatal: the first case
-  // simply absorbs whatever warmup work remains.
-  await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch(() => {});
-
-  // Concurrency 1: the CLI's sqlite state in the container is not safe under
-  // concurrent sessions (partial-migration crashes); the container serializes
-  // too, so higher concurrency here would only hold HTTP requests open.
-  await runCasesWithConcurrency(cases, 1, async benchCase => {
-    const startedAt = performance.now();
-    try {
-      let result = await runDeciderCaseViaCli(env, {
-        instanceName,
-        model: message.model,
-        benchCase,
-        kiloToken,
-        reasoningEffort,
-      });
-      // The CLI occasionally ends a session with no assistant text at all
-      // (transient empty completion: a lone step_finish with cost 0). Mirror
-      // the production classifier's policy and retry once.
-      let retried = false;
-      if (result.exitCode === 0 && result.text.length === 0) {
-        retried = true;
-        const retry = await runDeciderCaseViaCli(env, {
+  if (casesToRun.length > 0) {
+    // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
+    // queue retries the message. The token is never logged.
+    const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId);
+
+    // Fresh container instances run the CLI's one-time sqlite migration; the
+    // container owns that via its /warmup endpoint so the first real case
+    // doesn't burn its timeout on it. Ordinary warmup failures are non-fatal:
+    // the first case absorbs whatever warmup work remains. Container capacity
+    // failures are infrastructure pressure, so the queue retries the message.
+    await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch(
+      error => {
+        if (isRetryableContainerAvailabilityError(error)) throw error;
+      }
+    );
+
+    // Concurrency 1: the CLI's sqlite state in the container is not safe under
+    // concurrent sessions (partial-migration crashes); the container serializes
+    // too, so higher concurrency here would only hold HTTP requests open.
+    await runCasesWithConcurrency(casesToRun, 1, async benchCase => {
+      const startedAt = performance.now();
+      try {
+        let result = await runDeciderCaseViaCli(env, {
           instanceName,
           model: message.model,
           benchCase,
           kiloToken,
           reasoningEffort,
         });
-        retry.costUsd =
-          retry.costUsd === null && result.costUsd === null
-            ? null
-            : (retry.costUsd ?? 0) + (result.costUsd ?? 0);
-        result = retry;
+        // The CLI occasionally ends a session with no assistant text at all
+        // (transient empty completion: a lone step_finish with cost 0). Mirror
+        // the production classifier's policy and retry once.
+        let retried = false;
+        if (result.exitCode === 0 && result.text.length === 0) {
+          retried = true;
+          const retry = await runDeciderCaseViaCli(env, {
+            instanceName,
+            model: message.model,
+            benchCase,
+            kiloToken,
+            reasoningEffort,
+          });
+          retry.costUsd =
+            retry.costUsd === null && result.costUsd === null
+              ? null
+              : (retry.costUsd ?? 0) + (result.costUsd ?? 0);
+          result = retry;
+        }
+        const succeeded =
+          result.exitCode === 0 &&
+          result.text.length > 0 &&
+          runDeciderCheck(benchCase.check, result.text);
+        await upsertCaseResult(env.BENCH_DB, {
+          run_id: message.runId,
+          model: message.model,
+          case_id: benchCase.id,
+          route_key: taxonomyRouteKey(benchCase),
+          score: succeeded ? 1 : 0,
+          latency_ms: result.latencyMs,
+          cost_usd: result.costUsd,
+          error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null,
+          fallback_reason: null,
+          retried,
+          exit_code: result.exitCode,
+          output_prefix: result.text.slice(0, 200),
+          event_count: result.eventCount,
+          last_event_types: result.lastEventTypes.join(' '),
+          rep,
+          timed_out: result.timedOut ? 1 : 0,
+        });
+      } catch (error) {
+        if (isRetryableContainerAvailabilityError(error)) throw error;
+        await upsertCaseResult(
+          env.BENCH_DB,
+          failedRow(message, benchCase.id, taxonomyRouteKey(benchCase), startedAt, error, rep)
+        );
       }
-      const succeeded =
-        result.exitCode === 0 &&
-        result.text.length > 0 &&
-        runDeciderCheck(benchCase.check, result.text);
-      await upsertCaseResult(env.BENCH_DB, {
-        run_id: message.runId,
-        model: message.model,
-        case_id: benchCase.id,
-        tier: benchCase.tier,
-        score: succeeded ? 1 : 0,
-        latency_ms: result.latencyMs,
-        cost_usd: result.costUsd,
-        error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null,
-        fallback_reason: null,
-        retried,
-        exit_code: result.exitCode,
-        output_prefix: result.text.slice(0, 200),
-        event_count: result.eventCount,
-        last_event_types: result.lastEventTypes.join(' '),
-        rep,
-        timed_out: result.timedOut ? 1 : 0,
-      });
-    } catch (error) {
-      await upsertCaseResult(
-        env.BENCH_DB,
-        failedRow(message, benchCase.id, benchCase.tier, startedAt, error, rep)
+    });
+  }
+
+  const hasNextChunk = await enqueueNextDeciderChunkIfNeeded(
+    env,
+    message,
+    rep,
+    chunk,
+    shard,
+    shardCount
+  );
+  if (!hasNextChunk) {
+    await destroyDeciderCliContainer(env, { instanceName }).catch(error => {
+      console.warn(
+        JSON.stringify({
+          event: 'benchmark_container_destroy_failed',
+          instanceName,
+          ...formatError(error),
+        })
       );
-    }
+    });
+  }
+  return { shouldFinalize: !hasNextChunk };
+}
+
+async function enqueueNextDeciderChunkIfNeeded(
+  env: Env,
+  message: BenchmarkJobMessage,
+  rep: number,
+  chunk: number,
+  shard: number,
+  shardCount: number
+): Promise<boolean> {
+  const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
+  const nextChunkIndex = chunk + shardCount;
+  const nextChunk = chunks[nextChunkIndex];
+  if (!nextChunk) return false;
+
+  const nextCaseIds = nextChunk.map(c => c.id);
+  const existingNextCaseIds = await getExistingCaseResultIds(env.BENCH_DB, {
+    runId: message.runId,
+    model: message.model,
+    rep,
+    caseIds: nextCaseIds,
   });
+  if (existingNextCaseIds.size >= nextCaseIds.length) return true;
+
+  await env.BENCH_QUEUE.sendBatch([
+    {
+      body: {
+        runId: message.runId,
+        kind: 'decider',
+        model: message.model,
+        chunk: nextChunkIndex,
+        shard,
+        shardCount,
+        rep,
+        caseIds: nextCaseIds,
+      } satisfies BenchmarkJobMessage,
+    },
+  ]);
+  return true;
 }
 
 const TokenResponseSchema = z.object({ token: z.string().min(1), expiresAt: z.string() });
@@ -587,7 +786,7 @@ export async function fetchBenchmarkUserToken(env: Env, userId: string): Promise
 function failedRow(
   message: BenchmarkJobMessage,
   caseId: string,
-  tier: string | null,
+  routeKey: string | null,
   startedAt: number,
   error: unknown,
   rep: number = 0
@@ -596,7 +795,7 @@ function failedRow(
     run_id: message.runId,
     model: message.model,
     case_id: caseId,
-    tier,
+    route_key: routeKey,
     score: 0,
     latency_ms: Math.round(performance.now() - startedAt),
     cost_usd: null,
@@ -729,13 +928,12 @@ async function finalizeRunIfComplete(
 }
 
 export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] {
-  // Group by "model tier-key" using a plain reduce so this works in all runtimes.
-  // Classifier rows use '*' as the tier (no tiering); decider rows use the actual tier
-  // (falling back to '*' when tier is null).
+  // Group by "model route-key" using a plain reduce so this works in all runtimes.
+  // Classifier rows always use '*'; decider rows use route_key, or '*' when it is null.
   const groups = new Map<string, CaseResultRow[]>();
   for (const row of rows) {
-    const tierKey = kind === 'classifier' ? '*' : (row.tier ?? '*');
-    const key = `${row.model}\0${tierKey}`;
+    const routeKey = kind === 'classifier' ? '*' : (row.route_key ?? '*');
+    const key = `${row.model}\0${routeKey}`;
     const existing = groups.get(key);
     if (existing) {
       existing.push(row);
@@ -745,7 +943,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark
   }
 
   return [...groups.entries()].map(([key, group]) => {
-    const [model, tier] = key.split('\0');
+    const [model, routeKey] = key.split('\0');
     const latencies = group.map(r => r.latency_ms).toSorted((a, b) => a - b);
     const costs = group.filter(r => r.cost_usd !== null);
     const p95LatencyMs =
@@ -755,7 +953,7 @@ export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): Benchmark
         : null;
     return {
       model,
-      tier: tier as BenchmarkModelSummary['tier'],
+      routeKey: routeKey as BenchmarkModelSummary['routeKey'],
       accuracy: Number((group.reduce((a, r) => a + r.score, 0) / group.length).toFixed(4)),
       avgCostUsd: costs.length
         ? Number((costs.reduce((a, r) => a + (r.cost_usd ?? 0), 0) / costs.length).toFixed(8))
diff --git a/services/auto-routing-benchmark/src/winner.ts b/services/auto-routing-benchmark/src/winner.ts
index 318809c4a3..952b329bb3 100644
--- a/services/auto-routing-benchmark/src/winner.ts
+++ b/services/auto-routing-benchmark/src/winner.ts
@@ -1,6 +1,6 @@
 import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
 
-// Picks the best classifier candidate from summaries (tier '*') applying:
+// Picks the best classifier candidate from summaries (routeKey '*') applying:
 //   1. Accuracy gate: must meet minAccuracy.
 //   2. Optional p95 latency gate: when maxP95LatencyMs is non-null, prefer
 //      candidates whose measured p95 latency is within budget.
@@ -16,7 +16,7 @@ export function pickClassifierWinner(
   minAccuracy: number,
   maxP95LatencyMs: number | null = null
 ): BenchmarkModelSummary | null {
-  const graded = summaries.filter(s => s.tier === '*' && s.cases > 0);
+  const graded = summaries.filter(s => s.routeKey === '*' && s.cases > 0);
   if (graded.length === 0) return null;
   const cost = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY;
   const p95 = (s: BenchmarkModelSummary) => s.p95LatencyMs ?? Number.POSITIVE_INFINITY;
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
index 9faeb19ac4..c0433b1073 100644
--- a/services/auto-routing-benchmark/wrangler.jsonc
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -32,7 +32,7 @@
       "class_name": "BenchRunnerContainer",
       "image": "./container/Dockerfile",
       "instance_type": "standard-2",
-      "max_instances": 50,
+      "max_instances": 100,
     },
   ],
   "durable_objects": {
@@ -53,8 +53,9 @@
       {
         "queue": "auto-routing-benchmark-jobs",
         "max_batch_size": 1,
-        "max_retries": 2,
-        "max_concurrency": 4,
+        "max_retries": 6,
+        "retry_delay": 180,
+        "max_concurrency": 100,
         "dead_letter_queue": "auto-routing-benchmark-dlq",
       },
     ],
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index fd476a5668..bd89638137 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -254,7 +254,8 @@ function recordDecision(
       mode: ctx.payload.mode,
       uaPrefix: ctx.payload.userAgent?.slice(0, 40) ?? null,
       decidedModel: decision?.model ?? null,
-      decidedTier: decision?.tier ?? null,
+      decidedTaskType: decision?.taskType ?? null,
+      decidedSubtaskType: decision?.subtaskType ?? null,
       decisionSource: decision?.source ?? null,
       sticky: decision?.sticky ?? null,
       ...summary.details,
diff --git a/services/auto-routing/src/decision-cache.ts b/services/auto-routing/src/decision-cache.ts
index ae98778688..0aed20c63e 100644
--- a/services/auto-routing/src/decision-cache.ts
+++ b/services/auto-routing/src/decision-cache.ts
@@ -82,7 +82,7 @@ function entryKey(contentHash: string, classifierModel: string): string {
 
 // Single per-conversation slot remembering the last model the decision
 // engine served, so the session can stay on it (keeping the provider's
-// prompt cache warm) instead of ping-ponging when its tier oscillates.
+// prompt cache warm) instead of ping-ponging when its route oscillates.
 // Cannot collide with classification keys, which always contain a ':'.
 const STICKY_DECISION_KEY = 'sticky';
 
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
index b10fcc2e47..ab137ccd47 100644
--- a/services/auto-routing/src/decision-engine.test.ts
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -19,8 +19,8 @@ const table: RoutingTable = {
   minAccuracy: 0.7,
   switchCostFactor: 3,
   source: 'benchmark',
-  tiers: {
-    low: [
+  routes: {
+    'implementation/code_generation': [
       {
         model: 'cheap/chat',
         accuracy: 0.85,
@@ -47,7 +47,7 @@ const table: RoutingTable = {
         meetsThreshold: false,
       },
     ],
-    medium: [
+    'debugging/bug_fixing': [
       {
         model: 'mid/chat',
         accuracy: 0.8,
@@ -55,7 +55,7 @@ const table: RoutingTable = {
         meetsThreshold: true,
       },
     ],
-    high: [
+    'planning_design/system_design': [
       {
         model: 'big/chat',
         accuracy: 0.9,
@@ -67,59 +67,39 @@ const table: RoutingTable = {
 };
 
 describe('computeDecision', () => {
-  it('picks the first candidate of the tier', () => {
+  it('picks the first candidate of the classifier taxonomy route', () => {
     const decision = computeDecision(classification, table, null);
     expect(decision).toEqual({
       model: 'cheap/chat',
-      tier: 'low',
+      taskType: 'implementation',
+      subtaskType: 'code_generation',
       source: 'benchmark',
       tableVersion: 'run-1',
       reasoningEffort: null,
       sticky: false,
     });
   });
-  it('uses the tier derived from the classification', () => {
-    const hard: ClassifierOutput = {
+  it('uses the classifier task type and subtype directly', () => {
+    const debugging: ClassifierOutput = {
       ...classification,
-      reasoningComplexity: 'high',
-      contextComplexity: 'large',
-      executionMode: 'multi_step_project',
+      taskType: 'debugging',
+      subtaskType: 'bug_fixing',
     };
-    expect(computeDecision(hard, table, null)?.model).toBe('big/chat');
-  });
-  it('returns a decision for every tier of a valid table', () => {
-    const byTier: Array<[ClassifierOutput, string]> = [
-      [classification, 'cheap/chat'],
-      [
-        { ...classification, reasoningComplexity: 'medium', contextComplexity: 'medium' },
-        'mid/chat',
-      ],
-      [
-        {
-          ...classification,
-          reasoningComplexity: 'high',
-          contextComplexity: 'large',
-          executionMode: 'multi_step_project',
-        },
-        'big/chat',
-      ],
-    ];
-    for (const [input, expected] of byTier) {
-      expect(computeDecision(input, table, null)?.model).toBe(expected);
-    }
+    expect(computeDecision(debugging, table, null)?.model).toBe('mid/chat');
   });
   it('returns null when there is no routing table', () => {
     expect(computeDecision(classification, null, null)).toBeNull();
   });
 
   describe('session stickiness', () => {
-    it('keeps the incumbent on tier de-escalation when it is within the switch-cost factor', () => {
+    it('keeps the incumbent on route changes when it is within the switch-cost factor', () => {
       // Fresh pick cheap/chat at 0.002; mid/chat at 0.005 is not cheaper by
       // more than 3x (0.002 * 3 = 0.006 >= 0.005), so the session stays put.
       const decision = computeDecision(classification, table, 'mid/chat');
       expect(decision).toEqual({
         model: 'mid/chat',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'code_generation',
         source: 'benchmark',
         tableVersion: 'run-1',
         // The incumbent's benchmarked effort, not the fresh pick's.
@@ -132,11 +112,19 @@ describe('computeDecision', () => {
       // Integer costs avoid float noise on the equality case (1 * 3 === 3).
       const boundaryTable: RoutingTable = {
         ...table,
-        tiers: {
-          ...table.tiers,
-          low: [
-            { ...table.tiers.low[0]!, model: 'fresh/chat', avgCostUsd: 1 },
-            { ...table.tiers.low[1]!, model: 'incumbent/chat', avgCostUsd: 3 },
+        routes: {
+          ...table.routes,
+          'implementation/code_generation': [
+            {
+              ...table.routes['implementation/code_generation'][0]!,
+              model: 'fresh/chat',
+              avgCostUsd: 1,
+            },
+            {
+              ...table.routes['implementation/code_generation'][1]!,
+              model: 'incumbent/chat',
+              avgCostUsd: 3,
+            },
           ],
         },
       };
@@ -148,11 +136,11 @@ describe('computeDecision', () => {
       const decision = computeDecision(classification, table, 'pricey/chat');
       expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
     });
-    it('switches when the incumbent no longer meets the tier threshold', () => {
+    it('switches when the incumbent no longer meets the route threshold', () => {
       const decision = computeDecision(classification, table, 'weak/chat');
       expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
     });
-    it('serves the fresh pick when the incumbent is not in the tier', () => {
+    it('serves the fresh pick when the incumbent is not in the route', () => {
       const decision = computeDecision(classification, table, 'gone/model');
       expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
     });
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
index 0d641e069d..aaa7aba542 100644
--- a/services/auto-routing/src/decision-engine.ts
+++ b/services/auto-routing/src/decision-engine.ts
@@ -1,5 +1,5 @@
 import {
-  deriveDifficultyTier,
+  taxonomyRouteKey,
   type AutoRoutingDecision,
   type ClassifierOutput,
   type RoutingTable,
@@ -11,14 +11,13 @@ export function computeDecision(
   incumbentModel: string | null
 ): AutoRoutingDecision | null {
   if (!table) return null;
-  const tier = deriveDifficultyTier(classification);
-  const candidates = table.tiers[tier];
-  // A parsed table guarantees a non-empty tier (schema .min(1)), so with a
-  // table and a classification a decision always exists.
+  const routeKey = taxonomyRouteKey(classification);
+  const candidates = table.routes[routeKey];
+  if (!candidates?.length) return null;
   const freshPick = candidates[0];
 
   // Keep the session on its incumbent model when it is still good enough for
-  // the current tier. A model switch discards the provider's prompt cache,
+  // the current taxonomy route. A model switch discards the provider's prompt cache,
   // and rebuilding it costs full-price input tokens (4-10x cache-read rates)
   // on a context that dominates agent-session spend — so a switch is only
   // worth it when the fresh pick's recurring per-turn savings clearly exceed
@@ -33,7 +32,8 @@ export function computeDecision(
   ) {
     return {
       model: incumbent.model,
-      tier,
+      taskType: classification.taskType,
+      subtaskType: classification.subtaskType,
       source: table.source,
       tableVersion: table.version,
       reasoningEffort: incumbent.reasoningEffort ?? null,
@@ -43,7 +43,8 @@ export function computeDecision(
 
   return {
     model: freshPick.model,
-    tier,
+    taskType: classification.taskType,
+    subtaskType: classification.subtaskType,
     source: table.source,
     tableVersion: table.version,
     reasoningEffort: freshPick.reasoningEffort ?? null,
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 4519c7c310..220d443fbb 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -87,17 +87,15 @@ const benchmarkRoutingTable = {
   minAccuracy: 0.7,
   switchCostFactor: 3,
   source: 'benchmark',
-  tiers: {
-    low: [
+  routes: {
+    'implementation/feature_development': [
       {
         model: 'google/gemini-2.5-flash-lite',
         accuracy: 0.9,
-        avgCostUsd: 0.001,
+        avgCostUsd: 0.002,
         meetsThreshold: true,
         reasoningEffort: null,
       },
-    ],
-    medium: [
       {
         model: 'google/gemini-2.5-flash',
         accuracy: 0.85,
@@ -105,9 +103,9 @@ const benchmarkRoutingTable = {
         meetsThreshold: true,
         reasoningEffort: null,
       },
-      // The high-tier model also qualifies for medium, within the 3x
+      // The planning route's model also qualifies for implementation, within the 3x
       // switch-cost factor of the fresh pick (0.002 * 3 >= 0.005): a session
-      // de-escalating from high stays on it.
+      // moving from planning to implementation stays on it.
       {
         model: 'anthropic/claude-sonnet-4.6',
         accuracy: 0.8,
@@ -116,7 +114,7 @@ const benchmarkRoutingTable = {
         reasoningEffort: null,
       },
     ],
-    high: [
+    'planning_design/system_design': [
       {
         model: 'anthropic/claude-sonnet-4.6',
         accuracy: 0.8,
@@ -235,7 +233,8 @@ describe('auto routing worker', () => {
       cost: 0.00000123,
       decision: {
         model: expect.any(String),
-        tier: expect.stringMatching(/^(low|medium|high)$/),
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'bench-run-1',
         reasoningEffort: null,
@@ -300,7 +299,8 @@ describe('auto routing worker', () => {
       cost: 0,
       decision: {
         model: expect.any(String),
-        tier: expect.stringMatching(/^(low|medium|high)$/),
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'bench-run-1',
         reasoningEffort: null,
@@ -331,7 +331,7 @@ describe('auto routing worker', () => {
     );
   });
 
-  it('keeps the session on the incumbent model when the tier de-escalates', async () => {
+  it('keeps the session on the incumbent model when the taxonomy route changes', async () => {
     // Back the mocked DO stub with real storage so the sticky model written
     // by the first request is visible to the second.
     const store = new Map<string, unknown>();
@@ -344,19 +344,24 @@ describe('auto routing worker', () => {
       ...mockClassifierResult,
       classification: {
         ...mockClassification,
-        reasoningComplexity: 'high',
-        contextComplexity: 'large',
-        executionMode: 'multi_step_project',
+        taskType: 'planning_design',
+        subtaskType: 'system_design',
       },
     });
     const first = await decideRequest(mirrorPayload());
     expect(first.status).toBe(200);
     await expect(first.json()).resolves.toMatchObject({
-      decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'high', sticky: false },
+      decision: {
+        model: 'anthropic/claude-sonnet-4.6',
+        taskType: 'planning_design',
+        subtaskType: 'system_design',
+        sticky: false,
+      },
     });
+    store.set('sticky', { model: 'anthropic/claude-sonnet-4.6' });
 
-    // The second turn (different prompt, same session) classifies as medium.
-    // The fresh medium pick is cheaper, but not by more than the switch-cost
+    // The second turn (different prompt, same session) lands on the implementation route.
+    // The fresh implementation pick is cheaper, but not by more than the switch-cost
     // factor, so the session keeps its incumbent.
     const second = await decideRequest(
       mirrorPayload({
@@ -365,7 +370,12 @@ describe('auto routing worker', () => {
     );
     expect(second.status).toBe(200);
     await expect(second.json()).resolves.toMatchObject({
-      decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'medium', sticky: true },
+      decision: {
+        model: 'anthropic/claude-sonnet-4.6',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
+        sticky: true,
+      },
     });
   });
 
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
index be60e909ab..9b73d29235 100644
--- a/services/auto-routing/src/routing-table.test.ts
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -8,8 +8,8 @@ const SAMPLE_TABLE: RoutingTable = {
   minAccuracy: 0.7,
   switchCostFactor: 3,
   source: 'benchmark',
-  tiers: {
-    low: [
+  routes: {
+    'implementation/feature_development': [
       {
         model: 'google/gemini-2.5-flash-lite',
         accuracy: 0.9,
@@ -18,7 +18,7 @@ const SAMPLE_TABLE: RoutingTable = {
         reasoningEffort: null,
       },
     ],
-    medium: [
+    'debugging/bug_fixing': [
       {
         model: 'google/gemini-2.5-flash',
         accuracy: 0.85,
@@ -27,7 +27,7 @@ const SAMPLE_TABLE: RoutingTable = {
         reasoningEffort: null,
       },
     ],
-    high: [
+    'planning_design/system_design': [
       {
         model: 'anthropic/claude-sonnet-4.6',
         accuracy: 0.8,