Kilo-Org · iscekic · Jun 17, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
@@ -63,6 +63,7 @@ describe('configToFormState', () => {
     expect(state.classifierMaxP95LatencyMs).toBe('1000');
     expect(state.classifierModels).toBe('');
     expect(state.deciderModels).toEqual([]);
+    expect(state.maxConcurrency).toBe(100);
   });
 });
 

diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -126,7 +126,7 @@ export function configToFormState(config: BenchmarkConfig | null): {
       deciderModels: [],
       minAccuracy: 0.7,
       switchCostFactor: 3,
-      maxConcurrency: 4,
+      maxConcurrency: 100,
       benchmarkUserId: '',
       classifierRepetitions: 1,
       deciderRepetitions: 1,
@@ -407,13 +407,13 @@ function BenchmarkConfigEditor({
           </div>
           <div className="flex flex-col gap-1.5">
             <Label htmlFor="benchmark-max-concurrency" className="text-sm font-medium">
-              Max concurrency (1–16)
+              Max concurrency (1–100)
             </Label>
             <Input
               id="benchmark-max-concurrency"
               type="number"
               min={1}
-              max={16}
+              max={100}
               step={1}
               value={form.maxConcurrency}
               onChange={e =>
@@ -539,17 +539,13 @@ function BenchmarkConfigEditor({
 // Run summaries expandable table
 // ---------------------------------------------------------------------------
 
-const TIER_ORDER = { low: 0, medium: 1, high: 2, '*': 3 } as const;
-
 function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
   const isDecider = run.kind === 'decider';
 
   const sortedSummaries: BenchmarkModelSummary[] = isDecider
     ? [...run.summaries].sort((a, b) => {
-        const tierDiff =
-          (TIER_ORDER[a.tier as keyof typeof TIER_ORDER] ?? 3) -
-          (TIER_ORDER[b.tier as keyof typeof TIER_ORDER] ?? 3);
-        if (tierDiff !== 0) return tierDiff;
+        const routeDiff = a.routeKey.localeCompare(b.routeKey);
+        if (routeDiff !== 0) return routeDiff;
         return b.accuracy - a.accuracy;
       })
     : run.summaries;
@@ -571,7 +567,7 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
               <TableHeader>
                 <TableRow>
                   <TableHead className="text-xs">Model</TableHead>
-                  {isDecider ? <TableHead className="text-xs">Tier</TableHead> : null}
+                  {isDecider ? <TableHead className="text-xs">Route</TableHead> : null}
                   <TableHead className="text-right text-xs">Accuracy</TableHead>
                   <TableHead className="text-right text-xs">Avg cost</TableHead>
                   <TableHead className="text-right text-xs">Avg latency</TableHead>
@@ -584,10 +580,10 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
               </TableHeader>
               <TableBody>
                 {sortedSummaries.map((s, i) => (
-                  <TableRow key={`${s.model}-${s.tier}-${i}`}>
+                  <TableRow key={`${s.model}-${s.routeKey}-${i}`}>
                     <TableCell className="max-w-56 truncate font-mono text-xs">{s.model}</TableCell>
                     {isDecider ? (
-                      <TableCell className="text-xs capitalize">{s.tier}</TableCell>
+                      <TableCell className="font-mono text-xs">{s.routeKey}</TableCell>
                     ) : null}
                     <TableCell className="text-right tabular-nums text-xs">
                       {formatAccuracy(s.accuracy)}
@@ -717,11 +713,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
   }
 
   const { table } = data;
-  const tierEntries = [
-    { tier: 'low', candidates: table.tiers.low },
-    { tier: 'medium', candidates: table.tiers.medium },
-    { tier: 'high', candidates: table.tiers.high },
-  ] as const;
+  const routeEntries = Object.entries(table.routes).sort(([a], [b]) => a.localeCompare(b));
 
   return (
     <div className="flex flex-col gap-3">
@@ -736,9 +728,9 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
         </span>
       </div>
 
-      {tierEntries.map(({ tier, candidates }) => (
-        <div key={tier}>
-          <p className="text-sm font-medium capitalize mb-1.5">{tier} tier</p>
+      {routeEntries.map(([routeKey, candidates]) => (
+        <div key={routeKey}>
+          <p className="mb-1.5 font-mono text-sm font-medium">{routeKey}</p>
           <div className="overflow-x-auto rounded-md border">
             <Table className="min-w-max">
               <TableHeader>
@@ -751,7 +743,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
               </TableHeader>
               <TableBody>
                 {candidates.map((c, i) => (
-                  <TableRow key={`${tier}-${c.model}-${i}`}>
+                  <TableRow key={`${routeKey}-${c.model}-${i}`}>
                     <TableCell className="max-w-56 truncate font-mono text-xs">{c.model}</TableCell>
                     <TableCell className="text-right tabular-nums text-xs">
                       {formatAccuracy(c.accuracy)}

diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
@@ -447,7 +447,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'v1',
         sticky: false,
@@ -481,7 +482,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark' as const,
         tableVersion: 'v1',
         sticky: false,
@@ -510,7 +512,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'v1',
         sticky: false,
@@ -560,7 +563,8 @@ describe('kilo-auto/efficient classifier billing', () => {
     mockedFetchEfficientAutoDecision.mockResolvedValue({
       decision: {
         model: 'anthropic/claude-haiku-4',
-        tier: 'low',
+        taskType: 'implementation',
+        subtaskType: 'feature_development',
         source: 'benchmark',
         tableVersion: 'v1',
         sticky: false,

diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -25,7 +25,8 @@ const zeroBalancePromise = Promise.resolve(0);
 
 const sampleDecision: AutoRoutingDecision = {
   model: 'anthropic/claude-haiku-4',
-  tier: 'low',
+  taskType: 'implementation',
+  subtaskType: 'feature_development',
   source: 'benchmark',
   tableVersion: 'v1',
   sticky: false,

diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -47,7 +47,8 @@ const options = {
 
 const validDecision = {
   model: 'anthropic/claude-haiku-4',
-  tier: 'low' as const,
+  taskType: 'implementation' as const,
+  subtaskType: 'feature_development' as const,
   source: 'benchmark' as const,
   tableVersion: 'v1',
   sticky: false,

diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
@@ -1,9 +1,10 @@
 import * as z from 'zod';
 import { RoutingTableSchema } from './routing-table';
-import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import { TaxonomyRouteKeySchema } from './taxonomy';
 
-export { ReasoningEffortSchema } from './tiers';
-export type { ReasoningEffort } from './tiers';
+export { ReasoningEffortSchema } from './reasoning';
+export type { ReasoningEffort } from './reasoning';
 
 export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
 export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
@@ -39,15 +40,16 @@ export const BenchmarkConfigSchema = z
   .object({
     classifierModels: z.array(z.string().trim().min(1)).min(1),
     deciderModels: z.array(BenchmarkDeciderModelSchema).min(1),
-    // Accuracy threshold for "gets the job done" (per tier).
+    // Accuracy threshold for "gets the job done" (per taxonomy route).
     minAccuracy: z.number().min(0).max(1),
-    // Parallel OpenRouter calls per queue message.
-    maxConcurrency: z.number().int().min(1).max(16),
+    // Benchmark-wide parallelism budget. Decider runs use it as a live
+    // container budget; classifier runs use it for parallel OpenRouter calls.
+    maxConcurrency: z.number().int().min(1).max(100),
     // The Kilo user whose identity/billing the decider CLI runs execute under.
     // Null until an admin configures it; decider runs fail fast while null.
     benchmarkUserId: z.string().trim().min(1).nullable(),
     // Session stickiness knob carried into published routing tables: a session
-    // stays on its incumbent model while it meets the tier's accuracy
+    // stays on its incumbent model while it meets the route's accuracy
     // threshold, unless the fresh pick is cheaper by more than this factor.
     // Model switches discard provider prompt caches (cache reads are far
     // cheaper than fresh input tokens), so switching only pays off when the
@@ -79,8 +81,8 @@ export type BenchmarkRunStatus = z.infer<typeof BenchmarkRunStatusSchema>;
 
 export const BenchmarkModelSummarySchema = z.object({
   model: z.string(),
-  // '*' for classifier runs (no tiering), otherwise the difficulty tier.
-  tier: z.union([DifficultyTierSchema, z.literal('*')]),
+  // '*' for classifier runs, otherwise "<taskType>/<subtaskType>".
+  routeKey: z.union([TaxonomyRouteKeySchema, z.literal('*')]),
   accuracy: z.number(),
   avgCostUsd: z.number().nullable(),
   avgLatencyMs: z.number(),

diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts
@@ -147,6 +147,36 @@ describe('BenchmarkConfigSchema defaults', () => {
     expect(result.deciderRepetitions).toBe(1);
     expect(result.classifierMaxP95LatencyMs).toBe(1000);
   });
+
+  // Upper boundary of maxConcurrency: 100 is the largest value the schema accepts.
+  it('accepts the benchmark maximum concurrency cap of 100', () => {
+    const result = BenchmarkConfigSchema.safeParse({
+      classifierModels: ['model/a'],
+      deciderModels: [{ id: 'model/b' }],
+      minAccuracy: 0.8,
+      maxConcurrency: 100,
+      benchmarkUserId: null,
+      switchCostFactor: 2,
+      updatedAt: null,
+      updatedBy: null,
+    });
+    expect(result.success).toBe(true);
+  });
+
+  // Same payload as above with maxConcurrency one past the cap, so only the upper bound can fail.
+  it('rejects a maximum concurrency above the cap of 100', () => {
+    const result = BenchmarkConfigSchema.safeParse({
+      classifierModels: ['model/a'],
+      deciderModels: [{ id: 'model/b' }],
+      minAccuracy: 0.8,
+      maxConcurrency: 101,
+      benchmarkUserId: null,
+      switchCostFactor: 2,
+      updatedAt: null,
+      updatedBy: null,
+    });
+    expect(result.success).toBe(false);
+  });
 });
 
 describe('BenchmarkConfigSchema duplicate model ids', () => {

diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
@@ -1,6 +1,12 @@
 import * as z from 'zod';
 import { NormalizedClassifierInputSchema } from './input';
-import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+import { ReasoningEffortSchema } from './reasoning';
+import {
+  ClassifierSubtaskTypeSchema,
+  ClassifierTaskTypeSchema,
+  SUBTYPES_BY_TASK_TYPE,
+  type ClassifierSubtaskType,
+} from './taxonomy';
 
 export {
   NormalizedClassifierInputSchema,
@@ -29,47 +35,6 @@ export const MirrorPayloadSchema = z.object({
 });
 export type MirrorPayload = z.infer<typeof MirrorPayloadSchema>;
 
-export const ClassifierTaskTypeSchema = z.enum([
-  'implementation',
-  'debugging',
-  'refactoring',
-  'planning_design',
-  'investigation',
-  'agentic_execution',
-]);
-export type ClassifierTaskType = z.infer<typeof ClassifierTaskTypeSchema>;
-
-export const ClassifierSubtaskTypeSchema = z.enum([
-  'feature_development',
-  'code_generation',
-  'test_creation',
-  'bug_fixing',
-  'test_repair',
-  'root_cause_analysis',
-  'code_cleanup',
-  'architecture_improvement',
-  'migration',
-  'architecture_design',
-  'technical_planning',
-  'system_design',
-  'repo_exploration',
-  'codebase_understanding',
-  'external_research',
-  'tool_usage',
-  'terminal_operations',
-  'multi_step_execution',
-]);
-export type ClassifierSubtaskType = z.infer<typeof ClassifierSubtaskTypeSchema>;
-
-const subtypesByTaskType: Record<ClassifierTaskType, readonly ClassifierSubtaskType[]> = {
-  implementation: ['feature_development', 'code_generation', 'test_creation'],
-  debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'],
-  refactoring: ['code_cleanup', 'architecture_improvement', 'migration'],
-  planning_design: ['architecture_design', 'technical_planning', 'system_design'],
-  investigation: ['repo_exploration', 'codebase_understanding', 'external_research'],
-  agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'],
-};
-
 export const ClassifierOutputSchema = z
   .strictObject({
     taskType: ClassifierTaskTypeSchema,
@@ -87,7 +52,10 @@ export const ClassifierOutputSchema = z
     confidence: z.number().min(0).max(1),
   })
   .superRefine((output, ctx) => {
-    if (!subtypesByTaskType[output.taskType].includes(output.subtaskType)) {
+    const allowedSubtypes = SUBTYPES_BY_TASK_TYPE[
+      output.taskType
+    ] as readonly ClassifierSubtaskType[];
+    if (!allowedSubtypes.includes(output.subtaskType)) {
       ctx.addIssue({
         code: 'custom',
         path: ['subtaskType'],
@@ -99,7 +67,8 @@ export type ClassifierOutput = z.infer<typeof ClassifierOutputSchema>;
 
 export const AutoRoutingDecisionSchema = z.object({
   model: z.string(),
-  tier: DifficultyTierSchema,
+  taskType: ClassifierTaskTypeSchema,
+  subtaskType: ClassifierSubtaskTypeSchema,
   source: z.enum(['benchmark']),
   tableVersion: z.string(),
   // Mirrors the effort the chosen model was benchmarked with, when set.
@@ -180,6 +149,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer<
 
 export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize';
 
-export * from './tiers';
+export * from './reasoning';
+export * from './taxonomy';
 export * from './routing-table';
 export * from './benchmark';
diff --git a/packages/auto-routing-contracts/src/reasoning.ts b/packages/auto-routing-contracts/src/reasoning.ts
@@ -0,0 +1,4 @@
+import * as z from 'zod';
+
+export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']); // zod enum: accepts exactly these four strings
+export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>; // type inferred from the schema, so the two cannot drift apart
diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts
@@ -9,12 +9,16 @@ const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({
 });
 
 describe('rankCandidates', () => {
-  it('puts the cheapest above-threshold candidate first', () => {
+  it('puts the lowest cost-per-accuracy above-threshold candidate first', () => {
     const ranked = rankCandidates(
-      [candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)],
+      [
+        candidate('lower-raw-cost', 0.7, 0.007),
+        candidate('better-value', 0.9, 0.008),
+        candidate('weak', 0.5, 0.001),
+      ],
       0.7
     );
-    expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']);
+    expect(ranked.map(c => c.model)).toEqual(['better-value', 'lower-raw-cost', 'weak']);
     expect(ranked[0].meetsThreshold).toBe(true);
     expect(ranked[2].meetsThreshold).toBe(false);
   });
@@ -29,15 +33,35 @@ describe('rankCandidates', () => {
 });
 
 describe('RoutingTableSchema', () => {
-  it('requires at least one candidate per tier', () => {
+  it('requires at least one candidate per taxonomy route', () => {
     expect(
       RoutingTableSchema.safeParse({
         version: 'v',
         generatedAt: new Date(0).toISOString(),
         minAccuracy: 0.7,
+        switchCostFactor: 3,
         source: 'benchmark',
-        tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] },
+        routes: {
+          'implementation/code_generation': [],
+          'debugging/bug_fixing': [candidate('m', 1, 1)],
+        },
       }).success
     ).toBe(false);
   });
+
+  it('accepts a table routed by classifier taxonomy pair', () => {
+    const parsed = RoutingTableSchema.parse({
+      version: 'v',
+      generatedAt: new Date(0).toISOString(),
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      source: 'benchmark',
+      routes: {
+        'implementation/code_generation': [candidate('impl', 0.9, 1)],
+        'debugging/bug_fixing': [candidate('debug', 0.9, 1)],
+      }, // only two "<taskType>/<subtaskType>" keys are supplied; the test expects this to parse
+    }); // parse() throws on invalid input, so getting past this call is itself the acceptance check
+
+    expect(parsed.routes['implementation/code_generation']?.[0]?.model).toBe('impl');
+  });
 });