Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ describe('configToFormState', () => {
expect(state.classifierMaxP95LatencyMs).toBe('1000');
expect(state.classifierModels).toBe('');
expect(state.deciderModels).toEqual([]);
expect(state.maxConcurrency).toBe(100);
});
});

Expand Down
34 changes: 13 additions & 21 deletions apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ export function configToFormState(config: BenchmarkConfig | null): {
deciderModels: [],
minAccuracy: 0.7,
switchCostFactor: 3,
maxConcurrency: 4,
maxConcurrency: 100,
benchmarkUserId: '',
classifierRepetitions: 1,
deciderRepetitions: 1,
Expand Down Expand Up @@ -407,13 +407,13 @@ function BenchmarkConfigEditor({
</div>
<div className="flex flex-col gap-1.5">
<Label htmlFor="benchmark-max-concurrency" className="text-sm font-medium">
Max concurrency (1–16)
Max concurrency (1–100)
</Label>
<Input
id="benchmark-max-concurrency"
type="number"
min={1}
max={16}
max={100}
step={1}
value={form.maxConcurrency}
onChange={e =>
Expand Down Expand Up @@ -539,17 +539,13 @@ function BenchmarkConfigEditor({
// Run summaries expandable table
// ---------------------------------------------------------------------------

const TIER_ORDER = { low: 0, medium: 1, high: 2, '*': 3 } as const;

function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
const isDecider = run.kind === 'decider';

const sortedSummaries: BenchmarkModelSummary[] = isDecider
? [...run.summaries].sort((a, b) => {
const tierDiff =
(TIER_ORDER[a.tier as keyof typeof TIER_ORDER] ?? 3) -
(TIER_ORDER[b.tier as keyof typeof TIER_ORDER] ?? 3);
if (tierDiff !== 0) return tierDiff;
const routeDiff = a.routeKey.localeCompare(b.routeKey);
if (routeDiff !== 0) return routeDiff;
return b.accuracy - a.accuracy;
})
: run.summaries;
Expand All @@ -571,7 +567,7 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
<TableHeader>
<TableRow>
<TableHead className="text-xs">Model</TableHead>
{isDecider ? <TableHead className="text-xs">Tier</TableHead> : null}
{isDecider ? <TableHead className="text-xs">Route</TableHead> : null}
<TableHead className="text-right text-xs">Accuracy</TableHead>
<TableHead className="text-right text-xs">Avg cost</TableHead>
<TableHead className="text-right text-xs">Avg latency</TableHead>
Expand All @@ -584,10 +580,10 @@ function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
</TableHeader>
<TableBody>
{sortedSummaries.map((s, i) => (
<TableRow key={`${s.model}-${s.tier}-${i}`}>
<TableRow key={`${s.model}-${s.routeKey}-${i}`}>
<TableCell className="max-w-56 truncate font-mono text-xs">{s.model}</TableCell>
{isDecider ? (
<TableCell className="text-xs capitalize">{s.tier}</TableCell>
<TableCell className="font-mono text-xs">{s.routeKey}</TableCell>
) : null}
<TableCell className="text-right tabular-nums text-xs">
{formatAccuracy(s.accuracy)}
Expand Down Expand Up @@ -717,11 +713,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
}

const { table } = data;
const tierEntries = [
{ tier: 'low', candidates: table.tiers.low },
{ tier: 'medium', candidates: table.tiers.medium },
{ tier: 'high', candidates: table.tiers.high },
] as const;
const routeEntries = Object.entries(table.routes).sort(([a], [b]) => a.localeCompare(b));

return (
<div className="flex flex-col gap-3">
Expand All @@ -736,9 +728,9 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
</span>
</div>

{tierEntries.map(({ tier, candidates }) => (
<div key={tier}>
<p className="text-sm font-medium capitalize mb-1.5">{tier} tier</p>
{routeEntries.map(([routeKey, candidates]) => (
<div key={routeKey}>
<p className="mb-1.5 font-mono text-sm font-medium">{routeKey}</p>
<div className="overflow-x-auto rounded-md border">
<Table className="min-w-max">
<TableHeader>
Expand All @@ -751,7 +743,7 @@ function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
</TableHeader>
<TableBody>
{candidates.map((c, i) => (
<TableRow key={`${tier}-${c.model}-${i}`}>
<TableRow key={`${routeKey}-${c.model}-${i}`}>
<TableCell className="max-w-56 truncate font-mono text-xs">{c.model}</TableCell>
<TableCell className="text-right tabular-nums text-xs">
{formatAccuracy(c.accuracy)}
Expand Down
12 changes: 8 additions & 4 deletions apps/web/src/app/api/openrouter/[...path]/route.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
tier: 'low',
taskType: 'implementation',
subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
Expand Down Expand Up @@ -481,7 +482,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
tier: 'low',
taskType: 'implementation',
subtaskType: 'feature_development',
source: 'benchmark' as const,
tableVersion: 'v1',
sticky: false,
Expand Down Expand Up @@ -510,7 +512,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
tier: 'low',
taskType: 'implementation',
subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
Expand Down Expand Up @@ -560,7 +563,8 @@ describe('kilo-auto/efficient classifier billing', () => {
mockedFetchEfficientAutoDecision.mockResolvedValue({
decision: {
model: 'anthropic/claude-haiku-4',
tier: 'low',
taskType: 'implementation',
subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
Expand Down
3 changes: 2 additions & 1 deletion apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ const zeroBalancePromise = Promise.resolve(0);

const sampleDecision: AutoRoutingDecision = {
model: 'anthropic/claude-haiku-4',
tier: 'low',
taskType: 'implementation',
subtaskType: 'feature_development',
source: 'benchmark',
tableVersion: 'v1',
sticky: false,
Expand Down
3 changes: 2 additions & 1 deletion apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ const options = {

const validDecision = {
model: 'anthropic/claude-haiku-4',
tier: 'low' as const,
taskType: 'implementation' as const,
subtaskType: 'feature_development' as const,
source: 'benchmark' as const,
tableVersion: 'v1',
sticky: false,
Expand Down
20 changes: 11 additions & 9 deletions packages/auto-routing-contracts/src/benchmark.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import * as z from 'zod';
import { RoutingTableSchema } from './routing-table';
import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
import { ReasoningEffortSchema } from './reasoning';
import { TaxonomyRouteKeySchema } from './taxonomy';

export { ReasoningEffortSchema } from './tiers';
export type { ReasoningEffort } from './tiers';
export { ReasoningEffortSchema } from './reasoning';
export type { ReasoningEffort } from './reasoning';

export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
Expand Down Expand Up @@ -39,15 +40,16 @@ export const BenchmarkConfigSchema = z
.object({
classifierModels: z.array(z.string().trim().min(1)).min(1),
deciderModels: z.array(BenchmarkDeciderModelSchema).min(1),
// Accuracy threshold for "gets the job done" (per tier).
// Accuracy threshold for "gets the job done" (per taxonomy route).
minAccuracy: z.number().min(0).max(1),
// Parallel OpenRouter calls per queue message.
maxConcurrency: z.number().int().min(1).max(16),
// Benchmark-wide parallelism budget. Decider runs use it as a live
// container budget; classifier runs use it for parallel OpenRouter calls.
maxConcurrency: z.number().int().min(1).max(100),
// The Kilo user whose identity/billing the decider CLI runs execute under.
// Null until an admin configures it; decider runs fail fast while null.
benchmarkUserId: z.string().trim().min(1).nullable(),
// Session stickiness knob carried into published routing tables: a session
// stays on its incumbent model while it meets the tier's accuracy
// stays on its incumbent model while it meets the route's accuracy
// threshold, unless the fresh pick is cheaper by more than this factor.
// Model switches discard provider prompt caches (cache reads are far
// cheaper than fresh input tokens), so switching only pays off when the
Expand Down Expand Up @@ -79,8 +81,8 @@ export type BenchmarkRunStatus = z.infer<typeof BenchmarkRunStatusSchema>;

export const BenchmarkModelSummarySchema = z.object({
model: z.string(),
// '*' for classifier runs (no tiering), otherwise the difficulty tier.
tier: z.union([DifficultyTierSchema, z.literal('*')]),
// '*' for classifier runs, otherwise "<taskType>/<subtaskType>".
routeKey: z.union([TaxonomyRouteKeySchema, z.literal('*')]),
accuracy: z.number(),
avgCostUsd: z.number().nullable(),
avgLatencyMs: z.number(),
Expand Down
14 changes: 14 additions & 0 deletions packages/auto-routing-contracts/src/contracts.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,20 @@ describe('BenchmarkConfigSchema defaults', () => {
expect(result.deciderRepetitions).toBe(1);
expect(result.classifierMaxP95LatencyMs).toBe(1000);
});

// Boundary check: a config whose maxConcurrency is exactly 100 must parse successfully.
it('accepts the benchmark maximum concurrency cap of 100', () => {
  const configAtCap = {
    classifierModels: ['model/a'],
    deciderModels: [{ id: 'model/b' }],
    minAccuracy: 0.8,
    maxConcurrency: 100,
    benchmarkUserId: null,
    switchCostFactor: 2,
    updatedAt: null,
    updatedBy: null,
  };

  const { success } = BenchmarkConfigSchema.safeParse(configAtCap);

  expect(success).toBe(true);
});
});

describe('BenchmarkConfigSchema duplicate model ids', () => {
Expand Down
60 changes: 15 additions & 45 deletions packages/auto-routing-contracts/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import * as z from 'zod';
import { NormalizedClassifierInputSchema } from './input';
import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
import { ReasoningEffortSchema } from './reasoning';
import {
ClassifierSubtaskTypeSchema,
ClassifierTaskTypeSchema,
SUBTYPES_BY_TASK_TYPE,
type ClassifierSubtaskType,
} from './taxonomy';

export {
NormalizedClassifierInputSchema,
Expand Down Expand Up @@ -29,47 +35,6 @@ export const MirrorPayloadSchema = z.object({
});
export type MirrorPayload = z.infer<typeof MirrorPayloadSchema>;

export const ClassifierTaskTypeSchema = z.enum([
'implementation',
'debugging',
'refactoring',
'planning_design',
'investigation',
'agentic_execution',
]);
export type ClassifierTaskType = z.infer<typeof ClassifierTaskTypeSchema>;

export const ClassifierSubtaskTypeSchema = z.enum([
'feature_development',
'code_generation',
'test_creation',
'bug_fixing',
'test_repair',
'root_cause_analysis',
'code_cleanup',
'architecture_improvement',
'migration',
'architecture_design',
'technical_planning',
'system_design',
'repo_exploration',
'codebase_understanding',
'external_research',
'tool_usage',
'terminal_operations',
'multi_step_execution',
]);
export type ClassifierSubtaskType = z.infer<typeof ClassifierSubtaskTypeSchema>;

const subtypesByTaskType: Record<ClassifierTaskType, readonly ClassifierSubtaskType[]> = {
implementation: ['feature_development', 'code_generation', 'test_creation'],
debugging: ['bug_fixing', 'test_repair', 'root_cause_analysis'],
refactoring: ['code_cleanup', 'architecture_improvement', 'migration'],
planning_design: ['architecture_design', 'technical_planning', 'system_design'],
investigation: ['repo_exploration', 'codebase_understanding', 'external_research'],
agentic_execution: ['tool_usage', 'terminal_operations', 'multi_step_execution'],
};

export const ClassifierOutputSchema = z
.strictObject({
taskType: ClassifierTaskTypeSchema,
Expand All @@ -87,7 +52,10 @@ export const ClassifierOutputSchema = z
confidence: z.number().min(0).max(1),
})
.superRefine((output, ctx) => {
if (!subtypesByTaskType[output.taskType].includes(output.subtaskType)) {
const allowedSubtypes = SUBTYPES_BY_TASK_TYPE[
output.taskType
] as readonly ClassifierSubtaskType[];
if (!allowedSubtypes.includes(output.subtaskType)) {
ctx.addIssue({
code: 'custom',
path: ['subtaskType'],
Expand All @@ -99,7 +67,8 @@ export type ClassifierOutput = z.infer<typeof ClassifierOutputSchema>;

export const AutoRoutingDecisionSchema = z.object({
model: z.string(),
tier: DifficultyTierSchema,
taskType: ClassifierTaskTypeSchema,
subtaskType: ClassifierSubtaskTypeSchema,
source: z.enum(['benchmark']),
tableVersion: z.string(),
// Mirrors the effort the chosen model was benchmarked with, when set.
Expand Down Expand Up @@ -180,6 +149,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer<

export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize';

export * from './tiers';
export * from './reasoning';
export * from './taxonomy';
export * from './routing-table';
export * from './benchmark';
4 changes: 4 additions & 0 deletions packages/auto-routing-contracts/src/reasoning.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import * as z from 'zod';

/** Accepted reasoning-effort labels, listed from 'minimal' to 'high'. */
const REASONING_EFFORT_VALUES = ['minimal', 'low', 'medium', 'high'] as const;

/** Zod enum that validates a value against the accepted reasoning-effort labels. */
export const ReasoningEffortSchema = z.enum(REASONING_EFFORT_VALUES);

/** Union of the string literals accepted by {@link ReasoningEffortSchema}. */
export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
34 changes: 29 additions & 5 deletions packages/auto-routing-contracts/src/routing-table.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@ const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({
});

describe('rankCandidates', () => {
it('puts the cheapest above-threshold candidate first', () => {
it('puts the lowest cost-per-accuracy above-threshold candidate first', () => {
const ranked = rankCandidates(
[candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)],
[
candidate('lower-raw-cost', 0.7, 0.007),
candidate('better-value', 0.9, 0.008),
candidate('weak', 0.5, 0.001),
],
0.7
);
expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']);
expect(ranked.map(c => c.model)).toEqual(['better-value', 'lower-raw-cost', 'weak']);
expect(ranked[0].meetsThreshold).toBe(true);
expect(ranked[2].meetsThreshold).toBe(false);
});
Expand All @@ -29,15 +33,35 @@ describe('rankCandidates', () => {
});

describe('RoutingTableSchema', () => {
it('requires at least one candidate per tier', () => {
it('requires at least one candidate per taxonomy route', () => {
expect(
RoutingTableSchema.safeParse({
version: 'v',
generatedAt: new Date(0).toISOString(),
minAccuracy: 0.7,
switchCostFactor: 3,
source: 'benchmark',
tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] },
routes: {
'implementation/code_generation': [],
'debugging/bug_fixing': [candidate('m', 1, 1)],
},
}).success
).toBe(false);
});

// A table keyed by "<taskType>/<subtaskType>" route keys must parse, and the first
// candidate of the implementation route must be readable from the parsed result.
it('accepts a table routed by classifier taxonomy pair', () => {
  const implementationRoute = 'implementation/code_generation';
  const tableInput = {
    version: 'v',
    generatedAt: new Date(0).toISOString(),
    minAccuracy: 0.7,
    switchCostFactor: 3,
    source: 'benchmark',
    routes: {
      [implementationRoute]: [candidate('impl', 0.9, 1)],
      'debugging/bug_fixing': [candidate('debug', 0.9, 1)],
    },
  };

  const { routes } = RoutingTableSchema.parse(tableInput);
  const topCandidate = routes[implementationRoute]?.[0];

  expect(topCandidate?.model).toBe('impl');
});
});
Loading
Loading