From 5dd959ac6b5622124e868c32dcca15b8c81e424e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Wed, 17 Jun 2026 18:36:50 +0200 Subject: [PATCH 1/5] feat(auto-routing): auto-sync decider benchmark models --- .../auto-routing/BenchmarksSection.test.ts | 44 +- .../admin/auto-routing/BenchmarksSection.tsx | 137 +++- .../decider-candidates/route.test.ts | 50 ++ .../decider-candidates/route.ts | 31 + .../auto-routing-decider-candidates.test.ts | 60 ++ .../auto-routing-decider-candidates.ts | 69 ++ .../auto-routing-contracts/src/benchmark.ts | 43 + .../src/contracts.test.ts | 23 + .../migrations/0002_magical_wendell_rand.sql | 10 + .../migrations/meta/0002_snapshot.json | 734 ++++++++++++++++++ .../migrations/meta/_journal.json | 7 + .../auto-routing-benchmark/src/admin.test.ts | 18 +- .../src/auto-decider-sync.test.ts | 142 ++++ .../src/auto-decider-sync.ts | 130 ++++ .../auto-routing-benchmark/src/config.test.ts | 36 +- services/auto-routing-benchmark/src/config.ts | 58 +- .../auto-routing-benchmark/src/db-schema.ts | 11 + services/auto-routing-benchmark/src/db.ts | 45 +- services/auto-routing-benchmark/src/index.ts | 11 + .../auto-routing-benchmark/wrangler.jsonc | 1 + 20 files changed, 1626 insertions(+), 34 deletions(-) create mode 100644 apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.test.ts create mode 100644 apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.ts create mode 100644 apps/web/src/lib/model-stats/auto-routing-decider-candidates.test.ts create mode 100644 apps/web/src/lib/model-stats/auto-routing-decider-candidates.ts create mode 100644 services/auto-routing-benchmark/migrations/0002_magical_wendell_rand.sql create mode 100644 services/auto-routing-benchmark/migrations/meta/0002_snapshot.json create mode 100644 services/auto-routing-benchmark/src/auto-decider-sync.test.ts create mode 100644 services/auto-routing-benchmark/src/auto-decider-sync.ts diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts index 8796256337..059275664d 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts @@ -1,9 +1,11 @@ import { describe, expect, it } from '@jest/globals'; +import type { BenchmarkConfig } from '@kilocode/auto-routing-contracts'; import React from 'react'; import { renderToStaticMarkup } from 'react-dom/server'; import { configToFormState, costPerAccuracy, + effectiveDeciderModels, formatCostPerAccuracy, formatAccuracy, formatUsd, @@ -121,6 +123,8 @@ describe('configToFormState', () => { expect(state.classifierMaxP95LatencyMs).toBe('1000'); expect(state.classifierModels).toBe(''); expect(state.deciderModels).toEqual([]); + expect(state.autoDeciderModels).toEqual([]); + expect(state.excludedAutoDeciderModels).toBe(''); expect(state.maxConcurrency).toBe(100); expect(state.benchmarkUserId).toBe('ce12ef3d-ae95-4d77-b4f0-23735f0a0591'); expect(state.benchmarkOrgId).toBe('9d278969-5453-4ae3-a51f-a8d2274a7b56'); @@ -128,9 +132,15 @@ describe('configToFormState', () => { }); describe('formStateToConfig round-trip', () => { - const baseConfig = { + const baseConfig: BenchmarkConfig = { classifierModels: ['model-a', 'model-b'], deciderModels: [{ id: 'model-c', reasoningEffort: null }], + manualDeciderModels: [{ id: 'manual-model', reasoningEffort: 'low' }], + autoDeciderModels: [ + { id: 'auto-model', reasoningEffort: null, avgAttemptCostUsd: 21.25 }, + { id: 'excluded-auto-model', reasoningEffort: 'high', avgAttemptCostUsd: 18 }, + ], + excludedAutoDeciderModels: ['excluded-auto-model'], minAccuracy: 0.8, switchCostFactor: 3, maxConcurrency: 4, @@ -149,12 +159,21 @@ describe('formStateToConfig round-trip', () => { expect(state.deciderRepetitions).toBe(2); expect(state.classifierMaxP95LatencyMs).toBe('500'); expect(state.benchmarkOrgId).toBe('org-123'); + expect(state.deciderModels).toEqual([{ id: 'manual-model', reasoningEffort: 'low' }]); + expect(state.autoDeciderModels).toEqual(baseConfig.autoDeciderModels); + expect(state.excludedAutoDeciderModels).toBe('excluded-auto-model'); const result = formStateToConfig(state, baseConfig); expect(result.classifierRepetitions).toBe(3); expect(result.deciderRepetitions).toBe(2); expect(result.classifierMaxP95LatencyMs).toBe(500); expect(result.benchmarkOrgId).toBe('org-123'); + expect(result.manualDeciderModels).toEqual([{ id: 'manual-model', reasoningEffort: 'low' }]); + expect(result.excludedAutoDeciderModels).toEqual(['excluded-auto-model']); + expect(result.deciderModels).toEqual([ + { id: 'manual-model', reasoningEffort: 'low' }, + { id: 'auto-model', reasoningEffort: null }, + ]); }); it('converts empty-string classifierMaxP95LatencyMs form value to null in config', () => { @@ -164,3 +183,26 @@ describe('formStateToConfig round-trip', () => { expect(result.classifierMaxP95LatencyMs).toBeNull(); }); }); + +describe('effectiveDeciderModels', () => { + it('combines manual models with non-excluded auto models and lets manual override an auto duplicate', () => { + expect( + effectiveDeciderModels({ + manualDeciderModels: [ + { id: 'manual/model', reasoningEffort: null }, + { id: 'auto/duplicate', reasoningEffort: 'high' }, + ], + autoDeciderModels: [ + { id: 'auto/duplicate', reasoningEffort: null, avgAttemptCostUsd: 20 }, + { id: 'auto/included', reasoningEffort: 'low', avgAttemptCostUsd: 22 }, + { id: 'auto/excluded', reasoningEffort: null, avgAttemptCostUsd: 23 }, + ], + excludedAutoDeciderModels: ['auto/excluded'], + }) + ).toEqual([ + { id: 'manual/model', reasoningEffort: null }, + { id: 'auto/duplicate', reasoningEffort: 'high' }, + { id: 'auto/included', reasoningEffort: 'low' }, + ]); + }); +}); diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx index 94096fbd1e..bf73a21be8 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx @@ -12,6 +12,7 @@ import { type BenchmarkModelSummary, type RankedCandidate, type ReasoningEffort, + type AutoBenchmarkDeciderModel, } from '@kilocode/auto-routing-contracts'; import React, { useCallback, useEffect, useRef, useState } from 'react'; import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; @@ -119,12 +120,16 @@ type DeciderModelRow = { reasoningEffort: ReasoningEffort | null; }; +type AutoDeciderModelRow = AutoBenchmarkDeciderModel; + const DEFAULT_BENCHMARK_USER_ID = 'ce12ef3d-ae95-4d77-b4f0-23735f0a0591'; const DEFAULT_BENCHMARK_ORG_ID = '9d278969-5453-4ae3-a51f-a8d2274a7b56'; export function configToFormState(config: BenchmarkConfig | null): { classifierModels: string; deciderModels: DeciderModelRow[]; + autoDeciderModels: AutoDeciderModelRow[]; + excludedAutoDeciderModels: string; minAccuracy: number; switchCostFactor: number; maxConcurrency: number; @@ -140,6 +145,8 @@ export function configToFormState(config: BenchmarkConfig | null): { return { classifierModels: '', deciderModels: [], + autoDeciderModels: [], + excludedAutoDeciderModels: '', minAccuracy: 0.7, switchCostFactor: 3, maxConcurrency: 100, @@ -152,10 +159,12 @@ export function configToFormState(config: BenchmarkConfig | null): { } return { classifierModels: config.classifierModels.join('\n'), - deciderModels: config.deciderModels.map(m => ({ + deciderModels: (config.manualDeciderModels ?? config.deciderModels).map(m => ({ id: m.id, reasoningEffort: m.reasoningEffort ?? null, })), + autoDeciderModels: config.autoDeciderModels ?? [], + excludedAutoDeciderModels: (config.excludedAutoDeciderModels ?? []).join('\n'), minAccuracy: config.minAccuracy, switchCostFactor: config.switchCostFactor, maxConcurrency: config.maxConcurrency, @@ -168,20 +177,56 @@ export function configToFormState(config: BenchmarkConfig | null): { }; } -export function formStateToConfig( - state: ReturnType, - base: BenchmarkConfig | null -): BenchmarkConfig { - const classifierModels = state.classifierModels +function parseModelLines(value: string): string[] { + return value .split('\n') .map(s => s.trim()) .filter(s => s.length > 0); - const deciderModels = state.deciderModels +} + +export function effectiveDeciderModels({ + manualDeciderModels, + autoDeciderModels, + excludedAutoDeciderModels, +}: { + manualDeciderModels: DeciderModelRow[]; + autoDeciderModels: AutoDeciderModelRow[]; + excludedAutoDeciderModels: string[]; +}): DeciderModelRow[] { + const manual = manualDeciderModels .filter(row => row.id.trim().length > 0) .map(row => ({ id: row.id.trim(), reasoningEffort: row.reasoningEffort ?? null, })); + const manualIds = new Set(manual.map(model => model.id)); + const excludedAuto = new Set(excludedAutoDeciderModels); + return [ + ...manual, + ...autoDeciderModels + .filter(model => !excludedAuto.has(model.id)) + .filter(model => !manualIds.has(model.id)) + .map(model => ({ + id: model.id, + reasoningEffort: model.reasoningEffort ?? null, + })), + ]; +} + +export function formStateToConfig( + state: ReturnType, + base: BenchmarkConfig | null +): BenchmarkConfig { + const classifierModels = parseModelLines(state.classifierModels); + const excludedAutoDeciderModels = parseModelLines(state.excludedAutoDeciderModels); + const manualDeciderModels = state.deciderModels + .filter(row => row.id.trim().length > 0) + .map(row => ({ id: row.id.trim(), reasoningEffort: row.reasoningEffort ?? null })); + const deciderModels = effectiveDeciderModels({ + manualDeciderModels, + autoDeciderModels: state.autoDeciderModels, + excludedAutoDeciderModels, + }); const benchmarkUserId = state.benchmarkUserId.trim(); const benchmarkOrgId = state.benchmarkOrgId.trim(); const rawLatency = state.classifierMaxP95LatencyMs.trim(); @@ -189,6 +234,9 @@ export function formStateToConfig( return { classifierModels, deciderModels, + manualDeciderModels, + autoDeciderModels: state.autoDeciderModels, + excludedAutoDeciderModels, minAccuracy: state.minAccuracy, switchCostFactor: state.switchCostFactor, maxConcurrency: state.maxConcurrency, @@ -287,6 +335,24 @@ function BenchmarkConfigEditor({ [updateForm] ); + const handleToggleAutoDeciderModel = useCallback( + (modelId: string, included: boolean) => { + updateForm(prev => { + const excluded = new Set(parseModelLines(prev.excludedAutoDeciderModels)); + if (included) { + excluded.delete(modelId); + } else { + excluded.add(modelId); + } + return { + ...prev, + excludedAutoDeciderModels: [...excluded].sort().join('\n'), + }; + }); + }, + [updateForm] + ); + const handleSave = useCallback(() => { saveMutation.mutate(formStateToConfig(form, config)); }, [form, config, saveMutation]); @@ -312,9 +378,9 @@ function BenchmarkConfigEditor({ /> - {/* Decider models table */} + {/* Manual decider models table */}
- +
@@ -389,6 +455,59 @@ function BenchmarkConfigEditor({ + {/* Auto decider models */} +
+
+ + {form.autoDeciderModels.length} synced +
+ {form.autoDeciderModels.length > 0 ? ( +
+
+ + + Model ID + Avg run + Reasoning effort + Included + + + + {form.autoDeciderModels.map(model => { + const excluded = parseModelLines(form.excludedAutoDeciderModels).includes( + model.id + ); + return ( + + {model.id} + + {formatUsd(model.avgAttemptCostUsd)} + + + {model.reasoningEffort ?? 'default'} + + + + handleToggleAutoDeciderModel(model.id, checked === true) + } + aria-label={`${excluded ? 'Include' : 'Exclude'} ${model.id}`} + /> + + + ); + })} + +
+
+ ) : ( +
+ No auto decider models synced yet. +
+ )} +
+ {/* Numeric inputs */}
diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.test.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.test.ts new file mode 100644 index 0000000000..2afdd598c3 --- /dev/null +++ b/apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.test.ts @@ -0,0 +1,50 @@ +import { NextRequest } from 'next/server'; +import { listAutoRoutingDeciderCandidates } from '@/lib/model-stats/auto-routing-decider-candidates'; + +jest.mock('@/lib/config.server', () => ({ + INTERNAL_API_SECRET: 'internal-secret', +})); + +jest.mock('@/lib/model-stats/auto-routing-decider-candidates', () => ({ + AUTO_DECIDER_MIN_COST_USD: 15, + AUTO_DECIDER_MAX_COST_USD: 25, + listAutoRoutingDeciderCandidates: jest.fn(), +})); + +import { GET } from './route'; + +const mockListAutoRoutingDeciderCandidates = jest.mocked(listAutoRoutingDeciderCandidates); + +function createRequest(headers: Record = {}) { + return new NextRequest( + 'http://localhost:3000/api/internal/auto-routing-benchmark/decider-candidates', + { headers } + ); +} + +describe('GET /api/internal/auto-routing-benchmark/decider-candidates', () => { + beforeEach(() => { + jest.clearAllMocks(); + mockListAutoRoutingDeciderCandidates.mockResolvedValue([ + { id: 'model/a', avgAttemptCostUsd: 20.5 }, + ]); + }); + + it('returns 401 without the bearer secret', async () => { + const res = await GET(createRequest()); + + expect(res.status).toBe(401); + expect(mockListAutoRoutingDeciderCandidates).not.toHaveBeenCalled(); + }); + + it('returns synced auto decider candidates for authenticated worker callers', async () => { + const res = await GET(createRequest({ authorization: 'Bearer internal-secret' })); + + expect(res.status).toBe(200); + await expect(res.json()).resolves.toMatchObject({ + candidates: [{ id: 'model/a', avgAttemptCostUsd: 20.5 }], + minCostUsd: 15, + maxCostUsd: 25, + }); + }); +}); diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.ts new file mode 100644 index 0000000000..e7338a8485 --- /dev/null +++ b/apps/web/src/app/api/internal/auto-routing-benchmark/decider-candidates/route.ts @@ -0,0 +1,31 @@ +import type { NextRequest } from 'next/server'; +import { NextResponse } from 'next/server'; +import { timingSafeEqual } from '@kilocode/encryption'; +import { + AUTO_DECIDER_MAX_COST_USD, + AUTO_DECIDER_MIN_COST_USD, + listAutoRoutingDeciderCandidates, +} from '@/lib/model-stats/auto-routing-decider-candidates'; +import { INTERNAL_API_SECRET } from '@/lib/config.server'; + +function extractBearerToken(authHeader: string | null): string | null { + if (!authHeader) return null; + const trimmed = authHeader.trim(); + if (trimmed.slice(0, 7).toLowerCase() !== 'bearer ') return null; + return trimmed.slice(7).trim() || null; +} + +export async function GET(req: NextRequest) { + const token = extractBearerToken(req.headers.get('authorization')); + if (!INTERNAL_API_SECRET || !token || !timingSafeEqual(token, INTERNAL_API_SECRET)) { + return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }); + } + + const candidates = await listAutoRoutingDeciderCandidates(); + return NextResponse.json({ + candidates, + minCostUsd: AUTO_DECIDER_MIN_COST_USD, + maxCostUsd: AUTO_DECIDER_MAX_COST_USD, + generatedAt: new Date().toISOString(), + }); +} diff --git a/apps/web/src/lib/model-stats/auto-routing-decider-candidates.test.ts b/apps/web/src/lib/model-stats/auto-routing-decider-candidates.test.ts new file mode 100644 index 0000000000..4d45fd87e6 --- /dev/null +++ b/apps/web/src/lib/model-stats/auto-routing-decider-candidates.test.ts @@ -0,0 +1,60 @@ +import { describe, expect, it } from '@jest/globals'; +import { + AUTO_DECIDER_MAX_COST_USD, + AUTO_DECIDER_MIN_COST_USD, + summarizeAutoRoutingDeciderCandidates, +} from './auto-routing-decider-candidates'; + +function row( + openrouterId: string, + avgAttemptCostUsd: number, + overrides: { active?: boolean } = {} +) { + return { + openrouterId, + isActive: overrides.active ?? true, + benchmarks: { + kiloBench: { + overallScore: 0.5, + evals: { + 'terminal-bench': { + taskSource: 'terminal-bench', + overallScore: 0.5, + totalScore: 3, + avgCostUsd: 1, + avgInputTokens: 1, + avgOutputTokens: 1, + avgCacheReadTokens: 1, + avgExecutionMs: 1, + nTotalTrials: 6, + nAttempts: 6, + avgAttemptCostUsd, + avgAttemptInputTokens: 1, + avgAttemptOutputTokens: 1, + avgAttemptCacheReadTokens: 1, + nErrored: 0, + lastPromotedAt: '2026-06-01T00:00:00.000Z', + }, + }, + }, + }, + }; +} + +describe('summarizeAutoRoutingDeciderCandidates', () => { + it('keeps active terminal-bench models whose floored average attempt cost is in the auto range', () => { + const candidates = summarizeAutoRoutingDeciderCandidates([ + row('model/too-cheap', AUTO_DECIDER_MIN_COST_USD - 0.01), + row('model/minimum', AUTO_DECIDER_MIN_COST_USD), + row('model/floored-maximum', AUTO_DECIDER_MAX_COST_USD + 0.99), + row('model/too-expensive', AUTO_DECIDER_MAX_COST_USD + 1), + row('model/inactive', 20, { active: false }), + row('kilo-internal/custom', 20), + ]); + + expect(candidates).toEqual([ + { id: 'model/floored-maximum', avgAttemptCostUsd: 25.99 }, + { id: 'model/minimum', avgAttemptCostUsd: 15 }, + ]); + }); +}); diff --git a/apps/web/src/lib/model-stats/auto-routing-decider-candidates.ts b/apps/web/src/lib/model-stats/auto-routing-decider-candidates.ts new file mode 100644 index 0000000000..0fc4b48b47 --- /dev/null +++ b/apps/web/src/lib/model-stats/auto-routing-decider-candidates.ts @@ -0,0 +1,69 @@ +import { CUSTOM_LLM_PREFIX } from '@/lib/ai-gateway/model-utils'; +import { readDb } from '@/lib/drizzle'; +import { ModelStatsBenchmarksSchema, modelStats } from '@kilocode/db/schema'; +import { and, eq, notLike } from 'drizzle-orm'; + +const TerminalBenchSchema = ModelStatsBenchmarksSchema.unwrap() + .pick({ kiloBench: true }) + .optional(); + +export const AUTO_DECIDER_MIN_COST_USD = 15; +export const AUTO_DECIDER_MAX_COST_USD = 25; + +export type AutoRoutingDeciderCandidate = { + id: string; + avgAttemptCostUsd: number; +}; + +type Row = { + openrouterId: string; + isActive: boolean | null; + benchmarks: unknown; +}; + +function isInAutoCostBand(avgAttemptCostUsd: number): boolean { + const floored = Math.floor(avgAttemptCostUsd); + return floored >= AUTO_DECIDER_MIN_COST_USD && floored <= AUTO_DECIDER_MAX_COST_USD; +} + +export function summarizeAutoRoutingDeciderCandidates( + rows: readonly Row[] +): AutoRoutingDeciderCandidate[] { + const candidates: AutoRoutingDeciderCandidate[] = []; + + for (const row of rows) { + if (!row.isActive || row.openrouterId.startsWith(CUSTOM_LLM_PREFIX)) continue; + const result = TerminalBenchSchema.safeParse(row.benchmarks); + if (!result.success) continue; + const bench = result.data?.kiloBench?.evals['terminal-bench']; + if ( + !bench || + (bench.nAttempts ?? 0) < 5 || + bench.avgAttemptCostUsd === null || + bench.avgAttemptCostUsd === undefined || + !isInAutoCostBand(bench.avgAttemptCostUsd) + ) { + continue; + } + candidates.push({ id: row.openrouterId, avgAttemptCostUsd: bench.avgAttemptCostUsd }); + } + + return candidates.sort((left, right) => { + const costDelta = right.avgAttemptCostUsd - left.avgAttemptCostUsd; + return costDelta === 0 ? left.id.localeCompare(right.id) : costDelta; + }); +} + +export async function listAutoRoutingDeciderCandidates(): Promise { + const rows = await readDb + .select({ + openrouterId: modelStats.openrouterId, + isActive: modelStats.isActive, + benchmarks: modelStats.benchmarks, + }) + .from(modelStats) + .where( + and(eq(modelStats.isActive, true), notLike(modelStats.openrouterId, `${CUSTOM_LLM_PREFIX}%`)) + ); + return summarizeAutoRoutingDeciderCandidates(rows); +} diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts index 57fb7b11e6..76e12fc663 100644 --- a/packages/auto-routing-contracts/src/benchmark.ts +++ b/packages/auto-routing-contracts/src/benchmark.ts @@ -18,6 +18,11 @@ export const BenchmarkDeciderModelSchema = z.object({ }); export type BenchmarkDeciderModel = z.infer; +export const AutoBenchmarkDeciderModelSchema = BenchmarkDeciderModelSchema.extend({ + avgAttemptCostUsd: z.number().nonnegative(), +}); +export type AutoBenchmarkDeciderModel = z.infer; + // Flags each list entry whose (trimmed) id already appeared earlier in the // array. Model ids are the D1 primary keys for config_classifier_models / // config_decider_models, so duplicates would otherwise reach the DB as an @@ -40,6 +45,14 @@ export const BenchmarkConfigSchema = z .object({ classifierModels: z.array(z.string().trim().min(1)).min(1), deciderModels: z.array(BenchmarkDeciderModelSchema).min(1), + // Manual additions are operator-pinned decider candidates. When omitted by + // older clients, the worker treats deciderModels as the manual list. + manualDeciderModels: z.array(BenchmarkDeciderModelSchema).optional(), + // Auto additions are refreshed from Kilo Bench cost data by the benchmark + // worker's scheduled sync. The effective deciderModels list is manual + + // non-excluded auto models. + autoDeciderModels: z.array(AutoBenchmarkDeciderModelSchema).optional(), + excludedAutoDeciderModels: z.array(z.string().trim().min(1)).optional(), // Accuracy threshold for "gets the job done" (per taxonomy route). minAccuracy: z.number().min(0).max(1), // Benchmark-wide parallelism budget. Decider runs use it as a live @@ -77,9 +90,39 @@ export const BenchmarkConfigSchema = z 'deciderModels', ctx ); + addDuplicateModelIssues( + (config.manualDeciderModels ?? []).map(m => m.id), + 'manualDeciderModels', + ctx + ); + addDuplicateModelIssues( + (config.autoDeciderModels ?? []).map(m => m.id), + 'autoDeciderModels', + ctx + ); + addDuplicateModelIssues( + config.excludedAutoDeciderModels ?? [], + 'excludedAutoDeciderModels', + ctx + ); }); export type BenchmarkConfig = z.infer; +export const AutoBenchmarkDeciderCandidatesResponseSchema = z.object({ + candidates: z.array( + z.object({ + id: z.string().trim().min(1), + avgAttemptCostUsd: z.number().nonnegative(), + }) + ), + minCostUsd: z.number().nonnegative().optional(), + maxCostUsd: z.number().nonnegative().optional(), + generatedAt: z.string().optional(), +}); +export type AutoBenchmarkDeciderCandidatesResponse = z.infer< + typeof AutoBenchmarkDeciderCandidatesResponseSchema +>; + export const BenchmarkRunStatusSchema = z.enum(['running', 'completed', 'failed']); export type BenchmarkRunStatus = z.infer; diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts index 42e0379fc6..292fbdadad 100644 --- a/packages/auto-routing-contracts/src/contracts.test.ts +++ b/packages/auto-routing-contracts/src/contracts.test.ts @@ -163,6 +163,29 @@ describe('BenchmarkConfigSchema defaults', () => { }); expect(result.success).toBe(true); }); + + it('accepts explicit manual and excluded auto decider model lists', () => { + const result = BenchmarkConfigSchema.parse({ + classifierModels: ['model/a'], + deciderModels: [{ id: 'model/b' }], + manualDeciderModels: [{ id: 'model/c', reasoningEffort: 'high' }], + autoDeciderModels: [{ id: 'model/b', reasoningEffort: null, avgAttemptCostUsd: 21.1 }], + excludedAutoDeciderModels: ['model/d'], + minAccuracy: 0.7, + switchCostFactor: 3, + maxConcurrency: 10, + benchmarkUserId: null, + benchmarkOrgId: null, + updatedAt: null, + updatedBy: null, + }); + + expect(result.manualDeciderModels).toEqual([{ id: 'model/c', reasoningEffort: 'high' }]); + expect(result.autoDeciderModels).toEqual([ + { id: 'model/b', reasoningEffort: null, avgAttemptCostUsd: 21.1 }, + ]); + expect(result.excludedAutoDeciderModels).toEqual(['model/d']); + }); }); describe('BenchmarkConfigSchema duplicate model ids', () => { diff --git a/services/auto-routing-benchmark/migrations/0002_magical_wendell_rand.sql b/services/auto-routing-benchmark/migrations/0002_magical_wendell_rand.sql new file mode 100644 index 0000000000..31cc8b7a7c --- /dev/null +++ b/services/auto-routing-benchmark/migrations/0002_magical_wendell_rand.sql @@ -0,0 +1,10 @@ +CREATE TABLE `config_auto_decider_exclusions` ( + `model` text PRIMARY KEY NOT NULL +); +--> statement-breakpoint +CREATE TABLE `config_auto_decider_models` ( + `model` text PRIMARY KEY NOT NULL, + `reasoning_effort` text, + `avg_attempt_cost_usd` real NOT NULL, + `synced_at` text NOT NULL +); diff --git a/services/auto-routing-benchmark/migrations/meta/0002_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0002_snapshot.json new file mode 100644 index 0000000000..7156487709 --- /dev/null +++ b/services/auto-routing-benchmark/migrations/meta/0002_snapshot.json @@ -0,0 +1,734 @@ +{ + "version": "6", + "dialect": "sqlite", + "id": "3c258229-2360-4f73-bc7e-807239a3336d", + "prevId": "b717d9d9-78c9-43eb-99fa-b0a1db80b78e", + "tables": { + "benchmark_config": { + "name": "benchmark_config", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "min_accuracy": { + "name": "min_accuracy", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "switch_cost_factor": { + "name": "switch_cost_factor", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "max_concurrency": { + "name": "max_concurrency", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "benchmark_user_id": { + "name": "benchmark_user_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "benchmark_org_id": { + "name": "benchmark_org_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "classifier_repetitions": { + "name": "classifier_repetitions", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": 1 + }, + "decider_repetitions": { + "name": "decider_repetitions", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": 1 + }, + "classifier_max_p95_latency_ms": { + "name": "classifier_max_p95_latency_ms", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "updated_at": { + "name": "updated_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "updated_by": { + "name": "updated_by", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "benchmark_runs": { + "name": "benchmark_runs", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "kind": { + "name": "kind", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "started_at": { + "name": "started_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "completed_at": { + "name": "completed_at", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "min_accuracy": { + "name": "min_accuracy", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "switch_cost_factor": { + "name": "switch_cost_factor", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "max_concurrency": { + "name": "max_concurrency", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "benchmark_user_id": { + "name": "benchmark_user_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "benchmark_org_id": { + "name": "benchmark_org_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "repetitions": { + "name": "repetitions", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": 1 + }, + "classifier_max_p95_latency_ms": { + "name": "classifier_max_p95_latency_ms", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "engine_identity": { + "name": "engine_identity", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": "''" + } + }, + "indexes": { + "UQ_benchmark_runs_one_running_per_kind": { + "name": "UQ_benchmark_runs_one_running_per_kind", + "columns": [ + "kind" + ], + "isUnique": true, + "where": "\"benchmark_runs\".\"status\" = 'running'" + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "case_results": { + "name": "case_results", + "columns": { + "run_id": { + "name": "run_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "model": { + "name": "model", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "case_id": { + "name": "case_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "route_key": { + "name": "route_key", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "score": { + "name": "score", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "latency_ms": { + "name": "latency_ms", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "cost_usd": { + "name": "cost_usd", + "type": "real", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "fallback_reason": { + "name": "fallback_reason", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "retried": { + "name": "retried", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "exit_code": { + "name": "exit_code", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "output_prefix": { + "name": "output_prefix", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "event_count": { + "name": "event_count", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "last_event_types": { + "name": "last_event_types", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "rep": { + "name": "rep", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": 0 + }, + "timed_out": { + "name": "timed_out", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": 0 + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": { + "case_results_run_id_model_case_id_rep_pk": { + "columns": [ + "run_id", + "model", + "case_id", + "rep" + ], + "name": "case_results_run_id_model_case_id_rep_pk" + } + }, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "config_auto_decider_exclusions": { + "name": "config_auto_decider_exclusions", + "columns": { + "model": { + "name": "model", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "config_auto_decider_models": { + "name": "config_auto_decider_models", + "columns": { + "model": { + "name": "model", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "reasoning_effort": { + "name": "reasoning_effort", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "avg_attempt_cost_usd": { + "name": "avg_attempt_cost_usd", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "synced_at": { + "name": "synced_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "config_classifier_models": { + "name": "config_classifier_models", + "columns": { + "model": { + "name": "model", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "config_decider_models": { + "name": "config_decider_models", + "columns": { + "model": { + "name": "model", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "reasoning_effort": { + "name": "reasoning_effort", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "model_summaries": { + "name": "model_summaries", + "columns": { + "run_id": { + "name": "run_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "model": { + "name": "model", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "route_key": { + "name": "route_key", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "accuracy": { + "name": "accuracy", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "avg_cost_usd": { + "name": "avg_cost_usd", + "type": "real", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "avg_latency_ms": { + "name": "avg_latency_ms", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "p50_latency_ms": { + "name": "p50_latency_ms", + "type": "real", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "cases": { + "name": "cases", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "errors": { + "name": "errors", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "p95_latency_ms": { + "name": "p95_latency_ms", + "type": "real", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "timeouts": { + "name": "timeouts", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": 0 + }, + "carried": { + "name": "carried", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false, + "default": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": { + "model_summaries_run_id_model_route_key_pk": { + "columns": [ + "run_id", + "model", + "route_key" + ], + "name": "model_summaries_run_id_model_route_key_pk" + } + }, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "routing_table_candidates": { + "name": "routing_table_candidates", + "columns": { + "run_id": { + "name": "run_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "route_key": { + "name": "route_key", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "rank": { + "name": "rank", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "model": { + "name": "model", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "accuracy": { + "name": "accuracy", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "avg_cost_usd": { + "name": "avg_cost_usd", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "meets_threshold": { + "name": "meets_threshold", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "reasoning_effort": { + "name": "reasoning_effort", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": { + "routing_table_candidates_run_id_route_key_rank_pk": { + "columns": [ + "run_id", + "route_key", + "rank" + ], + "name": "routing_table_candidates_run_id_route_key_rank_pk" + } + }, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "routing_tables": { + "name": "routing_tables", + "columns": { + "run_id": { + "name": "run_id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "published_at": { + "name": "published_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "generated_at": { + "name": "generated_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "min_accuracy": { + "name": "min_accuracy", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "switch_cost_factor": { + "name": "switch_cost_factor", + "type": "real", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "source": { + "name": "source", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "run_models": { + "name": "run_models", + "columns": { + "run_id": { + "name": "run_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "model": { + "name": "model", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "enqueued": { + "name": "enqueued", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "reasoning_effort": { + "name": "reasoning_effort", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": { + "run_models_run_id_model_pk": { + "columns": [ + "run_id", + "model" + ], + "name": "run_models_run_id_model_pk" + } + }, + "uniqueConstraints": {}, + "checkConstraints": {} + } + }, + "views": {}, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + }, + "internal": { + "indexes": {} + } +} \ No newline at end of file diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json index da28b77f72..d6cb5b17a7 100644 --- a/services/auto-routing-benchmark/migrations/meta/_journal.json +++ b/services/auto-routing-benchmark/migrations/meta/_journal.json @@ -15,6 +15,13 @@ "when": 1781696079415, "tag": "0001_special_yellow_claw", "breakpoints": true + }, + { + "idx": 2, + "version": "6", + "when": 1781713850969, + "tag": "0002_magical_wendell_rand", + "breakpoints": true } ] } \ No newline at end of file diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts index 162d727b05..2c34b0bcec 100644 --- a/services/auto-routing-benchmark/src/admin.test.ts +++ b/services/auto-routing-benchmark/src/admin.test.ts @@ -62,6 +62,8 @@ const TEST_CONFIG_ROWS = { model: m.id, reasoning_effort: m.reasoningEffort ?? null, })), + autoDeciderModels: [], + excludedAutoDeciderModels: [], }; // --------------------------------------------------------------------------- @@ -153,6 +155,8 @@ beforeEach(() => { config: null, classifierModels: [], deciderModels: [], + autoDeciderModels: [], + excludedAutoDeciderModels: [], }); vi.mocked(replaceConfig).mockResolvedValue(undefined); vi.mocked(listRuns).mockResolvedValue([]); @@ -219,6 +223,8 @@ describe('GET /admin/config', () => { }, classifierModels, deciderModels, + autoDeciderModels: [], + excludedAutoDeciderModels: [], }); const res = await authedGet('/admin/config'); @@ -278,6 +284,13 @@ describe('PUT /admin/config', () => { const validConfig = { ...TEST_CONFIG, minAccuracy: 0.85, + deciderModels: [ + { id: 'manual/model', reasoningEffort: 'low' }, + { id: 'auto/model', reasoningEffort: null }, + ], + manualDeciderModels: [{ id: 'manual/model', reasoningEffort: 'low' }], + autoDeciderModels: [{ id: 'auto/model', reasoningEffort: null, avgAttemptCostUsd: 20 }], + excludedAutoDeciderModels: ['auto/excluded'], updatedAt: null, updatedBy: null, }; @@ -295,10 +308,13 @@ describe('PUT /admin/config', () => { expect(typeof body.config.updatedAt).toBe('string'); expect(replaceConfig).toHaveBeenCalledOnce(); - const [, configArg] = vi.mocked(replaceConfig).mock.calls[0]; + const [, configArg, , deciderModelRows, excludedAutoDeciderModels] = + vi.mocked(replaceConfig).mock.calls[0]; expect(configArg.min_accuracy).toBe(0.85); expect(typeof configArg.updated_at).toBe('string'); expect(configArg.updated_by).toBe('igor@kilocode.ai'); + expect(deciderModelRows).toEqual([{ model: 'manual/model', reasoning_effort: 'low' }]); + expect(excludedAutoDeciderModels).toEqual(['auto/excluded']); }); }); diff --git a/services/auto-routing-benchmark/src/auto-decider-sync.test.ts b/services/auto-routing-benchmark/src/auto-decider-sync.test.ts new file mode 100644 index 0000000000..cf7e64e33d --- /dev/null +++ b/services/auto-routing-benchmark/src/auto-decider-sync.test.ts @@ -0,0 +1,142 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import type * as DbModule from './db'; +import { syncAutoDeciderModels } from './auto-decider-sync'; + +vi.mock('./db', async importOriginal => { + const actual = await importOriginal(); + return { + ...actual, + getConfigRows: vi.fn(), + replaceAutoDeciderModels: vi.fn(), + getRunningRun: vi.fn(), + getLatestSummariesByModel: vi.fn(), + insertRun: vi.fn(), + markStaleRunsFailed: vi.fn(), + }; +}); + +import { + getConfigRows, + getLatestSummariesByModel, + getRunningRun, + insertRun, + markStaleRunsFailed, + replaceAutoDeciderModels, +} from './db'; + +const tokenGet = vi.fn<() => Promise>(); +const queueSendBatch = vi.fn(); +const fetchImpl = vi.fn(); + +const env = { + INTERNAL_API_SECRET_PROD: { get: tokenGet }, + BENCH_DB: {} as D1Database, + BENCH_QUEUE: { sendBatch: queueSendBatch }, + AUTO_ROUTING_CONFIG: { delete: vi.fn() }, + KILO_WEB_API_BASE_URL: 'https://app.test', + KILO_CLI_API_URL: 'https://api.test', +} as unknown as Env; + +const config = { + id: 1 as const, + min_accuracy: 0.7, + switch_cost_factor: 3, + max_concurrency: 100, + benchmark_user_id: 'user-123', + benchmark_org_id: null, + classifier_repetitions: 1, + decider_repetitions: 1, + classifier_max_p95_latency_ms: 1000, + updated_at: '2026-06-01T00:00:00.000Z', + updated_by: null, +}; + +describe('syncAutoDeciderModels', () => { + beforeEach(() => { + vi.clearAllMocks(); + tokenGet.mockResolvedValue('secret'); + fetchImpl.mockResolvedValue( + new Response( + JSON.stringify({ + candidates: [ + { id: 'auto/existing', avgAttemptCostUsd: 18 }, + { id: 'auto/new', avgAttemptCostUsd: 21.75 }, + ], + }), + { status: 200, headers: { 'content-type': 'application/json' } } + ) + ); + vi.mocked(getConfigRows).mockResolvedValue({ + config, + classifierModels: ['classifier/model'], + deciderModels: [{ model: 'manual/model', reasoning_effort: null }], + autoDeciderModels: [ + { + model: 'auto/existing', + reasoning_effort: 'high', + avg_attempt_cost_usd: 18, + synced_at: '2026-06-01T00:00:00.000Z', + }, + ], + excludedAutoDeciderModels: [], + }); + vi.mocked(replaceAutoDeciderModels).mockResolvedValue(undefined); + vi.mocked(markStaleRunsFailed).mockResolvedValue(undefined); + vi.mocked(getRunningRun).mockResolvedValue(undefined); + vi.mocked(getLatestSummariesByModel).mockResolvedValue(new Map()); + vi.mocked(insertRun).mockResolvedValue(undefined); + queueSendBatch.mockResolvedValue(undefined); + }); + + it('persists auto candidates, preserves existing reasoning effort, and starts a decider run for new effective models', async () => { + const result = await syncAutoDeciderModels(env, { fetchImpl }); + + expect(fetchImpl).toHaveBeenCalledWith( + 'https://app.test/api/internal/auto-routing-benchmark/decider-candidates', + expect.objectContaining({ + headers: expect.objectContaining({ authorization: 'Bearer secret' }), + }) + ); + expect(replaceAutoDeciderModels).toHaveBeenCalledWith(env.BENCH_DB, [ + expect.objectContaining({ model: 'auto/existing', reasoning_effort: 'high' }), + expect.objectContaining({ model: 'auto/new', reasoning_effort: null }), + ]); + expect(insertRun).toHaveBeenCalledOnce(); + expect(result).toMatchObject({ + addedModels: ['auto/new'], + removedModels: [], + startedRun: true, + }); + }); + + it('does not fail the sync when a decider run is already active', async () => { + vi.mocked(getRunningRun).mockResolvedValue({ + id: 'decider-active', + kind: 'decider', + status: 'running', + started_at: '2026-06-01T00:00:00.000Z', + completed_at: null, + error: null, + min_accuracy: 0.7, + switch_cost_factor: 3, + max_concurrency: 100, + benchmark_user_id: 'user-123', + benchmark_org_id: null, + repetitions: 1, + classifier_max_p95_latency_ms: null, + engine_identity: 'v1:test', + }); + + const result = await syncAutoDeciderModels(env, { fetchImpl }); + + expect(result).toMatchObject({ + addedModels: ['auto/new'], + removedModels: [], + startedRun: false, + runId: null, + skippedReason: 'active-run', + activeRunId: 'decider-active', + }); + expect(insertRun).not.toHaveBeenCalled(); + }); +}); diff --git a/services/auto-routing-benchmark/src/auto-decider-sync.ts b/services/auto-routing-benchmark/src/auto-decider-sync.ts new file mode 100644 index 0000000000..5880bc39c8 --- /dev/null +++ b/services/auto-routing-benchmark/src/auto-decider-sync.ts @@ -0,0 +1,130 @@ +import { + AutoBenchmarkDeciderCandidatesResponseSchema, + type BenchmarkDeciderModel, +} from '@kilocode/auto-routing-contracts'; +import { getBenchmarkConfig, mapConfigRows } from './config'; +import { getConfigRows, replaceAutoDeciderModels, type ConfigAutoDeciderModelRow } from './db'; +import { RunAlreadyActiveError, startRun } from './run'; + +type SyncOptions = { + fetchImpl?: typeof fetch; + now?: Date; +}; + +export type AutoDeciderSyncResult = { + addedModels: string[]; + removedModels: string[]; + startedRun: boolean; + runId: string | null; + skippedReason?: 'active-run'; + activeRunId?: string; +}; + +function modelKey(model: BenchmarkDeciderModel): string { + return `${model.id}\0${model.reasoningEffort ?? ''}`; +} + +function diffModels( + before: readonly BenchmarkDeciderModel[], + after: readonly BenchmarkDeciderModel[] +): { added: string[]; removed: string[] } { + const beforeKeys = new Set(before.map(modelKey)); + const afterKeys = new Set(after.map(modelKey)); + return { + added: after.filter(model => !beforeKeys.has(modelKey(model))).map(model => model.id), + removed: before.filter(model => !afterKeys.has(modelKey(model))).map(model => model.id), + }; +} + +async function fetchAutoDeciderCandidates( + env: Env, + fetchImpl: typeof fetch +): Promise<{ id: string; avgAttemptCostUsd: number }[]> { + const secret = await env.INTERNAL_API_SECRET_PROD.get(); + const response = await fetchImpl( + `${env.KILO_WEB_API_BASE_URL}/api/internal/auto-routing-benchmark/decider-candidates`, + { + headers: { + authorization: `Bearer ${secret}`, + }, + } + ); + if (!response.ok) { + const detail = (await response.text().catch(() => '')).slice(0, 200); + throw new Error(`auto decider candidate sync failed: HTTP ${response.status} ${detail}`); + } + const parsed = AutoBenchmarkDeciderCandidatesResponseSchema.safeParse(await response.json()); + if (!parsed.success) throw new Error('auto decider candidate sync returned unexpected response'); + return parsed.data.candidates; +} + +export async function syncAutoDeciderModels( + env: Env, + options: SyncOptions = {} +): Promise { + const fetchImpl = options.fetchImpl ?? fetch; + const syncedAt = (options.now ?? new Date()).toISOString(); + const beforeRows = await getConfigRows(env.BENCH_DB); + const beforeConfig = mapConfigRows( + beforeRows.config, + beforeRows.classifierModels, + beforeRows.deciderModels, + beforeRows.autoDeciderModels, + beforeRows.excludedAutoDeciderModels + ); + + const candidates = await fetchAutoDeciderCandidates(env, fetchImpl); + const previousReasoningEffort = new Map(); + for (const row of beforeRows.autoDeciderModels) { + previousReasoningEffort.set(row.model, row.reasoning_effort); + } + for (const row of beforeRows.deciderModels) { + previousReasoningEffort.set(row.model, row.reasoning_effort); + } + + const nextAutoRows: ConfigAutoDeciderModelRow[] = candidates.map(candidate => ({ + model: candidate.id, + reasoning_effort: previousReasoningEffort.get(candidate.id) ?? null, + avg_attempt_cost_usd: candidate.avgAttemptCostUsd, + synced_at: syncedAt, + })); + + await replaceAutoDeciderModels(env.BENCH_DB, nextAutoRows); + + const afterConfig = mapConfigRows( + beforeRows.config, + beforeRows.classifierModels, + beforeRows.deciderModels, + nextAutoRows, + beforeRows.excludedAutoDeciderModels + ); + const diff = diffModels(beforeConfig?.deciderModels ?? [], afterConfig?.deciderModels ?? []); + const changed = diff.added.length > 0 || diff.removed.length > 0; + + if (!changed || !(await getBenchmarkConfig(env.BENCH_DB))) { + return { addedModels: diff.added, removedModels: diff.removed, startedRun: false, runId: null }; + } + + let run: Awaited>; + try { + run = await startRun(env, 'decider'); + } catch (error) { + if (error instanceof RunAlreadyActiveError) { + return { + addedModels: diff.added, + removedModels: diff.removed, + startedRun: false, + runId: null, + skippedReason: 'active-run', + activeRunId: error.activeRunId, + }; + } + throw error; + } + return { + addedModels: diff.added, + removedModels: diff.removed, + startedRun: true, + runId: run.runId, + }; +} diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts index 3e80a08259..55566dc7a0 100644 --- a/services/auto-routing-benchmark/src/config.test.ts +++ b/services/auto-routing-benchmark/src/config.test.ts @@ -23,23 +23,32 @@ const deciderRows: ConfigDeciderModelRow[] = [ }, ]; +const autoRows = [ + { + model: 'auto/model', + reasoning_effort: null, + avg_attempt_cost_usd: 19.75, + synced_at: '2026-06-01T01:00:00.000Z', + }, +]; + describe('mapConfigRows', () => { it('returns null when config row is null', () => { - expect(mapConfigRows(null, ['some/model'], deciderRows)).toBeNull(); + expect(mapConfigRows(null, ['some/model'], deciderRows, autoRows, [])).toBeNull(); }); it('returns null when classifierModels array is empty', () => { - expect(mapConfigRows(configRow, [], deciderRows)).toBeNull(); + expect(mapConfigRows(configRow, [], deciderRows, autoRows, [])).toBeNull(); }); it('returns null when deciderModels array is empty', () => { - expect(mapConfigRows(configRow, ['some/model'], [])).toBeNull(); + expect(mapConfigRows(configRow, ['some/model'], [], [], [])).toBeNull(); }); it('maps a full config row set to BenchmarkConfig', () => { const classifierModels = ['some/model-a', 'some/model-b']; - const result = mapConfigRows(configRow, classifierModels, deciderRows); + const result = mapConfigRows(configRow, classifierModels, deciderRows, autoRows, []); expect(result).not.toBeNull(); expect(result?.minAccuracy).toBe(0.85); @@ -50,11 +59,28 @@ describe('mapConfigRows', () => { expect(result?.updatedAt).toBe('2026-06-01T00:00:00.000Z'); expect(result?.updatedBy).toBe('admin@example.com'); expect(result?.classifierModels).toEqual(classifierModels); - expect(result?.deciderModels).toHaveLength(1); + expect(result?.deciderModels).toHaveLength(2); expect(result?.deciderModels[0].id).toBe('some/decider'); expect(result?.deciderModels[0].reasoningEffort).toBe('high'); + expect(result?.manualDeciderModels).toEqual([{ id: 'some/decider', reasoningEffort: 'high' }]); + expect(result?.autoDeciderModels).toEqual([ + { id: 'auto/model', reasoningEffort: null, avgAttemptCostUsd: 19.75 }, + ]); expect(result?.classifierRepetitions).toBe(1); expect(result?.deciderRepetitions).toBe(1); expect(result?.classifierMaxP95LatencyMs).toBeNull(); }); + + it('excludes only auto decider models, leaving a manual model with the same id included', () => { + const result = mapConfigRows( + configRow, + ['some/model'], + [{ model: 'auto/model', reasoning_effort: 'medium' }], + autoRows, + ['auto/model'] + ); + + expect(result?.deciderModels).toEqual([{ id: 'auto/model', reasoningEffort: 'medium' }]); + expect(result?.excludedAutoDeciderModels).toEqual(['auto/model']); + }); }); diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts index e4091ac0dd..7e99fcd47b 100644 --- a/services/auto-routing-benchmark/src/config.ts +++ b/services/auto-routing-benchmark/src/config.ts @@ -1,5 +1,10 @@ import type { BenchmarkConfig } from '@kilocode/auto-routing-contracts'; -import { getConfigRows, replaceConfig, type ConfigDeciderModelRow } from './db'; +import { + getConfigRows, + replaceConfig, + type ConfigAutoDeciderModelRow, + type ConfigDeciderModelRow, +} from './db'; // Maps the three normalized config tables to the BenchmarkConfig contract. // Null when no admin has saved a config yet — the worker never fabricates @@ -18,19 +23,39 @@ export function mapConfigRows( updated_by: string | null; } | null, classifierModels: string[], - deciderModelRows: ConfigDeciderModelRow[] + deciderModelRows: ConfigDeciderModelRow[], + autoDeciderModelRows: ConfigAutoDeciderModelRow[] = [], + excludedAutoDeciderModels: string[] = [] ): BenchmarkConfig | null { - if (configRow === null || classifierModels.length === 0 || deciderModelRows.length === 0) { + const excludedAuto = new Set(excludedAutoDeciderModels); + const manualDeciderModels = deciderModelRows.map(r => ({ + id: r.model, + reasoningEffort: + r.reasoning_effort as BenchmarkConfig['deciderModels'][number]['reasoningEffort'], + })); + const manualIds = new Set(manualDeciderModels.map(model => model.id)); + const autoDeciderModels = autoDeciderModelRows.map(r => ({ + id: r.model, + reasoningEffort: + r.reasoning_effort as BenchmarkConfig['deciderModels'][number]['reasoningEffort'], + avgAttemptCostUsd: r.avg_attempt_cost_usd, + })); + const effectiveAutoDeciderModels = autoDeciderModels + .filter(model => !excludedAuto.has(model.id)) + .filter(model => !manualIds.has(model.id)) + .map(model => ({ id: model.id, reasoningEffort: model.reasoningEffort })); + const deciderModels = [...manualDeciderModels, ...effectiveAutoDeciderModels]; + + if (configRow === null || classifierModels.length === 0 || deciderModels.length === 0) { return null; } return { classifierModels, - deciderModels: deciderModelRows.map(r => ({ - id: r.model, - reasoningEffort: - r.reasoning_effort as BenchmarkConfig['deciderModels'][number]['reasoningEffort'], - })), + deciderModels, + manualDeciderModels, + autoDeciderModels, + excludedAutoDeciderModels, minAccuracy: configRow.min_accuracy, switchCostFactor: configRow.switch_cost_factor, maxConcurrency: configRow.max_concurrency, @@ -45,8 +70,15 @@ export function mapConfigRows( } export async function getBenchmarkConfig(db: D1Database): Promise { - const { config, classifierModels, deciderModels } = await getConfigRows(db); - return mapConfigRows(config, classifierModels, deciderModels); + const { config, classifierModels, deciderModels, autoDeciderModels, excludedAutoDeciderModels } = + await getConfigRows(db); + return mapConfigRows( + config, + classifierModels, + deciderModels, + autoDeciderModels, + excludedAutoDeciderModels + ); } export async function saveBenchmarkConfig( @@ -57,7 +89,8 @@ export async function saveBenchmarkConfig( const updatedAt = new Date().toISOString(); const stamped: BenchmarkConfig = { ...config, updatedAt, updatedBy }; - const deciderModelRows: ConfigDeciderModelRow[] = config.deciderModels.map(m => ({ + const manualDeciderModels = config.manualDeciderModels ?? config.deciderModels; + const deciderModelRows: ConfigDeciderModelRow[] = manualDeciderModels.map(m => ({ model: m.id, reasoning_effort: m.reasoningEffort ?? null, })); @@ -77,7 +110,8 @@ export async function saveBenchmarkConfig( updated_by: updatedBy, }, config.classifierModels, - deciderModelRows + deciderModelRows, + config.excludedAutoDeciderModels ?? [] ); return stamped; diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts index 6b400cefa3..b191bff2e0 100644 --- a/services/auto-routing-benchmark/src/db-schema.ts +++ b/services/auto-routing-benchmark/src/db-schema.ts @@ -28,6 +28,17 @@ export const configDeciderModels = sqliteTable('config_decider_models', { reasoning_effort: text('reasoning_effort'), }); +export const configAutoDeciderModels = sqliteTable('config_auto_decider_models', { + model: text('model').primaryKey(), + reasoning_effort: text('reasoning_effort'), + avg_attempt_cost_usd: real('avg_attempt_cost_usd').notNull(), + synced_at: text('synced_at').notNull(), +}); + +export const configAutoDeciderExclusions = sqliteTable('config_auto_decider_exclusions', { + model: text('model').primaryKey(), +}); + export const benchmarkRuns = sqliteTable( 'benchmark_runs', { diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts index 7f21ed2526..f4bbac38a3 100644 --- a/services/auto-routing-benchmark/src/db.ts +++ b/services/auto-routing-benchmark/src/db.ts @@ -14,6 +14,8 @@ import { benchmarkConfig, benchmarkRuns, caseResults, + configAutoDeciderExclusions, + configAutoDeciderModels, configClassifierModels, configDeciderModels, modelSummaries, @@ -27,6 +29,7 @@ export type CaseResultRow = typeof caseResults.$inferSelect; export type RunRow = typeof benchmarkRuns.$inferSelect; export type RunModelRow = typeof runModels.$inferSelect; export type ConfigDeciderModelRow = typeof configDeciderModels.$inferSelect; +export type ConfigAutoDeciderModelRow = typeof configAutoDeciderModels.$inferSelect; type ModelSummaryRow = typeof modelSummaries.$inferSelect; // D1 rejects statements with too many bound variables. A model summary insert @@ -78,17 +81,24 @@ export async function getConfigRows(db: D1Database): Promise<{ config: typeof benchmarkConfig.$inferSelect | null; classifierModels: string[]; deciderModels: ConfigDeciderModelRow[]; + autoDeciderModels: ConfigAutoDeciderModelRow[]; + excludedAutoDeciderModels: string[]; }> { const orm = drizzle(db); - const [configRows, classifierRows, deciderRows] = await Promise.all([ - orm.select().from(benchmarkConfig).where(eq(benchmarkConfig.id, 1)).limit(1), - orm.select().from(configClassifierModels), - orm.select().from(configDeciderModels), - ]); + const [configRows, classifierRows, deciderRows, autoDeciderRows, exclusionRows] = + await Promise.all([ + orm.select().from(benchmarkConfig).where(eq(benchmarkConfig.id, 1)).limit(1), + orm.select().from(configClassifierModels), + orm.select().from(configDeciderModels), + orm.select().from(configAutoDeciderModels), + orm.select().from(configAutoDeciderExclusions), + ]); return { config: configRows[0] ?? null, classifierModels: classifierRows.map(r => r.model), deciderModels: deciderRows, + autoDeciderModels: autoDeciderRows, + excludedAutoDeciderModels: exclusionRows.map(r => r.model), }; } @@ -107,7 +117,8 @@ export async function replaceConfig( updated_by: string | null; }, classifierModels: string[], - deciderModels: ConfigDeciderModelRow[] + deciderModels: ConfigDeciderModelRow[], + excludedAutoDeciderModels: string[] = [] ): Promise { const orm = drizzle(db); const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [ @@ -120,6 +131,7 @@ export async function replaceConfig( }), orm.delete(configClassifierModels), orm.delete(configDeciderModels), + orm.delete(configAutoDeciderExclusions), ]; if (classifierModels.length > 0) { stmts.push( @@ -129,6 +141,27 @@ export async function replaceConfig( if (deciderModels.length > 0) { stmts.push(orm.insert(configDeciderModels).values(deciderModels)); } + if (excludedAutoDeciderModels.length > 0) { + stmts.push( + orm + .insert(configAutoDeciderExclusions) + .values(excludedAutoDeciderModels.map(model => ({ model }))) + ); + } + await orm.batch(stmts); +} + +export async function replaceAutoDeciderModels( + db: D1Database, + autoDeciderModels: ConfigAutoDeciderModelRow[] +): Promise { + const orm = drizzle(db); + const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [ + orm.delete(configAutoDeciderModels), + ]; + if (autoDeciderModels.length > 0) { + stmts.push(orm.insert(configAutoDeciderModels).values(autoDeciderModels)); + } await orm.batch(stmts); } diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts index 75cacb902c..7efbe7b296 100644 --- a/services/auto-routing-benchmark/src/index.ts +++ b/services/auto-routing-benchmark/src/index.ts @@ -2,6 +2,7 @@ import { Hono } from 'hono'; import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils'; import { registerAdminRoutes } from './admin'; import { authMiddleware } from './auth'; +import { syncAutoDeciderModels } from './auto-decider-sync'; import type { HonoEnv } from './hono-env'; import { processJob, type BenchmarkJobMessage } from './run'; @@ -19,6 +20,16 @@ app.onError(createErrorHandler()); export default { fetch: app.fetch, + async scheduled(controller: ScheduledController, env: Env): Promise { + const result = await syncAutoDeciderModels(env); + console.log( + JSON.stringify({ + event: 'auto_decider_model_sync_completed', + cron: controller.cron, + ...result, + }) + ); + }, async queue(batch: MessageBatch, env: Env): Promise { for (const message of batch.messages) { // Deliberately no try/catch: a throw from processJob (transient token, diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc index c0433b1073..fa08071eaa 100644 --- a/services/auto-routing-benchmark/wrangler.jsonc +++ b/services/auto-routing-benchmark/wrangler.jsonc @@ -15,6 +15,7 @@ "custom_domain": true, }, ], + "triggers": { "crons": ["0 9 * * *"] }, "dev": { "port": 8814, "local_protocol": "http", "ip": "0.0.0.0" }, "observability": { "enabled": true }, "vars": { From 4b5212f125b66137e238aabef1c1c1dfc6870fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20=C5=A0=C4=87eki=C4=87?= Date: Wed, 17 Jun 2026 19:35:10 +0200 Subject: [PATCH 2/5] feat(auto-routing): configure auto decider cost bounds --- .../benchmark-config/route.test.ts | 2 + .../auto-routing/BenchmarksSection.test.ts | 10 +- .../admin/auto-routing/BenchmarksSection.tsx | 48 ++ .../decider-candidates/route.test.ts | 21 + .../decider-candidates/route.ts | 24 +- ...uto-routing-benchmark-admin-client.test.ts | 2 + .../auto-routing-decider-candidates.test.ts | 31 +- .../auto-routing-decider-candidates.ts | 35 +- .../auto-routing-contracts/src/benchmark.ts | 14 + .../src/contracts.test.ts | 27 +- .../migrations/0003_chunky_ogun.sql | 2 + .../migrations/meta/0003_snapshot.json | 750 ++++++++++++++++++ .../migrations/meta/_journal.json | 7 + .../auto-routing-benchmark/src/admin.test.ts | 8 + .../src/auto-decider-sync.test.ts | 4 +- .../src/auto-decider-sync.ts | 28 +- .../auto-routing-benchmark/src/config.test.ts | 4 + services/auto-routing-benchmark/src/config.ts | 6 + .../src/db-replace-summaries.test.ts | 45 +- .../auto-routing-benchmark/src/db-schema.ts | 2 + services/auto-routing-benchmark/src/db.ts | 7 +- 21 files changed, 1042 insertions(+), 35 deletions(-) create mode 100644 services/auto-routing-benchmark/migrations/0003_chunky_ogun.sql create mode 100644 services/auto-routing-benchmark/migrations/meta/0003_snapshot.json diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts index e708bd01b9..ee6ca77392 100644 --- a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts @@ -78,6 +78,8 @@ const validConfig = { classifierRepetitions: 1, deciderRepetitions: 1, classifierMaxP95LatencyMs: 1000, + autoDeciderMinCostUsd: 15, + autoDeciderMaxCostUsd: 25, updatedAt: null, updatedBy: null, }; diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts index 059275664d..6f0e74bf6a 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts @@ -121,6 +121,8 @@ describe('configToFormState', () => { expect(state.classifierRepetitions).toBe(1); expect(state.deciderRepetitions).toBe(1); expect(state.classifierMaxP95LatencyMs).toBe('1000'); + expect(state.autoDeciderMinCostUsd).toBe(15); + expect(state.autoDeciderMaxCostUsd).toBe(25); expect(state.classifierModels).toBe(''); expect(state.deciderModels).toEqual([]); expect(state.autoDeciderModels).toEqual([]); @@ -149,15 +151,19 @@ describe('formStateToConfig round-trip', () => { classifierRepetitions: 3, deciderRepetitions: 2, classifierMaxP95LatencyMs: 500, + autoDeciderMinCostUsd: 12, + autoDeciderMaxCostUsd: 24, updatedAt: null, updatedBy: null, }; - it('preserves classifierRepetitions, deciderRepetitions, and classifierMaxP95LatencyMs', () => { + it('preserves repetitions, classifierMaxP95LatencyMs, and auto decider cost bounds', () => { const state = configToFormState(baseConfig); expect(state.classifierRepetitions).toBe(3); expect(state.deciderRepetitions).toBe(2); expect(state.classifierMaxP95LatencyMs).toBe('500'); + expect(state.autoDeciderMinCostUsd).toBe(12); + expect(state.autoDeciderMaxCostUsd).toBe(24); expect(state.benchmarkOrgId).toBe('org-123'); expect(state.deciderModels).toEqual([{ id: 'manual-model', reasoningEffort: 'low' }]); expect(state.autoDeciderModels).toEqual(baseConfig.autoDeciderModels); @@ -167,6 +173,8 @@ describe('formStateToConfig round-trip', () => { expect(result.classifierRepetitions).toBe(3); expect(result.deciderRepetitions).toBe(2); expect(result.classifierMaxP95LatencyMs).toBe(500); + expect(result.autoDeciderMinCostUsd).toBe(12); + expect(result.autoDeciderMaxCostUsd).toBe(24); expect(result.benchmarkOrgId).toBe('org-123'); expect(result.manualDeciderModels).toEqual([{ id: 'manual-model', reasoningEffort: 'low' }]); expect(result.excludedAutoDeciderModels).toEqual(['excluded-auto-model']); diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx index bf73a21be8..2e79f3f6a6 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx @@ -1,6 +1,8 @@ 'use client'; import { + AUTO_DECIDER_DEFAULT_MAX_COST_USD, + AUTO_DECIDER_DEFAULT_MIN_COST_USD, BenchmarkConfigResponseSchema, BenchmarkRoutingTableResponseSchema, BenchmarkRunsResponseSchema, @@ -138,6 +140,8 @@ export function configToFormState(config: BenchmarkConfig | null): { classifierRepetitions: number; deciderRepetitions: number; classifierMaxP95LatencyMs: string; + autoDeciderMinCostUsd: number; + autoDeciderMaxCostUsd: number; } { if (config === null) { // No config saved yet: the worker fabricates nothing, so the form starts @@ -155,6 +159,8 @@ export function configToFormState(config: BenchmarkConfig | null): { classifierRepetitions: 1, deciderRepetitions: 1, classifierMaxP95LatencyMs: '1000', + autoDeciderMinCostUsd: AUTO_DECIDER_DEFAULT_MIN_COST_USD, + autoDeciderMaxCostUsd: AUTO_DECIDER_DEFAULT_MAX_COST_USD, }; } return { @@ -174,6 +180,8 @@ export function configToFormState(config: BenchmarkConfig | null): { deciderRepetitions: config.deciderRepetitions, classifierMaxP95LatencyMs: config.classifierMaxP95LatencyMs !== null ? String(config.classifierMaxP95LatencyMs) : '', + autoDeciderMinCostUsd: config.autoDeciderMinCostUsd, + autoDeciderMaxCostUsd: config.autoDeciderMaxCostUsd, }; } @@ -245,6 +253,8 @@ export function formStateToConfig( classifierRepetitions: state.classifierRepetitions, deciderRepetitions: state.deciderRepetitions, classifierMaxP95LatencyMs, + autoDeciderMinCostUsd: state.autoDeciderMinCostUsd, + autoDeciderMaxCostUsd: state.autoDeciderMaxCostUsd, updatedAt: base?.updatedAt ?? null, updatedBy: base?.updatedBy ?? null, }; @@ -561,6 +571,44 @@ function BenchmarkConfigEditor({ className="h-8 w-40 tabular-nums" />
+
+ + + updateForm(prev => ({ + ...prev, + autoDeciderMinCostUsd: parseFloat(e.target.value) || 0, + })) + } + className="h-8 w-40 tabular-nums" + /> +
+
+ + + updateForm(prev => ({ + ...prev, + autoDeciderMaxCostUsd: parseFloat(e.target.value) || 0, + })) + } + className="h-8 w-40 tabular-nums" + /> +