diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts index e708bd01b9..ee6ca77392 100644 --- a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts @@ -78,6 +78,8 @@ const validConfig = { classifierRepetitions: 1, deciderRepetitions: 1, classifierMaxP95LatencyMs: 1000, + autoDeciderMinCostUsd: 15, + autoDeciderMaxCostUsd: 25, updatedAt: null, updatedBy: null, }; diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts index 8796256337..6f0e74bf6a 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts @@ -1,9 +1,11 @@ import { describe, expect, it } from '@jest/globals'; +import type { BenchmarkConfig } from '@kilocode/auto-routing-contracts'; import React from 'react'; import { renderToStaticMarkup } from 'react-dom/server'; import { configToFormState, costPerAccuracy, + effectiveDeciderModels, formatCostPerAccuracy, formatAccuracy, formatUsd, @@ -119,8 +121,12 @@ describe('configToFormState', () => { expect(state.classifierRepetitions).toBe(1); expect(state.deciderRepetitions).toBe(1); expect(state.classifierMaxP95LatencyMs).toBe('1000'); + expect(state.autoDeciderMinCostUsd).toBe(15); + expect(state.autoDeciderMaxCostUsd).toBe(25); expect(state.classifierModels).toBe(''); expect(state.deciderModels).toEqual([]); + expect(state.autoDeciderModels).toEqual([]); + expect(state.excludedAutoDeciderModels).toBe(''); expect(state.maxConcurrency).toBe(100); expect(state.benchmarkUserId).toBe('ce12ef3d-ae95-4d77-b4f0-23735f0a0591'); expect(state.benchmarkOrgId).toBe('9d278969-5453-4ae3-a51f-a8d2274a7b56'); @@ -128,9 +134,15 @@ describe('configToFormState', () => { }); describe('formStateToConfig round-trip', () => { - const baseConfig = { + const baseConfig: BenchmarkConfig = { classifierModels: ['model-a', 'model-b'], deciderModels: [{ id: 'model-c', reasoningEffort: null }], + manualDeciderModels: [{ id: 'manual-model', reasoningEffort: 'low' }], + autoDeciderModels: [ + { id: 'auto-model', reasoningEffort: null, avgAttemptCostUsd: 21.25 }, + { id: 'excluded-auto-model', reasoningEffort: 'high', avgAttemptCostUsd: 18 }, + ], + excludedAutoDeciderModels: ['excluded-auto-model'], minAccuracy: 0.8, switchCostFactor: 3, maxConcurrency: 4, @@ -139,22 +151,37 @@ describe('formStateToConfig round-trip', () => { classifierRepetitions: 3, deciderRepetitions: 2, classifierMaxP95LatencyMs: 500, + autoDeciderMinCostUsd: 12, + autoDeciderMaxCostUsd: 24, updatedAt: null, updatedBy: null, }; - it('preserves classifierRepetitions, deciderRepetitions, and classifierMaxP95LatencyMs', () => { + it('preserves repetitions, classifierMaxP95LatencyMs, and auto decider cost bounds', () => { const state = configToFormState(baseConfig); expect(state.classifierRepetitions).toBe(3); expect(state.deciderRepetitions).toBe(2); expect(state.classifierMaxP95LatencyMs).toBe('500'); + expect(state.autoDeciderMinCostUsd).toBe(12); + expect(state.autoDeciderMaxCostUsd).toBe(24); expect(state.benchmarkOrgId).toBe('org-123'); + expect(state.deciderModels).toEqual([{ id: 'manual-model', reasoningEffort: 'low' }]); + expect(state.autoDeciderModels).toEqual(baseConfig.autoDeciderModels); + expect(state.excludedAutoDeciderModels).toBe('excluded-auto-model'); const result = formStateToConfig(state, baseConfig); expect(result.classifierRepetitions).toBe(3); expect(result.deciderRepetitions).toBe(2); expect(result.classifierMaxP95LatencyMs).toBe(500); + expect(result.autoDeciderMinCostUsd).toBe(12); + expect(result.autoDeciderMaxCostUsd).toBe(24); expect(result.benchmarkOrgId).toBe('org-123'); + expect(result.manualDeciderModels).toEqual([{ id: 'manual-model', reasoningEffort: 'low' }]); + expect(result.excludedAutoDeciderModels).toEqual(['excluded-auto-model']); + expect(result.deciderModels).toEqual([ + { id: 'manual-model', reasoningEffort: 'low' }, + { id: 'auto-model', reasoningEffort: null }, + ]); }); it('converts empty-string classifierMaxP95LatencyMs form value to null in config', () => { @@ -164,3 +191,26 @@ describe('formStateToConfig round-trip', () => { expect(result.classifierMaxP95LatencyMs).toBeNull(); }); }); + +describe('effectiveDeciderModels', () => { + it('combines manual models with non-excluded auto models and lets manual override an auto duplicate', () => { + expect( + effectiveDeciderModels({ + manualDeciderModels: [ + { id: 'manual/model', reasoningEffort: null }, + { id: 'auto/duplicate', reasoningEffort: 'high' }, + ], + autoDeciderModels: [ + { id: 'auto/duplicate', reasoningEffort: null, avgAttemptCostUsd: 20 }, + { id: 'auto/included', reasoningEffort: 'low', avgAttemptCostUsd: 22 }, + { id: 'auto/excluded', reasoningEffort: null, avgAttemptCostUsd: 23 }, + ], + excludedAutoDeciderModels: ['auto/excluded'], + }) + ).toEqual([ + { id: 'manual/model', reasoningEffort: null }, + { id: 'auto/duplicate', reasoningEffort: 'high' }, + { id: 'auto/included', reasoningEffort: 'low' }, + ]); + }); +}); diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx index 94096fbd1e..2e79f3f6a6 100644 --- a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx @@ -1,6 +1,8 @@ 'use client'; import { + AUTO_DECIDER_DEFAULT_MAX_COST_USD, + AUTO_DECIDER_DEFAULT_MIN_COST_USD, BenchmarkConfigResponseSchema, BenchmarkRoutingTableResponseSchema, BenchmarkRunsResponseSchema, @@ -12,6 +14,7 @@ import { type BenchmarkModelSummary, type RankedCandidate, type ReasoningEffort, + type AutoBenchmarkDeciderModel, } from '@kilocode/auto-routing-contracts'; import React, { useCallback, useEffect, useRef, useState } from 'react'; import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; @@ -119,12 +122,16 @@ type DeciderModelRow = { reasoningEffort: ReasoningEffort | null; }; +type AutoDeciderModelRow = AutoBenchmarkDeciderModel; + const DEFAULT_BENCHMARK_USER_ID = 'ce12ef3d-ae95-4d77-b4f0-23735f0a0591'; const DEFAULT_BENCHMARK_ORG_ID = '9d278969-5453-4ae3-a51f-a8d2274a7b56'; export function configToFormState(config: BenchmarkConfig | null): { classifierModels: string; deciderModels: DeciderModelRow[]; + autoDeciderModels: AutoDeciderModelRow[]; + excludedAutoDeciderModels: string; minAccuracy: number; switchCostFactor: number; maxConcurrency: number; @@ -133,6 +140,8 @@ export function configToFormState(config: BenchmarkConfig | null): { classifierRepetitions: number; deciderRepetitions: number; classifierMaxP95LatencyMs: string; + autoDeciderMinCostUsd: number; + autoDeciderMaxCostUsd: number; } { if (config === null) { // No config saved yet: the worker fabricates nothing, so the form starts @@ -140,6 +149,8 @@ export function configToFormState(config: BenchmarkConfig | null): { return { classifierModels: '', deciderModels: [], + autoDeciderModels: [], + excludedAutoDeciderModels: '', minAccuracy: 0.7, switchCostFactor: 3, maxConcurrency: 100, @@ -148,14 +159,18 @@ export function configToFormState(config: BenchmarkConfig | null): { classifierRepetitions: 1, deciderRepetitions: 1, classifierMaxP95LatencyMs: '1000', + autoDeciderMinCostUsd: AUTO_DECIDER_DEFAULT_MIN_COST_USD, + autoDeciderMaxCostUsd: AUTO_DECIDER_DEFAULT_MAX_COST_USD, }; } return { classifierModels: config.classifierModels.join('\n'), - deciderModels: config.deciderModels.map(m => ({ + deciderModels: (config.manualDeciderModels ?? config.deciderModels).map(m => ({ id: m.id, reasoningEffort: m.reasoningEffort ?? null, })), + autoDeciderModels: config.autoDeciderModels ?? [], + excludedAutoDeciderModels: (config.excludedAutoDeciderModels ?? []).join('\n'), minAccuracy: config.minAccuracy, switchCostFactor: config.switchCostFactor, maxConcurrency: config.maxConcurrency, @@ -165,23 +180,61 @@ export function configToFormState(config: BenchmarkConfig | null): { deciderRepetitions: config.deciderRepetitions, classifierMaxP95LatencyMs: config.classifierMaxP95LatencyMs !== null ? String(config.classifierMaxP95LatencyMs) : '', + autoDeciderMinCostUsd: config.autoDeciderMinCostUsd, + autoDeciderMaxCostUsd: config.autoDeciderMaxCostUsd, }; } -export function formStateToConfig( - state: ReturnType, - base: BenchmarkConfig | null -): BenchmarkConfig { - const classifierModels = state.classifierModels +function parseModelLines(value: string): string[] { + return value .split('\n') .map(s => s.trim()) .filter(s => s.length > 0); - const deciderModels = state.deciderModels +} + +export function effectiveDeciderModels({ + manualDeciderModels, + autoDeciderModels, + excludedAutoDeciderModels, +}: { + manualDeciderModels: DeciderModelRow[]; + autoDeciderModels: AutoDeciderModelRow[]; + excludedAutoDeciderModels: string[]; +}): DeciderModelRow[] { + const manual = manualDeciderModels .filter(row => row.id.trim().length > 0) .map(row => ({ id: row.id.trim(), reasoningEffort: row.reasoningEffort ?? null, })); + const manualIds = new Set(manual.map(model => model.id)); + const excludedAuto = new Set(excludedAutoDeciderModels); + return [ + ...manual, + ...autoDeciderModels + .filter(model => !excludedAuto.has(model.id)) + .filter(model => !manualIds.has(model.id)) + .map(model => ({ + id: model.id, + reasoningEffort: model.reasoningEffort ?? null, + })), + ]; +} + +export function formStateToConfig( + state: ReturnType, + base: BenchmarkConfig | null +): BenchmarkConfig { + const classifierModels = parseModelLines(state.classifierModels); + const excludedAutoDeciderModels = parseModelLines(state.excludedAutoDeciderModels); + const manualDeciderModels = state.deciderModels + .filter(row => row.id.trim().length > 0) + .map(row => ({ id: row.id.trim(), reasoningEffort: row.reasoningEffort ?? null })); + const deciderModels = effectiveDeciderModels({ + manualDeciderModels, + autoDeciderModels: state.autoDeciderModels, + excludedAutoDeciderModels, + }); const benchmarkUserId = state.benchmarkUserId.trim(); const benchmarkOrgId = state.benchmarkOrgId.trim(); const rawLatency = state.classifierMaxP95LatencyMs.trim(); @@ -189,6 +242,9 @@ export function formStateToConfig( return { classifierModels, deciderModels, + manualDeciderModels, + autoDeciderModels: state.autoDeciderModels, + excludedAutoDeciderModels, minAccuracy: state.minAccuracy, switchCostFactor: state.switchCostFactor, maxConcurrency: state.maxConcurrency, @@ -197,6 +253,8 @@ export function formStateToConfig( classifierRepetitions: state.classifierRepetitions, deciderRepetitions: state.deciderRepetitions, classifierMaxP95LatencyMs, + autoDeciderMinCostUsd: state.autoDeciderMinCostUsd, + autoDeciderMaxCostUsd: state.autoDeciderMaxCostUsd, updatedAt: base?.updatedAt ?? null, updatedBy: base?.updatedBy ?? null, }; @@ -287,6 +345,24 @@ function BenchmarkConfigEditor({ [updateForm] ); + const handleToggleAutoDeciderModel = useCallback( + (modelId: string, included: boolean) => { + updateForm(prev => { + const excluded = new Set(parseModelLines(prev.excludedAutoDeciderModels)); + if (included) { + excluded.delete(modelId); + } else { + excluded.add(modelId); + } + return { + ...prev, + excludedAutoDeciderModels: [...excluded].sort().join('\n'), + }; + }); + }, + [updateForm] + ); + const handleSave = useCallback(() => { saveMutation.mutate(formStateToConfig(form, config)); }, [form, config, saveMutation]); @@ -312,9 +388,9 @@ function BenchmarkConfigEditor({ /> - {/* Decider models table */} + {/* Manual decider models table */}
- +
@@ -389,6 +465,59 @@ function BenchmarkConfigEditor({ + {/* Auto decider models */} +
+
+ + {form.autoDeciderModels.length} synced +
+ {form.autoDeciderModels.length > 0 ? ( +
+
+ + + Model ID + Avg run + Reasoning effort + Included + + + + {form.autoDeciderModels.map(model => { + const excluded = parseModelLines(form.excludedAutoDeciderModels).includes( + model.id + ); + return ( + + {model.id} + + {formatUsd(model.avgAttemptCostUsd)} + + + {model.reasoningEffort ?? 'default'} + + + + handleToggleAutoDeciderModel(model.id, checked === true) + } + aria-label={`${excluded ? 'Include' : 'Exclude'} ${model.id}`} + /> + + + ); + })} + +
+
+ ) : ( +
+ No auto decider models synced yet. +
+ )} +
+ {/* Numeric inputs */}
@@ -442,6 +571,44 @@ function BenchmarkConfigEditor({ className="h-8 w-40 tabular-nums" />
+
+ + + updateForm(prev => ({ + ...prev, + autoDeciderMinCostUsd: parseFloat(e.target.value) || 0, + })) + } + className="h-8 w-40 tabular-nums" + /> +
+
+ + + updateForm(prev => ({ + ...prev, + autoDeciderMaxCostUsd: parseFloat(e.target.value) || 0, + })) + } + className="h-8 w-40 tabular-nums" + /> +