`),
}),
[spec.yAxisLabel],
);
@@ -164,7 +171,7 @@ function ScatterChart({
content: (d) => {
const hwKey = d.hwKey ?? '';
const color = colorMap[hwKey] ?? '#888';
- return `
+ return sanitize(`
${hwKey}
@@ -175,7 +182,7 @@ function ScatterChart({
${spec.yAxisLabel}: ${d.y.toLocaleString(undefined, { maximumFractionDigits: 2 })}
-
`;
+
`);
},
}),
[colorMap, spec.yAxisLabel],
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index e971ee0..4c15a14 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -102,6 +102,9 @@ importers:
d3:
specifier: ^7.9.0
version: 7.9.0
+ dompurify:
+ specifier: ^3.3.3
+ version: 3.3.3
gray-matter:
specifier: ^4.0.3
version: 4.0.3
From bab5d4108a8c26ec31ca7a2b55125830e99e9b94 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Mon, 30 Mar 2026 22:01:45 -0500
Subject: [PATCH 14/14] feat: validate LLM specs, multi-chart support, chart
quality improvements
- Validate all LLM output fields against enum whitelists before use
- Sanitize error messages to prevent API key leaks
- Support up to 2 charts for cross-model comparison queries
- Smarter chart type selection guidance in system prompt
- Stricter DOMPurify config (whitelist tags/attrs)
- Dynamic bar chart height based on data count
- Scatter chart: zoom scaleExtent, grabCursor, instructions
- Bar chart: instructions overlay
---
.../components/ai-chart/AiChartDisplay.tsx | 10 +-
.../src/components/ai-chart/AiChartResult.tsx | 58 ++--
.../components/ai-chart/example-prompts.ts | 4 +-
.../components/ai-chart/prompt-templates.ts | 52 +++-
packages/app/src/components/ai-chart/types.ts | 71 +++++
packages/app/src/hooks/api/use-ai-chart.ts | 280 +++++++++---------
packages/app/src/lib/ai-providers.ts | 7 +-
7 files changed, 285 insertions(+), 197 deletions(-)
diff --git a/packages/app/src/components/ai-chart/AiChartDisplay.tsx b/packages/app/src/components/ai-chart/AiChartDisplay.tsx
index 33af57c..c1d30c1 100644
--- a/packages/app/src/components/ai-chart/AiChartDisplay.tsx
+++ b/packages/app/src/components/ai-chart/AiChartDisplay.tsx
@@ -166,15 +166,7 @@ export default function AiChartDisplay() {
)}
{/* Result */}
- {result && (
-
- )}
+ {result &&
}
{/* Example prompts (shown when no result) */}
{!result && !isLoading && !error && (
diff --git a/packages/app/src/components/ai-chart/AiChartResult.tsx b/packages/app/src/components/ai-chart/AiChartResult.tsx
index e31e812..87e2639 100644
--- a/packages/app/src/components/ai-chart/AiChartResult.tsx
+++ b/packages/app/src/components/ai-chart/AiChartResult.tsx
@@ -16,17 +16,20 @@ import type {
import DOMPurify from 'dompurify';
import type { AiChartBarPoint, AiChartSpec } from './types';
+import type { AiSingleChartResult } from '@/hooks/api/use-ai-chart';
/** Sanitize tooltip HTML that may contain LLM-generated strings. */
function sanitize(html: string): string {
- return DOMPurify.sanitize(html);
+ return DOMPurify.sanitize(html, {
+ ALLOWED_TAGS: ['div', 'span', 'strong', 'br'],
+ ALLOWED_ATTR: ['style'],
+ });
}
+const CHART_INSTRUCTIONS = 'Hover for details';
+
interface AiChartResultProps {
- spec: AiChartSpec;
- barData: AiChartBarPoint[];
- scatterData: InferenceData[];
- colorMap: Record
;
+ charts: AiSingleChartResult[];
summary: string | null;
}
@@ -109,7 +112,7 @@ function BarChart({ data, spec }: { data: AiChartBarPoint[]; spec: AiChartSpec }
);
}
@@ -200,34 +204,32 @@ function ScatterChart({
layers={layers}
tooltip={tooltip}
watermark="logo"
- zoom={{ enabled: true, axes: 'both' }}
+ grabCursor
+ instructions={`${CHART_INSTRUCTIONS} • Scroll to zoom • Drag to pan`}
+ zoom={{ enabled: true, axes: 'both', scaleExtent: [0.7, 20] }}
/>
);
}
-export default function AiChartResult({
- spec,
- barData,
- scatterData,
- colorMap,
- summary,
-}: AiChartResultProps) {
+export default function AiChartResult({ charts, summary }: AiChartResultProps) {
return (
-
-
- {spec.title}
- {spec.description}
-
-
- {spec.chartType === 'bar' && barData.length > 0 && (
-
- )}
- {spec.chartType === 'scatter' && scatterData.length > 0 && (
-
- )}
-
-
+ {charts.map((chart, i) => (
+
+
+ {chart.spec.title}
+ {chart.spec.description}
+
+
+ {chart.spec.chartType === 'bar' && chart.barData.length > 0 && (
+
+ )}
+ {chart.spec.chartType === 'scatter' && chart.scatterData.length > 0 && (
+
+ )}
+
+
+ ))}
{summary && (
diff --git a/packages/app/src/components/ai-chart/example-prompts.ts b/packages/app/src/components/ai-chart/example-prompts.ts
index 5fc88cf..ece8c93 100644
--- a/packages/app/src/components/ai-chart/example-prompts.ts
+++ b/packages/app/src/components/ai-chart/example-prompts.ts
@@ -1,8 +1,8 @@
export const EXAMPLE_PROMPTS = [
'Compare throughput per GPU across all GPUs for DeepSeek R1 at 8k/1k',
'Bar chart: H100 vs B200 vs GB200 cost per million tokens (hyperscaler) for DeepSeek R1',
- 'Show a scatter plot of all GPU configs for DeepSeek R1 at 8k/1k with throughput per GPU',
+ 'Compare Kimi K2.5 vs DeepSeek R1 throughput per GPU at 8k/1k',
'Which GPU has the best GSM8K accuracy score for DeepSeek R1?',
'Compare reliability/success rate across all GPUs',
- 'Bar chart of energy per output token across all GPUs for gpt-oss at 8k/1k',
+ 'Show a scatter plot of all GPU configs for DeepSeek R1 at 8k/1k with throughput per GPU',
];
diff --git a/packages/app/src/components/ai-chart/prompt-templates.ts b/packages/app/src/components/ai-chart/prompt-templates.ts
index b8ae3b7..23e7c19 100644
--- a/packages/app/src/components/ai-chart/prompt-templates.ts
+++ b/packages/app/src/components/ai-chart/prompt-templates.ts
@@ -1,5 +1,5 @@
/**
- * System prompt for the LLM that parses user natural language into an AiChartSpec.
+ * System prompt for the LLM that parses user natural language into AiChartSpec(s).
* Kept compact to minimize token cost.
*/
export function buildParsePrompt(): string {
@@ -39,23 +39,38 @@ Y-axis metrics for evaluations:
Y-axis metrics for reliability:
- reliability_rate → Success Rate (%)
-## Rules
+## Chart type selection rules
+
+Choose the chart type based on the user's intent:
+- **"bar"**: Use for comparing a single metric across GPUs/configs at a fixed operating point. Best for "compare X vs Y", "which GPU is best for...", "rank by...", direct comparisons. This is the DEFAULT for most queries.
+- **"scatter"**: Use ONLY when the user explicitly wants to see the full performance curve (all data points), trade-off relationships, or Pareto frontiers. Keywords: "scatter", "plot all points", "performance curve", "trade-off", "pareto".
+
+When in doubt, prefer "bar" — it produces cleaner, more readable charts.
+
+## Multi-chart comparisons
+
+If the user asks to compare two DIFFERENT models or two fundamentally different configurations side-by-side (e.g., "compare Kimi K2.5 vs DeepSeek R1" or "compare 1k/1k vs 8k/1k"), return an ARRAY of 2 chart specs — one for each. Each spec should have its own title clearly identifying what it shows.
+
+If the user is comparing GPUs/hardware within a single model (e.g., "H100 vs B200 for DeepSeek R1"), that's a single chart with multiple hardware keys — do NOT split into two charts.
+
+## General rules
1. Map user intent to the closest available values. Be flexible with naming (e.g., "H100" → "h100", "deepseek r1" → "DeepSeek-R1-0528").
2. Pick the correct dataSource based on what the user is asking about (performance → benchmarks, accuracy → evaluations, uptime/success → reliability, trends over time → history).
3. hardwareKeys: list of GPU base names to compare. Empty array [] means "all GPUs".
4. precisions: list of precisions. Empty array [] means "all precisions".
-5. chartType: "bar" for comparing specific values across GPUs/configs, "scatter" for plotting all data points.
-6. targetInteractivity: for benchmark bar charts, the interactivity level (tok/s/user) to read from. Default 40.
-7. If the user doesn't specify a model, default to "DeepSeek-R1-0528".
-8. If the user doesn't specify a sequence, default to "8k/1k".
-9. title: a short chart title describing the comparison.
-10. description: a one-sentence description of what the chart shows.
-11. For evaluations: yAxisMetric should be "eval_score". For reliability: yAxisMetric should be "reliability_rate".
+5. targetInteractivity: for benchmark bar charts, the interactivity level (tok/s/user) to read from. Default 40.
+6. If the user doesn't specify a model, default to "DeepSeek-R1-0528".
+7. If the user doesn't specify a sequence, default to "8k/1k".
+8. title: a short chart title describing the comparison.
+9. description: a one-sentence description of what the chart shows.
+10. For evaluations: yAxisMetric should be "eval_score". For reliability: yAxisMetric should be "reliability_rate".
## Output format
-Return ONLY valid JSON matching this schema (no markdown, no preamble):
+Return ONLY valid JSON (no markdown, no preamble).
+
+For a single chart, return one object:
{
"chartType": "bar" | "scatter",
"dataSource": "benchmarks" | "evaluations" | "reliability" | "history",
@@ -68,18 +83,25 @@ Return ONLY valid JSON matching this schema (no markdown, no preamble):
"targetInteractivity": number,
"title": "string",
"description": "string"
-}`;
+}
+
+For a comparison of two different models/configs, return an array of 2 objects:
+[{ ... }, { ... }]`;
}
export function buildSummaryPrompt(
- spec: { title: string; yAxisLabel: string; model: string; sequence: string },
+ specs: { title: string; yAxisLabel: string; model: string; sequence: string }[],
dataDescription: string,
): string {
+ const specSummary = specs
+ .map(
+ (s) => `Chart: ${s.title} | Metric: ${s.yAxisLabel} | Model: ${s.model}, Seq: ${s.sequence}`,
+ )
+ .join('\n');
+
return `You are an expert performance analyst. Based on the following benchmark data, provide a concise 2-3 sentence summary highlighting the key takeaway.
-Chart: ${spec.title}
-Metric: ${spec.yAxisLabel}
-Model: ${spec.model}, Sequence: ${spec.sequence}
+${specSummary}
Data:
${dataDescription}
diff --git a/packages/app/src/components/ai-chart/types.ts b/packages/app/src/components/ai-chart/types.ts
index 65c2d7c..7684346 100644
--- a/packages/app/src/components/ai-chart/types.ts
+++ b/packages/app/src/components/ai-chart/types.ts
@@ -1,3 +1,7 @@
+import { Model, Sequence, Precision } from '@/lib/data-mappings';
+import { Y_AXIS_METRICS } from '@/lib/chart-utils';
+import { MODEL_ORDER } from '@/lib/constants';
+
export type AiProvider = 'openai' | 'anthropic' | 'xai' | 'google';
export type AiChartType = 'bar' | 'scatter';
@@ -18,9 +22,76 @@ export interface AiChartSpec {
description: string;
}
+/** The LLM may return an array of up to 2 specs for comparison queries. */
+export type AiLlmResponse = AiChartSpec | AiChartSpec[];
+
export interface AiChartBarPoint {
hwKey: string;
label: string;
value: number;
color: string;
}
+
+// ---------------------------------------------------------------------------
+// Validation whitelists
+// ---------------------------------------------------------------------------
+
+const VALID_CHART_TYPES = new Set(['bar', 'scatter']);
+const VALID_DATA_SOURCES = new Set(['benchmarks', 'evaluations', 'reliability', 'history']);
+const VALID_MODELS = new Set(Object.values(Model));
+const VALID_SEQUENCES = new Set(Object.values(Sequence));
+const VALID_PRECISIONS = new Set(Object.values(Precision));
+const VALID_GPU_BASES = new Set(MODEL_ORDER);
+const VALID_Y_METRICS = new Set([...Y_AXIS_METRICS, 'eval_score', 'reliability_rate']);
+
+/** Validate and clamp an LLM-generated spec to known values. Invalid fields are replaced with safe defaults rather than throwing. */
+export function validateSpec(raw: Record<string, unknown>): AiChartSpec {
+ const chartType = VALID_CHART_TYPES.has(raw.chartType as string)
+ ? (raw.chartType as AiChartType)
+ : 'bar';
+
+ const dataSource = VALID_DATA_SOURCES.has(raw.dataSource as string)
+ ? (raw.dataSource as AiDataSource)
+ : 'benchmarks';
+
+ const model = VALID_MODELS.has(raw.model as string) ? (raw.model as string) : Model.DeepSeek_R1;
+
+ const sequence = VALID_SEQUENCES.has(raw.sequence as string)
+ ? (raw.sequence as string)
+ : Sequence.EightK_OneK;
+
+ const rawPrecisions = Array.isArray(raw.precisions) ? (raw.precisions as string[]) : [];
+ const precisions = rawPrecisions
+ .filter((p) => VALID_PRECISIONS.has(p.toLowerCase()))
+ .map((p) => p.toLowerCase());
+
+ const rawHwKeys = Array.isArray(raw.hardwareKeys) ? (raw.hardwareKeys as string[]) : [];
+ const hardwareKeys = rawHwKeys
+ .filter((k) => VALID_GPU_BASES.has(k.toLowerCase()))
+ .map((k) => k.toLowerCase());
+
+ const yAxisMetric = VALID_Y_METRICS.has(raw.yAxisMetric as string)
+ ? (raw.yAxisMetric as string)
+ : 'y_tpPerGpu';
+
+ const targetInteractivity =
+ typeof raw.targetInteractivity === 'number' &&
+ raw.targetInteractivity > 0 &&
+ raw.targetInteractivity < 1000
+ ? raw.targetInteractivity
+ : 40;
+
+ return {
+ chartType,
+ dataSource,
+ model,
+ sequence,
+ precisions,
+ hardwareKeys,
+ yAxisMetric,
+ yAxisLabel: typeof raw.yAxisLabel === 'string' ? raw.yAxisLabel.slice(0, 100) : yAxisMetric,
+ targetInteractivity,
+ title: typeof raw.title === 'string' ? raw.title.slice(0, 200) : 'AI Generated Chart',
+ description: typeof raw.description === 'string' ? raw.description.slice(0, 500) : '',
+ };
+}
diff --git a/packages/app/src/hooks/api/use-ai-chart.ts b/packages/app/src/hooks/api/use-ai-chart.ts
index 3049889..3e66854 100644
--- a/packages/app/src/hooks/api/use-ai-chart.ts
+++ b/packages/app/src/hooks/api/use-ai-chart.ts
@@ -3,6 +3,7 @@
import { useCallback, useState } from 'react';
import type { AiChartBarPoint, AiChartSpec, AiProvider } from '@/components/ai-chart/types';
+import { validateSpec } from '@/components/ai-chart/types';
import { buildParsePrompt, buildSummaryPrompt } from '@/components/ai-chart/prompt-templates';
import type { InferenceData } from '@/components/inference/types';
import { callLlm } from '@/lib/ai-providers';
@@ -20,11 +21,19 @@ import { getHardwareConfig, getModelSortIndex } from '@/lib/constants';
import chartDefinitions from '@/components/inference/inference-chart-config.json';
-interface AiChartResult {
+// ---------------------------------------------------------------------------
+// Result types
+// ---------------------------------------------------------------------------
+
+export interface AiSingleChartResult {
spec: AiChartSpec;
barData: AiChartBarPoint[];
scatterData: InferenceData[];
colorMap: Record<string, string>;
+}
+
+export interface AiChartResult {
+ charts: AiSingleChartResult[];
summary: string | null;
}
@@ -36,12 +45,19 @@ interface UseAiChartReturn {
reset: () => void;
}
-function parseSpecFromLlm(raw: string): AiChartSpec {
+// ---------------------------------------------------------------------------
+// LLM response parsing
+// ---------------------------------------------------------------------------
+
+function parseSpecsFromLlm(raw: string): AiChartSpec[] {
const cleaned = raw
.replace(/```json\s*/g, '')
.replace(/```/g, '')
.trim();
- return JSON.parse(cleaned);
+ const parsed = JSON.parse(cleaned);
+ const arr = Array.isArray(parsed) ? parsed : [parsed];
+ // Validate each spec and limit to 2
+ return arr.slice(0, 2).map((s: unknown) => validateSpec(s as Record<string, unknown>));
}
// ---------------------------------------------------------------------------
@@ -107,7 +123,6 @@ function buildEvalBarData(
spec: AiChartSpec,
colorMap: Record<string, string>,
): AiChartBarPoint[] {
- // Filter by model, hardware, precision
let filtered = rows.filter((r) => r.model === spec.model || spec.model === '');
if (spec.hardwareKeys.length > 0) {
const allowed = new Set(spec.hardwareKeys);
@@ -121,7 +136,6 @@ function buildEvalBarData(
filtered = filtered.filter((r) => allowed.has(r.precision.toLowerCase()));
}
- // Group by hardware key, take latest date per group, extract score
const groups = new Map();
for (const row of filtered) {
const hwKey = normalizeEvalHardwareKey(row.hardware, row.framework, row.spec_method);
@@ -133,7 +147,6 @@ function buildEvalBarData(
const bars: AiChartBarPoint[] = [];
for (const [hwKey, row] of groups) {
- // GSM8K score is typically in metrics as "gsm8k" or first metric value
const score = row.metrics.gsm8k ?? row.metrics.accuracy ?? Object.values(row.metrics)[0] ?? 0;
if (score <= 0) continue;
@@ -159,7 +172,6 @@ function buildReliabilityBarData(
spec: AiChartSpec,
colorMap: Record<string, string>,
): AiChartBarPoint[] {
- // Filter by hardware
let filtered = rows;
if (spec.hardwareKeys.length > 0) {
const allowed = new Set(spec.hardwareKeys);
@@ -169,7 +181,6 @@ function buildReliabilityBarData(
});
}
- // Aggregate across dates: total successes / total attempts per hardware
const agg = new Map();
for (const row of filtered) {
const hw = row.hardware;
@@ -196,6 +207,87 @@ function buildReliabilityBarData(
return bars;
}
+// ---------------------------------------------------------------------------
+// Resolve a single spec into chart data
+// ---------------------------------------------------------------------------
+
+async function resolveSpec(spec: AiChartSpec): Promise<AiSingleChartResult> {
+ if (spec.dataSource === 'evaluations') {
+ const rows = await fetchEvaluations();
+ const hwKeys = [
+ ...new Set(rows.map((r) => normalizeEvalHardwareKey(r.hardware, r.framework, r.spec_method))),
+ ];
+ const colorMap = generateHighContrastColors(hwKeys, 'dark');
+ const barData = buildEvalBarData(rows, spec, colorMap);
+ // Re-color with final keys
+ const finalKeys = barData.map((b) => b.hwKey);
+ const finalColors = generateHighContrastColors(finalKeys, 'dark');
+ return {
+ spec,
+ barData: barData.map((b) => ({ ...b, color: finalColors[b.hwKey] ?? b.color })),
+ scatterData: [],
+ colorMap: finalColors,
+ };
+ }
+
+ if (spec.dataSource === 'reliability') {
+ const rows = await fetchReliability();
+ const hwKeys = [...new Set(rows.map((r) => r.hardware))];
+ const colorMap = generateHighContrastColors(hwKeys, 'dark');
+ const barData = buildReliabilityBarData(rows, spec, colorMap);
+ const finalKeys = barData.map((b) => b.hwKey);
+ const finalColors = generateHighContrastColors(finalKeys, 'dark');
+ return {
+ spec,
+ barData: barData.map((b) => ({ ...b, color: finalColors[b.hwKey] ?? b.color })),
+ scatterData: [],
+ colorMap: finalColors,
+ };
+ }
+
+ // Benchmarks or History
+ const { isl, osl } = sequenceToIslOsl(spec.sequence);
+ const rows =
+ spec.dataSource === 'history'
+ ? await fetchBenchmarkHistory(spec.model, isl, osl)
+ : await fetchBenchmarks(spec.model);
+
+ const { chartData } = transformBenchmarkRows(rows);
+ let points = chartData[0] ?? [];
+
+ if (spec.hardwareKeys.length > 0) {
+ const allowedGpus = new Set(spec.hardwareKeys);
+ points = points.filter((p) => {
+ const hwKey = p.hwKey ?? '';
+ return allowedGpus.has(hwKey) || [...allowedGpus].some((g) => hwKey.startsWith(g));
+ });
+ }
+ if (spec.precisions.length > 0) {
+ const allowedPrec = new Set(spec.precisions.map((p) => p.toLowerCase()));
+ points = points.filter((p) => p.precision && allowedPrec.has(p.precision.toLowerCase()));
+ }
+
+ if (spec.dataSource !== 'history') {
+ points = points.filter((p) => {
+ const entry = p as any;
+ if (entry.isl != null && entry.osl != null) {
+ return entry.isl === isl && entry.osl === osl;
+ }
+ return true;
+ });
+ }
+
+ const hwKeys = [...new Set(points.map((p) => p.hwKey ?? '').filter(Boolean))];
+ const colorMap = generateHighContrastColors(hwKeys, 'dark');
+
+ return {
+ spec,
+ barData: spec.chartType === 'bar' ? buildBenchmarkBarData(points, spec, colorMap) : [],
+ scatterData: spec.chartType === 'scatter' ? points : [],
+ colorMap,
+ };
+}
+
// ---------------------------------------------------------------------------
// Main hook
// ---------------------------------------------------------------------------
@@ -211,111 +303,54 @@ export function useAiChart(): UseAiChartReturn {
setResult(null);
try {
- // Step 1: Parse prompt into spec
- const rawSpec = await callLlm(provider, apiKey, buildParsePrompt(), prompt);
- const spec = parseSpecFromLlm(rawSpec);
- // Default dataSource for backwards compat
- if (!spec.dataSource) spec.dataSource = 'benchmarks';
-
- let barData: AiChartBarPoint[] = [];
- let scatterData: InferenceData[] = [];
- let hwKeys: string[] = [];
-
- if (spec.dataSource === 'evaluations') {
- // ---- Evaluations ----
- const rows = await fetchEvaluations();
- hwKeys = [
- ...new Set(
- rows.map((r) => normalizeEvalHardwareKey(r.hardware, r.framework, r.spec_method)),
- ),
- ];
- const colorMap = generateHighContrastColors(hwKeys, 'dark');
- barData = buildEvalBarData(rows, spec, colorMap);
-
- if (barData.length === 0) {
- setError('No evaluation data found for the requested configuration.');
- setIsLoading(false);
- return;
- }
-
- hwKeys = barData.map((b) => b.hwKey);
- const finalColorMap = generateHighContrastColors(hwKeys, 'dark');
- barData = barData.map((b) => ({ ...b, color: finalColorMap[b.hwKey] ?? b.color }));
-
- await generateSummary(provider, apiKey, spec, barData, finalColorMap, setResult);
- return;
- }
+ // Step 1: Parse prompt into validated spec(s)
+ const rawResponse = await callLlm(provider, apiKey, buildParsePrompt(), prompt);
+ const specs = parseSpecsFromLlm(rawResponse);
- if (spec.dataSource === 'reliability') {
- // ---- Reliability ----
- const rows = await fetchReliability();
- hwKeys = [...new Set(rows.map((r) => r.hardware))];
- const colorMap = generateHighContrastColors(hwKeys, 'dark');
- barData = buildReliabilityBarData(rows, spec, colorMap);
-
- if (barData.length === 0) {
- setError('No reliability data found for the requested configuration.');
- setIsLoading(false);
- return;
- }
-
- hwKeys = barData.map((b) => b.hwKey);
- const finalColorMap = generateHighContrastColors(hwKeys, 'dark');
- barData = barData.map((b) => ({ ...b, color: finalColorMap[b.hwKey] ?? b.color }));
-
- await generateSummary(provider, apiKey, spec, barData, finalColorMap, setResult);
+ if (specs.length === 0) {
+ setError('Could not parse your request. Try rephrasing.');
+ setIsLoading(false);
return;
}
- // ---- Benchmarks (default) & History ----
- const { isl, osl } = sequenceToIslOsl(spec.sequence);
- const rows =
- spec.dataSource === 'history'
- ? await fetchBenchmarkHistory(spec.model, isl, osl)
- : await fetchBenchmarks(spec.model);
-
- const { chartData } = transformBenchmarkRows(rows);
- let points = chartData[0] ?? [];
-
- // Filter by spec
- if (spec.hardwareKeys.length > 0) {
- const allowedGpus = new Set(spec.hardwareKeys);
- points = points.filter((p) => {
- const hwKey = p.hwKey ?? '';
- return allowedGpus.has(hwKey) || [...allowedGpus].some((g) => hwKey.startsWith(g));
- });
- }
- if (spec.precisions.length > 0) {
- const allowedPrec = new Set(spec.precisions.map((p) => p.toLowerCase()));
- points = points.filter((p) => p.precision && allowedPrec.has(p.precision.toLowerCase()));
- }
-
- // Filter by sequence (for non-history, where all sequences may be returned)
- if (spec.dataSource !== 'history') {
- points = points.filter((p) => {
- const entry = p as any;
- if (entry.isl != null && entry.osl != null) {
- return entry.isl === isl && entry.osl === osl;
- }
- return true;
- });
- }
+ // Step 2: Resolve each spec into chart data (parallel for multi-chart)
+ const charts = await Promise.all(specs.map(resolveSpec));
- if (points.length === 0) {
- setError(
- `No data found for ${spec.model} (${spec.sequence}). Try a different model or configuration.`,
- );
+ // Check if any chart has data
+ const hasData = charts.some((c) => c.barData.length > 0 || c.scatterData.length > 0);
+ if (!hasData) {
+ const models = [...new Set(specs.map((s) => s.model))].join(', ');
+ setError(`No data found for ${models}. Try a different model or configuration.`);
setIsLoading(false);
return;
}
- hwKeys = [...new Set(points.map((p) => p.hwKey ?? '').filter(Boolean))];
- const colorMap = generateHighContrastColors(hwKeys, 'dark');
-
- barData = spec.chartType === 'bar' ? buildBenchmarkBarData(points, spec, colorMap) : [];
- scatterData = spec.chartType === 'scatter' ? points : [];
+ // Step 3: Generate summary (best-effort)
+ let summary: string | null = null;
+ try {
+ const allBars = charts.flatMap((c) => c.barData);
+ const allScatter = charts.flatMap((c) => c.scatterData);
+ const hwKeys = [
+ ...new Set([...allBars.map((b) => b.hwKey), ...allScatter.map((p) => p.hwKey ?? '')]),
+ ].filter(Boolean);
+
+ const dataDesc =
+ allBars.length > 0
+ ? allBars.map((b) => `${b.label}: ${b.value.toFixed(2)}`).join('\n')
+ : `${allScatter.length} data points across ${hwKeys.length} hardware configs`;
+
+ const summaryRaw = await callLlm(
+ provider,
+ apiKey,
+ buildSummaryPrompt(specs, dataDesc),
+ 'Provide the summary.',
+ );
+ summary = summaryRaw.trim();
+ } catch {
+ // Summary generation is non-critical
+ }
- await generateSummary(provider, apiKey, spec, barData, colorMap, setResult, scatterData);
+ setResult({ charts, summary });
} catch (err) {
setError(err instanceof Error ? err.message : 'An unexpected error occurred.');
} finally {
@@ -330,42 +365,3 @@ export function useAiChart(): UseAiChartReturn {
return { result, isLoading, error, generate, reset };
}
-
-async function generateSummary(
- provider: AiProvider,
- apiKey: string,
- spec: AiChartSpec,
- barData: AiChartBarPoint[],
- colorMap: Record,
- setResult: (r: AiChartResult) => void,
- scatterData: InferenceData[] = [],
-) {
- let summary: string | null = null;
- try {
- const hwKeys = [
- ...new Set([...barData.map((b) => b.hwKey), ...scatterData.map((p) => p.hwKey ?? '')]),
- ].filter(Boolean);
- const dataDesc =
- barData.length > 0
- ? barData.map((b) => `${b.label}: ${b.value.toFixed(2)}`).join('\n')
- : `${scatterData.length} data points across ${hwKeys.length} hardware configs`;
-
- const summaryRaw = await callLlm(
- provider,
- apiKey,
- buildSummaryPrompt(spec, dataDesc),
- 'Provide the summary.',
- );
- summary = summaryRaw.trim();
- } catch {
- // Summary generation is non-critical
- }
-
- setResult({
- spec,
- barData,
- scatterData,
- colorMap,
- summary,
- });
-}
diff --git a/packages/app/src/lib/ai-providers.ts b/packages/app/src/lib/ai-providers.ts
index 1c0de60..335b895 100644
--- a/packages/app/src/lib/ai-providers.ts
+++ b/packages/app/src/lib/ai-providers.ts
@@ -132,8 +132,13 @@ export async function callLlm(
const json = await res.json();
if (!res.ok) {
- const msg =
+ const raw =
json?.error?.message ?? json?.error?.type ?? `${provider} request failed (${res.status})`;
+ // Strip anything that looks like an API key to prevent accidental leaks in UI
+ const msg = String(raw)
+ .replace(/sk-[a-zA-Z0-9_-]{10,}/g, '[REDACTED]')
+ .replace(/key-[a-zA-Z0-9_-]{10,}/g, '[REDACTED]')
+ .replace(/Bearer\s+\S+/gi, 'Bearer [REDACTED]');
throw new Error(msg);
}