From fbc1be151c9db0ff253cea5e9a63901420187a23 Mon Sep 17 00:00:00 2001 From: Julian Benegas Date: Sat, 7 Feb 2026 22:12:08 +0000 Subject: [PATCH 1/2] feat: implement prompt caching for Anthropic and OpenAI models Add prompt caching to reduce token costs and improve latency for LLM conversations, following patterns from vercel/ai SDK cookbook and anomalyco/opencode. ## Anthropic (explicit cache control) - Adds cacheControl breakpoints on system messages and the last 2 conversation messages (max 4 breakpoints per Anthropic's limit) - Cached tokens cost 10% of input tokens (with 25% write premium) - Applied via providerOptions on messages, automatically translated by the AI SDK to block-level cache_control ## OpenAI (routing optimization) - Sets promptCacheKey per post to improve cache hit routing - OpenAI automatically caches prompts >= 1024 tokens; the key helps route requests to the same server for better cache hits - No cost premium for cache writes ## Implementation - New utility: agent/prompt-cache.ts with provider detection and caching strategies - Updated response-agent.ts streamTextStep to apply caching - Comprehensive test suite (18 tests) covering all edge cases - Non-Anthropic/OpenAI providers pass through unchanged (safe for provider-agnostic code) --- apps/web/agent/__tests__/prompt-cache.test.ts | 313 ++++++++++++++++++ apps/web/agent/prompt-cache.ts | 156 +++++++++ apps/web/agent/response-agent.ts | 11 +- 3 files changed, 479 insertions(+), 1 deletion(-) create mode 100644 apps/web/agent/__tests__/prompt-cache.test.ts create mode 100644 apps/web/agent/prompt-cache.ts diff --git a/apps/web/agent/__tests__/prompt-cache.test.ts b/apps/web/agent/__tests__/prompt-cache.test.ts new file mode 100644 index 0000000..8e54f3e --- /dev/null +++ b/apps/web/agent/__tests__/prompt-cache.test.ts @@ -0,0 +1,313 @@ +import { describe, expect, test } from "bun:test" +import type { ModelMessage } from "ai" +import { + addCacheControlToMessages, + getCacheProviderOptions, 
+} from "../prompt-cache" + +// ─── Helpers ───────────────────────────────────────────────────────── + +function systemMsg(text: string): ModelMessage { + return { role: "system", content: text } +} + +function userMsg(text: string): ModelMessage { + return { + role: "user", + content: [{ type: "text", text }], + } +} + +function assistantMsg(text: string): ModelMessage { + return { + role: "assistant", + content: [{ type: "text", text }], + } +} + +function toolMsg(): ModelMessage { + return { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "call_1", + toolName: "Read", + output: { type: "text", value: "file contents" }, + }, + ], + } +} + +const EPHEMERAL = { anthropic: { cacheControl: { type: "ephemeral" } } } + +// ─── addCacheControlToMessages ─────────────────────────────────────── + +describe("addCacheControlToMessages", () => { + describe("Anthropic models", () => { + const anthropicModels = [ + "anthropic/claude-sonnet-4.5", + "anthropic/claude-haiku-4.5", + "anthropic/claude-opus-4", + ] + + for (const model of anthropicModels) { + test(`adds cache control for ${model}`, () => { + const messages: ModelMessage[] = [ + systemMsg("You are a helpful assistant."), + userMsg("Hello"), + assistantMsg("Hi there!"), + userMsg("Tell me about caching"), + ] + + const result = addCacheControlToMessages({ messages, model }) + + // System message (index 0) should have cache control + expect(result[0].providerOptions).toEqual(EPHEMERAL) + + // First user message (index 1) should NOT have cache control + // (only last 2 non-system messages get it) + expect(result[1].providerOptions).toBeUndefined() + + // Last 2 non-system messages should have cache control + expect(result[2].providerOptions).toEqual(EPHEMERAL) + expect(result[3].providerOptions).toEqual(EPHEMERAL) + }) + } + + test("handles empty messages array", () => { + const result = addCacheControlToMessages({ + messages: [], + model: "anthropic/claude-sonnet-4.5", + }) + 
expect(result).toEqual([]) + }) + + test("handles single system message", () => { + const messages: ModelMessage[] = [ + systemMsg("System prompt"), + ] + + const result = addCacheControlToMessages({ + messages, + model: "anthropic/claude-sonnet-4.5", + }) + + expect(result[0].providerOptions).toEqual(EPHEMERAL) + }) + + test("handles 2 system messages", () => { + const messages: ModelMessage[] = [ + systemMsg("System part 1"), + systemMsg("System part 2"), + userMsg("Hello"), + ] + + const result = addCacheControlToMessages({ + messages, + model: "anthropic/claude-sonnet-4.5", + }) + + // Both system messages should be cached + expect(result[0].providerOptions).toEqual(EPHEMERAL) + expect(result[1].providerOptions).toEqual(EPHEMERAL) + // User message is the only non-system message, so it's one of last 2 + expect(result[2].providerOptions).toEqual(EPHEMERAL) + }) + + test("respects max 4 breakpoints with long conversations", () => { + const messages: ModelMessage[] = [ + systemMsg("System prompt"), + systemMsg("Additional system context"), + userMsg("First question"), + assistantMsg("First answer"), + userMsg("Second question"), + assistantMsg("Second answer"), + userMsg("Third question"), + assistantMsg("Third answer"), + userMsg("Fourth question"), + ] + + const result = addCacheControlToMessages({ + messages, + model: "anthropic/claude-sonnet-4.5", + }) + + // Count cache breakpoints + let breakpoints = 0 + for (const msg of result) { + if (msg.providerOptions?.anthropic) breakpoints++ + } + + // Should be exactly 4: 2 system + 2 last non-system + expect(breakpoints).toBe(4) + + // Verify correct placement: system messages + expect(result[0].providerOptions).toEqual(EPHEMERAL) + expect(result[1].providerOptions).toEqual(EPHEMERAL) + + // Middle messages should NOT have cache control + expect(result[2].providerOptions).toBeUndefined() + expect(result[3].providerOptions).toBeUndefined() + expect(result[4].providerOptions).toBeUndefined() + 
expect(result[5].providerOptions).toBeUndefined() + expect(result[6].providerOptions).toBeUndefined() + + // Last 2 non-system messages + expect(result[7].providerOptions).toEqual(EPHEMERAL) + expect(result[8].providerOptions).toEqual(EPHEMERAL) + }) + + test("preserves existing providerOptions on messages", () => { + const messages: ModelMessage[] = [ + { + role: "system", + content: "System prompt", + providerOptions: { someOther: { key: "value" } }, + }, + userMsg("Hello"), + ] + + const result = addCacheControlToMessages({ + messages, + model: "anthropic/claude-sonnet-4.5", + }) + + // Should merge, not replace + expect(result[0].providerOptions).toEqual({ + someOther: { key: "value" }, + anthropic: { cacheControl: { type: "ephemeral" } }, + }) + }) + + test("handles tool messages in conversation", () => { + const messages: ModelMessage[] = [ + systemMsg("System prompt"), + userMsg("Read a file"), + assistantMsg("I'll read that file."), + toolMsg(), + userMsg("Thanks, now explain it"), + ] + + const result = addCacheControlToMessages({ + messages, + model: "anthropic/claude-sonnet-4.5", + }) + + // System: cached + expect(result[0].providerOptions).toEqual(EPHEMERAL) + // Last 2 non-system: tool result and final user message + expect(result[3].providerOptions).toEqual(EPHEMERAL) + expect(result[4].providerOptions).toEqual(EPHEMERAL) + }) + + test("detects Anthropic from LanguageModel-like objects", () => { + const model = { + provider: "anthropic", + modelId: "claude-sonnet-4.5", + specificationVersion: "v3" as const, + defaultObjectGenerationMode: "json" as const, + doGenerate: async () => ({} as any), + doStream: async () => ({} as any), + } + + const messages: ModelMessage[] = [userMsg("Hello")] + const result = addCacheControlToMessages({ messages, model: model as any }) + expect(result[0].providerOptions).toEqual(EPHEMERAL) + }) + }) + + describe("OpenAI models", () => { + test("passes messages through unchanged", () => { + const messages: ModelMessage[] = 
[ + systemMsg("System prompt"), + userMsg("Hello"), + assistantMsg("Hi"), + userMsg("Follow up"), + ] + + const result = addCacheControlToMessages({ + messages, + model: "openai/gpt-4o", + }) + + // Messages should be identical (no providerOptions added) + for (let i = 0; i < result.length; i++) { + expect(result[i].providerOptions).toBeUndefined() + } + }) + }) + + describe("Unknown providers", () => { + test("passes messages through unchanged", () => { + const messages: ModelMessage[] = [ + systemMsg("System prompt"), + userMsg("Hello"), + ] + + const result = addCacheControlToMessages({ + messages, + model: "google/gemini-2.0-flash", + }) + + for (const msg of result) { + expect(msg.providerOptions).toBeUndefined() + } + }) + }) +}) + +// ─── getCacheProviderOptions ───────────────────────────────────────── + +describe("getCacheProviderOptions", () => { + test("returns promptCacheKey for OpenAI models", () => { + const result = getCacheProviderOptions({ + model: "openai/gpt-4o", + postId: "abc123", + }) + + expect(result).toEqual({ + openai: { + promptCacheKey: "forums-abc123", + }, + }) + }) + + test("returns undefined for OpenAI without postId", () => { + const result = getCacheProviderOptions({ + model: "openai/gpt-4o", + }) + expect(result).toBeUndefined() + }) + + test("returns undefined for Anthropic models", () => { + const result = getCacheProviderOptions({ + model: "anthropic/claude-sonnet-4.5", + postId: "abc123", + }) + expect(result).toBeUndefined() + }) + + test("returns undefined for unknown providers", () => { + const result = getCacheProviderOptions({ + model: "google/gemini-2.0-flash", + postId: "abc123", + }) + expect(result).toBeUndefined() + }) + + test("detects GPT model strings", () => { + for (const model of ["openai/gpt-4o", "openai/gpt-4o-mini"]) { + const result = getCacheProviderOptions({ model, postId: "test" }) + expect(result).toBeDefined() + expect(result?.openai?.promptCacheKey).toBe("forums-test") + } + }) + + test("detects 
o-series model strings", () => { + for (const model of ["openai/o1-preview", "openai/o3-mini", "openai/o4-mini"]) { + const result = getCacheProviderOptions({ model, postId: "test" }) + expect(result).toBeDefined() + } + }) +}) diff --git a/apps/web/agent/prompt-cache.ts b/apps/web/agent/prompt-cache.ts new file mode 100644 index 0000000..685e0a8 --- /dev/null +++ b/apps/web/agent/prompt-cache.ts @@ -0,0 +1,156 @@ +import type { JSONValue, LanguageModel, ModelMessage } from "ai" + +/** + * Prompt caching utilities for reducing token costs and latency. + * + * ## Anthropic + * Marks system messages and the last 2 conversation messages with + * `cacheControl: { type: "ephemeral" }` so Anthropic caches the + * prefix incrementally. Cached tokens cost 10% of input tokens + * (cache writes cost 25% more). Minimum cacheable length varies + * by model (1024–4096 tokens). + * + * ## OpenAI + * OpenAI automatically caches prompts ≥ 1024 tokens. We set a + * `promptCacheKey` per post to improve cache routing (requests + * sharing the same key + prefix hash are routed to the same server).
+ * + * Inspired by: + * - vercel/ai SDK cookbook: https://ai-sdk.dev/cookbook/node/dynamic-prompt-caching + * - anomalyco/opencode ProviderTransform.applyCaching + */ + +// ─── Provider detection ────────────────────────────────────────────── + +function isAnthropicModel(model: string | LanguageModel): boolean { + if (typeof model === "string") { + return model.includes("anthropic") || model.includes("claude") + } + return ( + model.provider === "anthropic" || + model.provider.includes("anthropic") || + model.modelId.includes("anthropic") || + model.modelId.includes("claude") + ) +} + +function isOpenAIModel(model: string | LanguageModel): boolean { + if (typeof model === "string") { + return ( + model.includes("openai") || + model.includes("gpt-") || + model.includes("o1-") || + model.includes("o3-") || + model.includes("o4-") + ) + } + return ( + model.provider === "openai" || + model.provider.includes("openai") || + model.modelId.includes("gpt-") || + model.modelId.includes("o1-") || + model.modelId.includes("o3-") || + model.modelId.includes("o4-") + ) +} + +// ─── Anthropic caching ─────────────────────────────────────────────── + +const ANTHROPIC_CACHE_CONTROL = { + anthropic: { cacheControl: { type: "ephemeral" } }, +} satisfies Record<string, Record<string, JSONValue>> + +/** + * Apply Anthropic prompt caching breakpoints to messages. + * + * Strategy (mirrors opencode's applyCaching): + * 1. Mark up to the first 2 system messages (static instructions) + * 2. Mark the last 2 non-system messages (conversation frontier) + * + * Anthropic allows a max of 4 cache breakpoints per request. + * The AI SDK translates message-level providerOptions to block-level + * cache_control automatically.
+ */ +function applyAnthropicCaching(messages: ModelMessage[]): ModelMessage[] { + // Identify system messages and non-system messages + const systemIndices: number[] = [] + const nonSystemIndices: number[] = [] + + for (let i = 0; i < messages.length; i++) { + if (messages[i].role === "system") { + systemIndices.push(i) + } else { + nonSystemIndices.push(i) + } + } + + // Pick indices to cache: first 2 system + last 2 non-system = max 4 breakpoints + const cacheIndices = new Set([ + ...systemIndices.slice(0, 2), + ...nonSystemIndices.slice(-2), + ]) + + return messages.map((message, index) => { + if (!cacheIndices.has(index)) return message + + return { + ...message, + providerOptions: { + ...message.providerOptions, + ...ANTHROPIC_CACHE_CONTROL, + }, + } + }) +} + +// ─── Public API ────────────────────────────────────────────────────── + +/** + * Apply prompt caching to messages based on the model provider. + * + * - **Anthropic**: adds `cacheControl` breakpoints on system messages and + * the last 2 conversation messages (up to 4 breakpoints). + * - **OpenAI**: returns messages unchanged (caching is automatic; use + * `getCacheProviderOptions` to set `promptCacheKey`). + * - **Other providers**: messages pass through unchanged. + */ +export function addCacheControlToMessages({ + messages, + model, +}: { + messages: ModelMessage[] + model: string | LanguageModel +}): ModelMessage[] { + if (messages.length === 0) return messages + + if (isAnthropicModel(model)) { + return applyAnthropicCaching(messages) + } + + // Other providers: return unchanged + return messages +} + +/** + * Build provider-level options for prompt caching. + * + * - **OpenAI**: sets `promptCacheKey` to improve cache hit routing. + * - **Anthropic / others**: returns undefined (caching is message-level).
+ */ +export function getCacheProviderOptions({ + model, + postId, +}: { + model: string | LanguageModel + postId?: string +}): Record<string, Record<string, JSONValue>> | undefined { + if (isOpenAIModel(model) && postId) { + return { + openai: { + promptCacheKey: `forums-${postId}`, + }, + } + } + + return undefined +} diff --git a/apps/web/agent/response-agent.ts b/apps/web/agent/response-agent.ts index ac55b51..2df9727 100644 --- a/apps/web/agent/response-agent.ts +++ b/apps/web/agent/response-agent.ts @@ -16,6 +16,7 @@ import { autumn, type BillingCategory, CREDIT_COSTS } from "@/lib/autumn" import { db } from "@/lib/db/client" import { comments, posts } from "@/lib/db/schema" import { ERROR_CODES } from "@/lib/errors" +import { addCacheControlToMessages, getCacheProviderOptions } from "./prompt-cache" import { getAllTools, getTools } from "./tools" import type { AgentMode, AgentUIMessage } from "./types" import { startWorkspace } from "./workspace" @@ -366,11 +367,19 @@ async function streamTextStep({ ? BUILD_SYSTEM_PROMPT(owner, repo) : ASK_SYSTEM_PROMPT(owner, repo) - const result = streamText({ + // Apply prompt caching to reduce token costs and latency + const modelMessages = addCacheControlToMessages({ messages: await convertToModelMessages(allMessages), + model, + }) + const cacheProviderOptions = getCacheProviderOptions({ model, postId }) + + const result = streamText({ + messages: modelMessages, tools, system: systemPrompt, model, + ...(cacheProviderOptions && { providerOptions: cacheProviderOptions }), }) const stepNewMessages: AgentUIMessage[] = [] From 6fc6b54fc9c47e90348b5c7972219adfc2d55e52 Mon Sep 17 00:00:00 2001 From: Julian Benegas Date: Sat, 7 Feb 2026 22:17:15 +0000 Subject: [PATCH 2/2] fix: always pass Anthropic cache control on system prompt to prevent cache purge Wrap the system prompt as a SystemModelMessage with cacheControl providerOptions for Anthropic models.
Without this, the system prompt (passed as a plain string to streamText) has no cache_control marker, meaning Anthropic won't establish the cache prefix and the cache gets purged between requests. - New wrapSystemPrompt() function: returns SystemModelMessage with cacheControl for Anthropic, plain string for other providers - Updated response-agent.ts to use wrapSystemPrompt() - Added 4 new tests for wrapSystemPrompt (22 total, all passing) --- apps/web/agent/__tests__/prompt-cache.test.ts | 51 +++++++++++++++++++ apps/web/agent/prompt-cache.ts | 38 +++++++++++++- apps/web/agent/response-agent.ts | 15 ++++-- 3 files changed, 100 insertions(+), 4 deletions(-) diff --git a/apps/web/agent/__tests__/prompt-cache.test.ts b/apps/web/agent/__tests__/prompt-cache.test.ts index 8e54f3e..5240204 100644 --- a/apps/web/agent/__tests__/prompt-cache.test.ts +++ b/apps/web/agent/__tests__/prompt-cache.test.ts @@ -3,6 +3,7 @@ import type { ModelMessage } from "ai" import { addCacheControlToMessages, getCacheProviderOptions, + wrapSystemPrompt, } from "../prompt-cache" // ─── Helpers ───────────────────────────────────────────────────────── @@ -311,3 +312,53 @@ describe("getCacheProviderOptions", () => { } }) }) + +// ─── wrapSystemPrompt ──────────────────────────────────────────────── + +describe("wrapSystemPrompt", () => { + test("wraps system prompt as SystemModelMessage for Anthropic", () => { + const result = wrapSystemPrompt({ + system: "You are a helpful assistant.", + model: "anthropic/claude-sonnet-4.5", + }) + + expect(typeof result).toBe("object") + expect(result).toEqual({ + role: "system", + content: "You are a helpful assistant.", + providerOptions: EPHEMERAL, + }) + }) + + test("wraps for all Anthropic model variants", () => { + for (const model of [ + "anthropic/claude-sonnet-4.5", + "anthropic/claude-haiku-4.5", + "anthropic/claude-opus-4", + ]) { + const result = wrapSystemPrompt({ system: "test", model }) + expect(typeof result).toBe("object") + expect((result as 
any).providerOptions).toEqual(EPHEMERAL) + } + }) + + test("returns plain string for OpenAI models", () => { + const result = wrapSystemPrompt({ + system: "You are a helpful assistant.", + model: "openai/gpt-4o", + }) + + expect(typeof result).toBe("string") + expect(result).toBe("You are a helpful assistant.") + }) + + test("returns plain string for unknown providers", () => { + const result = wrapSystemPrompt({ + system: "You are a helpful assistant.", + model: "google/gemini-2.0-flash", + }) + + expect(typeof result).toBe("string") + expect(result).toBe("You are a helpful assistant.") + }) +}) diff --git a/apps/web/agent/prompt-cache.ts b/apps/web/agent/prompt-cache.ts index 685e0a8..524fee7 100644 --- a/apps/web/agent/prompt-cache.ts +++ b/apps/web/agent/prompt-cache.ts @@ -1,4 +1,9 @@ -import type { JSONValue, LanguageModel, ModelMessage } from "ai" +import type { + JSONValue, + LanguageModel, + ModelMessage, + SystemModelMessage, +} from "ai" /** * Prompt caching utilities for reducing token costs and latency. @@ -10,6 +15,10 @@ import type { JSONValue, LanguageModel, ModelMessage } from "ai" * (cache writes cost 25% more). Minimum cacheable length varies * by model (1024–4096 tokens). * + * IMPORTANT: Every request to Anthropic must include the cache + * control markers, otherwise the cache is not maintained. This + * applies to both the system prompt and conversation messages. + * * ## OpenAI * OpenAI automatically caches prompts ≥ 1024 tokens. We set a * `promptCacheKey` per post to improve cache routing (requests @@ -131,6 +140,33 @@ export function addCacheControlToMessages({ return messages } +/** + * Wrap the system prompt with cache control for Anthropic models. + * + * For Anthropic, the system prompt must be sent as a SystemModelMessage + * with `providerOptions` containing `cacheControl`, otherwise the cache + * prefix is not established and the cache will be purged. + * + * For other providers, returns the system string as-is. 
+ */ +export function wrapSystemPrompt({ + system, + model, +}: { + system: string + model: string | LanguageModel +}): string | SystemModelMessage { + if (isAnthropicModel(model)) { + return { + role: "system" as const, + content: system, + providerOptions: ANTHROPIC_CACHE_CONTROL, + } + } + + return system +} + /** * Build provider-level options for prompt caching. * diff --git a/apps/web/agent/response-agent.ts b/apps/web/agent/response-agent.ts index 2df9727..e620ac4 100644 --- a/apps/web/agent/response-agent.ts +++ b/apps/web/agent/response-agent.ts @@ -16,7 +16,11 @@ import { autumn, type BillingCategory, CREDIT_COSTS } from "@/lib/autumn" import { db } from "@/lib/db/client" import { comments, posts } from "@/lib/db/schema" import { ERROR_CODES } from "@/lib/errors" -import { addCacheControlToMessages, getCacheProviderOptions } from "./prompt-cache" +import { + addCacheControlToMessages, + getCacheProviderOptions, + wrapSystemPrompt, +} from "./prompt-cache" import { getAllTools, getTools } from "./tools" import type { AgentMode, AgentUIMessage } from "./types" import { startWorkspace } from "./workspace" @@ -362,16 +366,21 @@ async function streamTextStep({ ? getAllTools({ workspace: lazyWorkspace, userAccessToken }) : getTools({ workspace: lazyWorkspace }) - const systemPrompt = + const systemPromptText = mode === "build" ? BUILD_SYSTEM_PROMPT(owner, repo) : ASK_SYSTEM_PROMPT(owner, repo) - // Apply prompt caching to reduce token costs and latency + // Apply prompt caching to reduce token costs and latency. + // For Anthropic: marks system prompt and conversation messages with + // cacheControl breakpoints. These must be present on every request + // or the cache is purged. + // For OpenAI: sets promptCacheKey for better cache routing. 
const modelMessages = addCacheControlToMessages({ messages: await convertToModelMessages(allMessages), model, }) + const systemPrompt = wrapSystemPrompt({ system: systemPromptText, model }) const cacheProviderOptions = getCacheProviderOptions({ model, postId }) const result = streamText({