From fbc1be151c9db0ff253cea5e9a63901420187a23 Mon Sep 17 00:00:00 2001
From: Julian Benegas <julianbenegas99@gmail.com>
Date: Sat, 7 Feb 2026 22:12:08 +0000
Subject: [PATCH 1/2] feat: implement prompt caching for Anthropic and OpenAI
 models

Add prompt caching to reduce token costs and improve latency for
LLM conversations, following patterns from vercel/ai SDK cookbook
and anomalyco/opencode.

## Anthropic (explicit cache control)
- Adds cacheControl breakpoints on the first 2 system messages and the
  last 2 non-system messages (max 4 breakpoints per Anthropic's limit)
- Cache reads cost 10% of base input price (cache writes cost 25% more)
- Applied via providerOptions on messages, automatically translated
  by the AI SDK to block-level cache_control

## OpenAI (routing optimization)
- Sets promptCacheKey per post to improve cache hit routing
- OpenAI automatically caches prompts >= 1024 tokens; the key
  helps route requests to the same server for better cache hits
- No cost premium for cache writes

## Implementation
- New utility: agent/prompt-cache.ts with provider detection and
  caching strategies
- Updated response-agent.ts streamTextStep to apply caching
- Comprehensive test suite (18 tests) covering all edge cases
- Non-Anthropic/OpenAI providers pass through unchanged (safe for
  provider-agnostic code)
---
 apps/web/agent/__tests__/prompt-cache.test.ts | 313 ++++++++++++++++++
 apps/web/agent/prompt-cache.ts                | 156 +++++++++
 apps/web/agent/response-agent.ts              |  11 +-
 3 files changed, 479 insertions(+), 1 deletion(-)
 create mode 100644 apps/web/agent/__tests__/prompt-cache.test.ts
 create mode 100644 apps/web/agent/prompt-cache.ts

diff --git a/apps/web/agent/__tests__/prompt-cache.test.ts b/apps/web/agent/__tests__/prompt-cache.test.ts
new file mode 100644
index 0000000..8e54f3e
--- /dev/null
+++ b/apps/web/agent/__tests__/prompt-cache.test.ts
@@ -0,0 +1,313 @@
+import { describe, expect, test } from "bun:test"
+import type { ModelMessage } from "ai"
+import {
+  addCacheControlToMessages,
+  getCacheProviderOptions,
+} from "../prompt-cache"
+
+// ─── Helpers ─────────────────────────────────────────────────────────
+
+function systemMsg(text: string): ModelMessage {
+  return { role: "system", content: text }
+}
+
+function userMsg(text: string): ModelMessage {
+  return {
+    role: "user",
+    content: [{ type: "text", text }],
+  }
+}
+
+function assistantMsg(text: string): ModelMessage {
+  return {
+    role: "assistant",
+    content: [{ type: "text", text }],
+  }
+}
+
+function toolMsg(): ModelMessage {
+  return {
+    role: "tool",
+    content: [
+      {
+        type: "tool-result",
+        toolCallId: "call_1",
+        toolName: "Read",
+        output: { type: "text", value: "file contents" },
+      },
+    ],
+  }
+}
+
+const EPHEMERAL = { anthropic: { cacheControl: { type: "ephemeral" } } }
+
+// ─── addCacheControlToMessages ───────────────────────────────────────
+
+describe("addCacheControlToMessages", () => {
+  describe("Anthropic models", () => {
+    const anthropicModels = [
+      "anthropic/claude-sonnet-4.5",
+      "anthropic/claude-haiku-4.5",
+      "anthropic/claude-opus-4",
+    ]
+
+    for (const model of anthropicModels) {
+      test(`adds cache control for ${model}`, () => {
+        const messages: ModelMessage[] = [
+          systemMsg("You are a helpful assistant."),
+          userMsg("Hello"),
+          assistantMsg("Hi there!"),
+          userMsg("Tell me about caching"),
+        ]
+
+        const result = addCacheControlToMessages({ messages, model })
+
+        // System message (index 0) should have cache control
+        expect(result[0].providerOptions).toEqual(EPHEMERAL)
+
+        // First user message (index 1) should NOT have cache control
+        // (only last 2 non-system messages get it)
+        expect(result[1].providerOptions).toBeUndefined()
+
+        // Last 2 non-system messages should have cache control
+        expect(result[2].providerOptions).toEqual(EPHEMERAL)
+        expect(result[3].providerOptions).toEqual(EPHEMERAL)
+      })
+    }
+
+    test("handles empty messages array", () => {
+      const result = addCacheControlToMessages({
+        messages: [],
+        model: "anthropic/claude-sonnet-4.5",
+      })
+      expect(result).toEqual([])
+    })
+
+    test("handles single system message", () => {
+      const messages: ModelMessage[] = [
+        systemMsg("System prompt"),
+      ]
+
+      const result = addCacheControlToMessages({
+        messages,
+        model: "anthropic/claude-sonnet-4.5",
+      })
+
+      expect(result[0].providerOptions).toEqual(EPHEMERAL)
+    })
+
+    test("handles 2 system messages", () => {
+      const messages: ModelMessage[] = [
+        systemMsg("System part 1"),
+        systemMsg("System part 2"),
+        userMsg("Hello"),
+      ]
+
+      const result = addCacheControlToMessages({
+        messages,
+        model: "anthropic/claude-sonnet-4.5",
+      })
+
+      // Both system messages should be cached
+      expect(result[0].providerOptions).toEqual(EPHEMERAL)
+      expect(result[1].providerOptions).toEqual(EPHEMERAL)
+      // User message is the only non-system message, so it's one of last 2
+      expect(result[2].providerOptions).toEqual(EPHEMERAL)
+    })
+
+    test("respects max 4 breakpoints with long conversations", () => {
+      const messages: ModelMessage[] = [
+        systemMsg("System prompt"),
+        systemMsg("Additional system context"),
+        userMsg("First question"),
+        assistantMsg("First answer"),
+        userMsg("Second question"),
+        assistantMsg("Second answer"),
+        userMsg("Third question"),
+        assistantMsg("Third answer"),
+        userMsg("Fourth question"),
+      ]
+
+      const result = addCacheControlToMessages({
+        messages,
+        model: "anthropic/claude-sonnet-4.5",
+      })
+
+      // Count cache breakpoints
+      let breakpoints = 0
+      for (const msg of result) {
+        if (msg.providerOptions?.anthropic) breakpoints++
+      }
+
+      // Should be exactly 4: 2 system + 2 last non-system
+      expect(breakpoints).toBe(4)
+
+      // Verify correct placement: system messages
+      expect(result[0].providerOptions).toEqual(EPHEMERAL)
+      expect(result[1].providerOptions).toEqual(EPHEMERAL)
+
+      // Middle messages should NOT have cache control
+      expect(result[2].providerOptions).toBeUndefined()
+      expect(result[3].providerOptions).toBeUndefined()
+      expect(result[4].providerOptions).toBeUndefined()
+      expect(result[5].providerOptions).toBeUndefined()
+      expect(result[6].providerOptions).toBeUndefined()
+
+      // Last 2 non-system messages
+      expect(result[7].providerOptions).toEqual(EPHEMERAL)
+      expect(result[8].providerOptions).toEqual(EPHEMERAL)
+    })
+
+    test("preserves existing providerOptions on messages", () => {
+      const messages: ModelMessage[] = [
+        {
+          role: "system",
+          content: "System prompt",
+          providerOptions: { someOther: { key: "value" } },
+        },
+        userMsg("Hello"),
+      ]
+
+      const result = addCacheControlToMessages({
+        messages,
+        model: "anthropic/claude-sonnet-4.5",
+      })
+
+      // Should merge, not replace
+      expect(result[0].providerOptions).toEqual({
+        someOther: { key: "value" },
+        anthropic: { cacheControl: { type: "ephemeral" } },
+      })
+    })
+
+    test("handles tool messages in conversation", () => {
+      const messages: ModelMessage[] = [
+        systemMsg("System prompt"),
+        userMsg("Read a file"),
+        assistantMsg("I'll read that file."),
+        toolMsg(),
+        userMsg("Thanks, now explain it"),
+      ]
+
+      const result = addCacheControlToMessages({
+        messages,
+        model: "anthropic/claude-sonnet-4.5",
+      })
+
+      // System: cached
+      expect(result[0].providerOptions).toEqual(EPHEMERAL)
+      // Last 2 non-system: tool result and final user message
+      expect(result[3].providerOptions).toEqual(EPHEMERAL)
+      expect(result[4].providerOptions).toEqual(EPHEMERAL)
+    })
+
+    test("detects Anthropic from LanguageModel-like objects", () => {
+      const model = {
+        provider: "anthropic",
+        modelId: "claude-sonnet-4.5",
+        specificationVersion: "v3" as const,
+        defaultObjectGenerationMode: "json" as const,
+        doGenerate: async () => ({} as any),
+        doStream: async () => ({} as any),
+      }
+
+      const messages: ModelMessage[] = [userMsg("Hello")]
+      const result = addCacheControlToMessages({ messages, model: model as any })
+      expect(result[0].providerOptions).toEqual(EPHEMERAL)
+    })
+  })
+
+  describe("OpenAI models", () => {
+    test("passes messages through unchanged", () => {
+      const messages: ModelMessage[] = [
+        systemMsg("System prompt"),
+        userMsg("Hello"),
+        assistantMsg("Hi"),
+        userMsg("Follow up"),
+      ]
+
+      const result = addCacheControlToMessages({
+        messages,
+        model: "openai/gpt-4o",
+      })
+
+      // Messages should be identical (no providerOptions added)
+      for (let i = 0; i < result.length; i++) {
+        expect(result[i].providerOptions).toBeUndefined()
+      }
+    })
+  })
+
+  describe("Unknown providers", () => {
+    test("passes messages through unchanged", () => {
+      const messages: ModelMessage[] = [
+        systemMsg("System prompt"),
+        userMsg("Hello"),
+      ]
+
+      const result = addCacheControlToMessages({
+        messages,
+        model: "google/gemini-2.0-flash",
+      })
+
+      for (const msg of result) {
+        expect(msg.providerOptions).toBeUndefined()
+      }
+    })
+  })
+})
+
+// ─── getCacheProviderOptions ─────────────────────────────────────────
+
+describe("getCacheProviderOptions", () => {
+  test("returns promptCacheKey for OpenAI models", () => {
+    const result = getCacheProviderOptions({
+      model: "openai/gpt-4o",
+      postId: "abc123",
+    })
+
+    expect(result).toEqual({
+      openai: {
+        promptCacheKey: "forums-abc123",
+      },
+    })
+  })
+
+  test("returns undefined for OpenAI without postId", () => {
+    const result = getCacheProviderOptions({
+      model: "openai/gpt-4o",
+    })
+    expect(result).toBeUndefined()
+  })
+
+  test("returns undefined for Anthropic models", () => {
+    const result = getCacheProviderOptions({
+      model: "anthropic/claude-sonnet-4.5",
+      postId: "abc123",
+    })
+    expect(result).toBeUndefined()
+  })
+
+  test("returns undefined for unknown providers", () => {
+    const result = getCacheProviderOptions({
+      model: "google/gemini-2.0-flash",
+      postId: "abc123",
+    })
+    expect(result).toBeUndefined()
+  })
+
+  test("detects GPT model strings", () => {
+    for (const model of ["openai/gpt-4o", "openai/gpt-4o-mini"]) {
+      const result = getCacheProviderOptions({ model, postId: "test" })
+      expect(result).toBeDefined()
+      expect(result?.openai?.promptCacheKey).toBe("forums-test")
+    }
+  })
+
+  test("detects o-series model strings", () => {
+    for (const model of ["openai/o1-preview", "openai/o3-mini", "openai/o4-mini"]) {
+      const result = getCacheProviderOptions({ model, postId: "test" })
+      expect(result).toBeDefined()
+    }
+  })
+})
diff --git a/apps/web/agent/prompt-cache.ts b/apps/web/agent/prompt-cache.ts
new file mode 100644
index 0000000..685e0a8
--- /dev/null
+++ b/apps/web/agent/prompt-cache.ts
@@ -0,0 +1,156 @@
+import type { JSONValue, LanguageModel, ModelMessage } from "ai"
+
+/**
+ * Prompt caching utilities for reducing token costs and latency.
+ *
+ * ## Anthropic
+ * Marks system messages and the last 2 conversation messages with
+ * `cacheControl: { type: "ephemeral" }` so Anthropic caches the
+ * prefix incrementally. Cached tokens cost 10% of input tokens
+ * (cache writes cost 25% more). Minimum cacheable length varies
+ * by model (1024–4096 tokens).
+ *
+ * ## OpenAI
+ * OpenAI automatically caches prompts ≥ 1024 tokens. We set a
+ * `promptCacheKey` per post to improve cache routing (requests
+ * sharing the same key + prefix hash are routed to the same server).
+ *
+ * Inspired by:
+ * - vercel/ai SDK cookbook: https://ai-sdk.dev/cookbook/node/dynamic-prompt-caching
+ * - anomalyco/opencode ProviderTransform.applyCaching
+ */
+
+// ─── Provider detection ──────────────────────────────────────────────
+
+function isAnthropicModel(model: string | LanguageModel): boolean {
+  if (typeof model === "string") {
+    return model.includes("anthropic") || model.includes("claude")
+  }
+  return (
+    model.provider === "anthropic" ||
+    model.provider.includes("anthropic") ||
+    model.modelId.includes("anthropic") ||
+    model.modelId.includes("claude")
+  )
+}
+
+function isOpenAIModel(model: string | LanguageModel): boolean {
+  if (typeof model === "string") {
+    return (
+      model.includes("openai") ||
+      model.includes("gpt-") ||
+      model.includes("o1-") ||
+      model.includes("o3-") ||
+      model.includes("o4-")
+    )
+  }
+  return (
+    model.provider === "openai" ||
+    model.provider.includes("openai") ||
+    model.modelId.includes("gpt-") ||
+    model.modelId.includes("o1-") ||
+    model.modelId.includes("o3-") ||
+    model.modelId.includes("o4-")
+  )
+}
+
+// ─── Anthropic caching ───────────────────────────────────────────────
+
+const ANTHROPIC_CACHE_CONTROL = {
+  anthropic: { cacheControl: { type: "ephemeral" } },
+} satisfies Record<string, Record<string, JSONValue>>
+
+/**
+ * Apply Anthropic prompt caching breakpoints to messages.
+ *
+ * Strategy (mirrors opencode's applyCaching):
+ * 1. Mark up to the first 2 system messages (static instructions)
+ * 2. Mark the last 2 non-system messages (conversation frontier)
+ *
+ * Anthropic allows a max of 4 cache breakpoints per request.
+ * The AI SDK translates message-level providerOptions to block-level
+ * cache_control automatically.
+ */
+function applyAnthropicCaching(messages: ModelMessage[]): ModelMessage[] {
+  // Identify system messages and non-system messages
+  const systemIndices: number[] = []
+  const nonSystemIndices: number[] = []
+
+  for (let i = 0; i < messages.length; i++) {
+    if (messages[i].role === "system") {
+      systemIndices.push(i)
+    } else {
+      nonSystemIndices.push(i)
+    }
+  }
+
+  // Pick indices to cache: first 2 system + last 2 non-system = max 4 breakpoints
+  const cacheIndices = new Set<number>([
+    ...systemIndices.slice(0, 2),
+    ...nonSystemIndices.slice(-2),
+  ])
+
+  return messages.map((message, index) => {
+    if (!cacheIndices.has(index)) return message
+
+    return {
+      ...message,
+      providerOptions: {
+        ...message.providerOptions,
+        ...ANTHROPIC_CACHE_CONTROL,
+      },
+    }
+  })
+}
+
+// ─── Public API ──────────────────────────────────────────────────────
+
+/**
+ * Apply prompt caching to messages based on the model provider.
+ *
+ * - **Anthropic**: adds `cacheControl` breakpoints on the first 2 system
+ *   messages and the last 2 non-system messages (up to 4 breakpoints).
+ * - **OpenAI**: returns messages unchanged (caching is automatic; use
+ *   `getCacheProviderOptions` to set `promptCacheKey`).
+ * - **Other providers**: messages pass through unchanged.
+ */
+export function addCacheControlToMessages({
+  messages,
+  model,
+}: {
+  messages: ModelMessage[]
+  model: string | LanguageModel
+}): ModelMessage[] {
+  if (messages.length === 0) return messages
+
+  if (isAnthropicModel(model)) {
+    return applyAnthropicCaching(messages)
+  }
+
+  // Other providers: return unchanged
+  return messages
+}
+
+/**
+ * Build provider-level options for prompt caching.
+ *
+ * - **OpenAI**: sets `promptCacheKey` to improve cache hit routing.
+ * - **Anthropic / others**: returns undefined (caching is message-level).
+ */
+export function getCacheProviderOptions({
+  model,
+  postId,
+}: {
+  model: string | LanguageModel
+  postId?: string
+}): Record<string, Record<string, JSONValue>> | undefined {
+  if (isOpenAIModel(model) && postId) {
+    return {
+      openai: {
+        promptCacheKey: `forums-${postId}`,
+      },
+    }
+  }
+
+  return undefined
+}
diff --git a/apps/web/agent/response-agent.ts b/apps/web/agent/response-agent.ts
index ac55b51..2df9727 100644
--- a/apps/web/agent/response-agent.ts
+++ b/apps/web/agent/response-agent.ts
@@ -16,6 +16,7 @@ import { autumn, type BillingCategory, CREDIT_COSTS } from "@/lib/autumn"
 import { db } from "@/lib/db/client"
 import { comments, posts } from "@/lib/db/schema"
 import { ERROR_CODES } from "@/lib/errors"
+import { addCacheControlToMessages, getCacheProviderOptions } from "./prompt-cache"
 import { getAllTools, getTools } from "./tools"
 import type { AgentMode, AgentUIMessage } from "./types"
 import { startWorkspace } from "./workspace"
@@ -366,11 +367,19 @@ async function streamTextStep({
       ? BUILD_SYSTEM_PROMPT(owner, repo)
       : ASK_SYSTEM_PROMPT(owner, repo)
 
-  const result = streamText({
+  // Apply prompt caching to reduce token costs and latency
+  const modelMessages = addCacheControlToMessages({
     messages: await convertToModelMessages(allMessages),
+    model,
+  })
+  const cacheProviderOptions = getCacheProviderOptions({ model, postId })
+
+  const result = streamText({
+    messages: modelMessages,
     tools,
     system: systemPrompt,
     model,
+    ...(cacheProviderOptions && { providerOptions: cacheProviderOptions }),
   })
 
   const stepNewMessages: AgentUIMessage[] = []

From 6fc6b54fc9c47e90348b5c7972219adfc2d55e52 Mon Sep 17 00:00:00 2001
From: Julian Benegas <julianbenegas99@gmail.com>
Date: Sat, 7 Feb 2026 22:17:15 +0000
Subject: [PATCH 2/2] fix: always pass Anthropic cache control on system prompt
 to prevent cache purge

Wrap the system prompt as a SystemModelMessage with cacheControl
providerOptions for Anthropic models. Without this, the system
prompt (passed as a plain string to streamText) has no cache_control
marker, meaning Anthropic won't establish the cache prefix and the
cache gets purged between requests.

- New wrapSystemPrompt() function: returns SystemModelMessage with
  cacheControl for Anthropic, plain string for other providers
- Updated response-agent.ts to use wrapSystemPrompt()
- Added 4 new tests for wrapSystemPrompt (22 total, all passing)
---
 apps/web/agent/__tests__/prompt-cache.test.ts | 51 +++++++++++++++++++
 apps/web/agent/prompt-cache.ts                | 38 +++++++++++++-
 apps/web/agent/response-agent.ts              | 15 ++++--
 3 files changed, 100 insertions(+), 4 deletions(-)

diff --git a/apps/web/agent/__tests__/prompt-cache.test.ts b/apps/web/agent/__tests__/prompt-cache.test.ts
index 8e54f3e..5240204 100644
--- a/apps/web/agent/__tests__/prompt-cache.test.ts
+++ b/apps/web/agent/__tests__/prompt-cache.test.ts
@@ -3,6 +3,7 @@ import type { ModelMessage } from "ai"
 import {
   addCacheControlToMessages,
   getCacheProviderOptions,
+  wrapSystemPrompt,
 } from "../prompt-cache"
 
 // ─── Helpers ─────────────────────────────────────────────────────────
@@ -311,3 +312,53 @@ describe("getCacheProviderOptions", () => {
     }
   })
 })
+
+// ─── wrapSystemPrompt ────────────────────────────────────────────────
+
+describe("wrapSystemPrompt", () => {
+  test("wraps system prompt as SystemModelMessage for Anthropic", () => {
+    const result = wrapSystemPrompt({
+      system: "You are a helpful assistant.",
+      model: "anthropic/claude-sonnet-4.5",
+    })
+
+    expect(typeof result).toBe("object")
+    expect(result).toEqual({
+      role: "system",
+      content: "You are a helpful assistant.",
+      providerOptions: EPHEMERAL,
+    })
+  })
+
+  test("wraps for all Anthropic model variants", () => {
+    for (const model of [
+      "anthropic/claude-sonnet-4.5",
+      "anthropic/claude-haiku-4.5",
+      "anthropic/claude-opus-4",
+    ]) {
+      const result = wrapSystemPrompt({ system: "test", model })
+      expect(typeof result).toBe("object")
+      expect((result as any).providerOptions).toEqual(EPHEMERAL)
+    }
+  })
+
+  test("returns plain string for OpenAI models", () => {
+    const result = wrapSystemPrompt({
+      system: "You are a helpful assistant.",
+      model: "openai/gpt-4o",
+    })
+
+    expect(typeof result).toBe("string")
+    expect(result).toBe("You are a helpful assistant.")
+  })
+
+  test("returns plain string for unknown providers", () => {
+    const result = wrapSystemPrompt({
+      system: "You are a helpful assistant.",
+      model: "google/gemini-2.0-flash",
+    })
+
+    expect(typeof result).toBe("string")
+    expect(result).toBe("You are a helpful assistant.")
+  })
+})
diff --git a/apps/web/agent/prompt-cache.ts b/apps/web/agent/prompt-cache.ts
index 685e0a8..524fee7 100644
--- a/apps/web/agent/prompt-cache.ts
+++ b/apps/web/agent/prompt-cache.ts
@@ -1,4 +1,9 @@
-import type { JSONValue, LanguageModel, ModelMessage } from "ai"
+import type {
+  JSONValue,
+  LanguageModel,
+  ModelMessage,
+  SystemModelMessage,
+} from "ai"
 
 /**
  * Prompt caching utilities for reducing token costs and latency.
@@ -10,6 +15,10 @@ import type { JSONValue, LanguageModel, ModelMessage } from "ai"
  * (cache writes cost 25% more). Minimum cacheable length varies
  * by model (1024–4096 tokens).
  *
+ * IMPORTANT: Every request to Anthropic must include the cache
+ * control markers, otherwise the cache is not maintained. This
+ * applies to both the system prompt and conversation messages.
+ *
  * ## OpenAI
  * OpenAI automatically caches prompts ≥ 1024 tokens. We set a
  * `promptCacheKey` per post to improve cache routing (requests
@@ -131,6 +140,33 @@ export function addCacheControlToMessages({
   return messages
 }
 
+/**
+ * Wrap the system prompt with cache control for Anthropic models.
+ *
+ * For Anthropic, the system prompt must be sent as a SystemModelMessage
+ * with `providerOptions` containing `cacheControl`, otherwise the cache
+ * prefix is not established and the cache will be purged.
+ *
+ * For other providers, returns the system string as-is.
+ */
+export function wrapSystemPrompt({
+  system,
+  model,
+}: {
+  system: string
+  model: string | LanguageModel
+}): string | SystemModelMessage {
+  if (isAnthropicModel(model)) {
+    return {
+      role: "system" as const,
+      content: system,
+      providerOptions: ANTHROPIC_CACHE_CONTROL,
+    }
+  }
+
+  return system
+}
+
 /**
  * Build provider-level options for prompt caching.
  *
diff --git a/apps/web/agent/response-agent.ts b/apps/web/agent/response-agent.ts
index 2df9727..e620ac4 100644
--- a/apps/web/agent/response-agent.ts
+++ b/apps/web/agent/response-agent.ts
@@ -16,7 +16,11 @@ import { autumn, type BillingCategory, CREDIT_COSTS } from "@/lib/autumn"
 import { db } from "@/lib/db/client"
 import { comments, posts } from "@/lib/db/schema"
 import { ERROR_CODES } from "@/lib/errors"
-import { addCacheControlToMessages, getCacheProviderOptions } from "./prompt-cache"
+import {
+  addCacheControlToMessages,
+  getCacheProviderOptions,
+  wrapSystemPrompt,
+} from "./prompt-cache"
 import { getAllTools, getTools } from "./tools"
 import type { AgentMode, AgentUIMessage } from "./types"
 import { startWorkspace } from "./workspace"
@@ -362,16 +366,21 @@ async function streamTextStep({
       ? getAllTools({ workspace: lazyWorkspace, userAccessToken })
       : getTools({ workspace: lazyWorkspace })
 
-  const systemPrompt =
+  const systemPromptText =
     mode === "build"
       ? BUILD_SYSTEM_PROMPT(owner, repo)
       : ASK_SYSTEM_PROMPT(owner, repo)
 
-  // Apply prompt caching to reduce token costs and latency
+  // Apply prompt caching to reduce token costs and latency.
+  // For Anthropic: marks system prompt and conversation messages with
+  // cacheControl breakpoints. These must be present on every request
+  // or the cache is purged.
+  // For OpenAI: sets promptCacheKey for better cache routing.
   const modelMessages = addCacheControlToMessages({
     messages: await convertToModelMessages(allMessages),
     model,
   })
+  const systemPrompt = wrapSystemPrompt({ system: systemPromptText, model })
   const cacheProviderOptions = getCacheProviderOptions({ model, postId })
 
   const result = streamText({