diff --git a/backend/tests/integration/test_prompt_caching_integration.py b/backend/tests/integration/test_prompt_caching_integration.py
new file mode 100644
index 0000000000..ec247efff5
--- /dev/null
+++ b/backend/tests/integration/test_prompt_caching_integration.py
@@ -0,0 +1,418 @@
+"""
+Integration tests for OpenAI prompt caching with live API calls.
+
+Tests three caching scenarios on gpt-5.1 (the fix from PR #4670):
+1. Same user, same conversation — identical calls should fully cache
+2. Same user, cross conversation — same language, different transcripts should cache instruction prefix
+3. Cross user — different languages should still cache instruction prefix (language_code in context, not instructions)
+
+Requires:
+  - OPENAI_API_KEY environment variable
+  - Network access to OpenAI API
+
+Run:
+  cd backend
+  PYTHONPATH=. python3 -m pytest tests/integration/test_prompt_caching_integration.py -v -s
+
+Note: These tests make real API calls and cost real money (~$0.10-0.20 per run).
+Cache behavior depends on OpenAI infrastructure; cache hits are not guaranteed
+on every run but should appear consistently when the prefix is stable and >1024 tokens.
+"""
+
+import os
+import time
+
+import pytest
+from openai import OpenAI
+
+# Skip entire module if no API key
+pytestmark = pytest.mark.skipif(
+    not os.environ.get("OPENAI_API_KEY"),
+    reason="OPENAI_API_KEY not set",
+)
+
+MODEL = "gpt-5.1"
+
+# ---------------------------------------------------------------------------
+# Production-length instruction text (must be >1024 tokens for caching)
+# This mirrors the extract_action_items instructions from conversation_processing.py
+# ---------------------------------------------------------------------------
+
+ACTION_ITEMS_INSTRUCTIONS = """You are an expert action item extractor. Your sole purpose is to identify and extract actionable tasks from the provided content.
+
+EXPLICIT TASK/REMINDER REQUESTS (HIGHEST PRIORITY)
+
+When the primary user OR someone speaking to them uses these patterns, ALWAYS extract the task:
+- "Remind me to X" / "Remember to X" → EXTRACT "X"
+- "Don't forget to X" / "Don't let me forget X" → EXTRACT "X"
+- "Add task X" / "Create task X" / "Make a task for X" → EXTRACT "X"
+- "Note to self: X" / "Mental note: X" → EXTRACT "X"
+- "Task: X" / "Todo: X" / "To do: X" → EXTRACT "X"
+- "I need to remember to X" → EXTRACT "X"
+- "Put X on my list" / "Add X to my tasks" → EXTRACT "X"
+- "Set a reminder for X" / "Can you remind me X" → EXTRACT "X"
+- "You need to X" / "You should X" / "Make sure you X" (said TO the user) → EXTRACT "X"
+
+These explicit requests bypass importance/timing filters. If someone explicitly asks for a reminder or task, extract it.
+
+Examples:
+- User says "Remind me to buy milk" → Extract "Buy milk"
+- Someone tells user "Don't forget to call your mom" → Extract "Call mom"
+- User says "Add task pick up dry cleaning" → Extract "Pick up dry cleaning"
+- User says "Note to self, check tire pressure" → Extract "Check tire pressure"
+
+CRITICAL: If CALENDAR MEETING CONTEXT is provided with participant names, you MUST use those names:
+- The conversation DEFINITELY happened between the named participants
+- NEVER use "Speaker 0", "Speaker 1", "Speaker 2", etc. when participant names are available
+- Match transcript speakers to participant names by analyzing the conversation context
+- Use participant names in ALL action items (e.g., "Follow up with Sarah" NOT "Follow up with Speaker 0")
+- Reference the meeting title/context when relevant to the action item
+- Consider the scheduled meeting time and duration when extracting due dates
+- If you cannot confidently match a speaker to a name, use the action description without speaker references
+
+CRITICAL DEDUPLICATION RULES (Check BEFORE extracting):
+- DO NOT extract action items that are >95% similar to existing ones in the content
+- Check both the description AND the due date/timeframe
+- Consider semantic similarity, not just exact word matches
+- Examples of what counts as DUPLICATES (DO NOT extract):
+  - "Call John" vs "Phone John" → DUPLICATE
+  - "Finish report by Friday" (existing) vs "Complete report by end of week" → DUPLICATE
+  - "Buy milk" (existing) vs "Get milk from store" → DUPLICATE
+  - "Email Sarah about meeting" (existing) vs "Send email to Sarah regarding the meeting" → DUPLICATE
+- Examples of what is NOT duplicate (OK to extract):
+  - "Buy groceries" (existing) vs "Buy milk" → NOT duplicate (different scope)
+  - "Call dentist" (existing) vs "Call plumber" → NOT duplicate (different person/service)
+  - "Submit report by March 1st" (existing) vs "Submit report by March 15th" → NOT duplicate (different deadlines)
+- If you're unsure whether something is a duplicate, err on the side of treating it as a duplicate (DON'T extract)
+
+WORKFLOW:
+1. FIRST: Read the ENTIRE conversation carefully to understand the full context
+2. SECOND: Check for EXPLICIT task requests (remind me, add task, don't forget, etc.) - ALWAYS extract these
+3. THIRD: For IMPLICIT tasks - be extremely aggressive with filtering:
+   - Is the user ALREADY doing this? SKIP IT
+   - Is this truly important enough to remind a busy person? If ANY doubt, SKIP IT
+   - Would missing this have real consequences? If not obvious, SKIP IT
+   - Better to extract 0 implicit tasks than flood the user with noise
+4. FOURTH: Extract timing information separately and put it in the due_at field
+5. FIFTH: Clean the description - remove ALL time references and vague words
+6. SIXTH: Final check - description should be timeless and specific
+
+CRITICAL CONTEXT:
+- These action items are primarily for the PRIMARY USER who is having/recording this conversation
+- The user is the person wearing the device or initiating the conversation
+- Focus on tasks the primary user needs to track and act upon
+- Include tasks for OTHER people ONLY if:
+  - The primary user is dependent on that task being completed
+  - It's super crucial for the primary user to track it
+  - The primary user needs to follow up on it
+
+BALANCE QUALITY AND USER INTENT:
+- For EXPLICIT requests (remind me, add task, don't forget, etc.) - ALWAYS extract
+- For IMPLICIT tasks inferred from conversation - be very selective, better to extract 0 than flood the user
+- Think: "Did the user ask for this reminder, or am I guessing they need it?"
+- If the user explicitly asked for a task/reminder, respect their request even if it seems trivial
+
+STRICT FILTERING RULES - Include ONLY tasks that meet ALL these criteria:
+
+1. **Clear Ownership & Relevance to Primary User**:
+   - Identify which speaker is the primary user based on conversational context
+   - Look for cues: who is asking questions, who is receiving advice/tasks, who initiates topics
+   - For tasks assigned to the primary user: phrase them directly (start with verb)
+   - For tasks assigned to others: include them ONLY if primary user is dependent on them or needs to track them
+
+2. **Concrete Action**: The task describes a specific, actionable next step (not vague intentions)
+
+3. **Timing Signal** (NOT required for explicit task requests):
+   - Explicit dates or times
+   - Relative timing ("tomorrow", "next week", "by Friday", "this month")
+   - Urgency markers ("urgent", "ASAP", "high priority")
+
+4. **Real Importance** (NOT required for explicit task requests):
+   - Financial impact (bills, payments, purchases, invoices)
+   - Health/safety concerns (appointments, medications, safety checks)
+   - Hard deadlines (submissions, filings, registrations)
+   - Critical dependencies (primary user blocked without it)
+   - Commitments to other people (meetings, deliverables, promises)
+
+EXCLUDE these types of items (be aggressive about exclusion):
+- Things user is ALREADY doing or actively working on
+- Casual mentions or updates
+- Vague suggestions without commitment
+- General goals without specific next steps
+- Past actions being discussed
+- Hypothetical scenarios
+- Trivial tasks with no real consequences
+- Tasks assigned to others that don't impact the primary user
+
+FORMAT REQUIREMENTS:
+- Keep each action item SHORT and concise (maximum 15 words, strict limit)
+- Use clear, direct language
+- Start with a verb when possible
+- Include only essential details
+- Remove filler words and unnecessary context
+- Merge duplicates
+- Order by: due date, urgency, alphabetical
+
+Respond with JSON: {"action_items": [{"description": "..."}]}"""
+
+
+@pytest.fixture(scope="module")
+def client():
+    return OpenAI()
+
+
+def _call_and_get_cache_info(client: OpenAI, messages: list) -> dict:
+    """Make an API call and return cache-related usage info."""
+    response = client.chat.completions.create(
+        model=MODEL,
+        messages=messages,
+        max_completion_tokens=150,
+    )
+    usage = response.usage
+    result = {
+        "prompt_tokens": usage.prompt_tokens,
+        "completion_tokens": usage.completion_tokens,
+        "cached_tokens": 0,
+    }
+    if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
+        details = usage.prompt_tokens_details
+        if hasattr(details, "cached_tokens"):
+            result["cached_tokens"] = details.cached_tokens or 0
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Sample transcripts (different content, same function)
+# ---------------------------------------------------------------------------
+
+TRANSCRIPT_A = """Speaker 0: Good morning everyone. Let's start our weekly product sync.
+
+Speaker 1: Sure. I've been working on the new onboarding flow. We reduced the number of screens from 7 to 4, and early testing shows a 15% improvement in completion rate.
+
+Speaker 0: That's great progress. When do you think we can ship it?
+
+Speaker 1: I think we can have it ready by next Friday. I just need to finish the animations and get design sign-off from Sarah.
+
+Speaker 0: Perfect. Let me know if you need anything. Also, don't forget to update the analytics events before you ship.
+
+Speaker 2: I wanted to bring up the API latency issue. We're seeing p99 spikes above 2 seconds on the search endpoint. I've identified the root cause - it's the new full-text search query that's not using the index properly.
+
+Speaker 0: That sounds urgent. Can you fix it today?
+
+Speaker 2: I already have a fix ready. Just need to run the migration on staging first. Should be deployed by end of day.
+
+Speaker 0: Good. Remind me to check the latency dashboard tomorrow morning."""
+
+TRANSCRIPT_B = """Speaker 0: Hey, thanks for meeting with me about the budget.
+
+Speaker 1: Of course. Let me walk you through the Q2 projections. We're looking at a 12% increase in infrastructure costs due to the new region expansion.
+
+Speaker 0: That's higher than expected. Can we optimize anywhere?
+
+Speaker 1: Yes, I've identified three areas. First, we can switch to reserved instances for the database cluster - that saves about 30%. Second, we can implement better caching on the API layer. Third, we should audit our unused resources.
+
+Speaker 0: Let's do the reserved instances first since that's the biggest savings. Can you prepare a proposal by Wednesday?
+
+Speaker 1: Sure, I'll have it ready. I'll also include a comparison of different commitment terms.
+
+Speaker 0: Great. Also, remind me to schedule a meeting with the finance team next week to discuss the annual budget review."""
+
+TRANSCRIPT_C = """Speaker 0: I just got back from the dentist. They said I need to come back in two weeks for a follow-up.
+
+Speaker 1: Oh no, is everything okay?
+
+Speaker 0: Yeah, just a routine filling. But I need to remember to call the insurance company to check if the procedure is covered before I go back.
+
+Speaker 1: You should also ask about the pre-authorization process. Sometimes they need 48 hours notice.
+
+Speaker 0: Good point. I'll add that to my list. Also, remind me to pick up the prescription from the pharmacy on the way home tomorrow."""
+
+
+class TestSameUserSameConversation:
+    """Test intra-conversation caching: same user calls structure + action_items on the same transcript.
+
+    In production, each conversation triggers two sequential LLM calls (get_transcript_structure
+    then extract_action_items). Each call starts with its own static instruction prefix, so the
+    two calls do not share a cacheable prefix with each other; the savings come from repeated
+    calls that reuse the same prefix.
+
+    This test covers the simplest case: calling the SAME function twice with identical messages
+    should produce a near-complete cache hit (nearly all prompt tokens cached).
+    """
+
+    def test_same_function_same_transcript_full_cache(self, client):
+        """Two identical calls — the second should cache nearly all prompt tokens."""
+        msgs = [
+            {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS},
+            {
+                "role": "system",
+                "content": f"The content language is en. Use the same language en for your response.\n\nContent:\n{TRANSCRIPT_A}",
+            },
+        ]
+
+        print("\n === Same user, same conversation (identical calls) ===")
+        r1 = _call_and_get_cache_info(client, msgs)
+        print(f" Call 1 (prime): prompt={r1['prompt_tokens']}, cached={r1['cached_tokens']}")
+        time.sleep(1)
+
+        r2 = _call_and_get_cache_info(client, msgs)
+        print(f" Call 2 (repeat): prompt={r2['prompt_tokens']}, cached={r2['cached_tokens']}")
+
+        if r2["cached_tokens"] > 0:
+            pct = r2["cached_tokens"] / r2["prompt_tokens"] * 100
+            print(
+                f"\n ✅ SAME-CONVERSATION CACHE HIT: {r2['cached_tokens']}/{r2['prompt_tokens']} tokens ({pct:.1f}%)"
+            )
+        else:
+            print("\n ⚠️ No cache hit on identical repeat (may need warm-up)")
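+
+
+def _cache_hit_ratio(usage_info: dict) -> float:
+    """Illustrative sketch, not exercised by the tests in this module: reduces the usage
+    dict returned by _call_and_get_cache_info to a single cache-hit fraction. The helper
+    name is an assumption added for clarity, not production code."""
+    prompt = usage_info.get("prompt_tokens", 0)
+    cached = usage_info.get("cached_tokens", 0)
+    return cached / prompt if prompt else 0.0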
+
+
+class TestSameUserCrossConversation:
+    """Test cross-conversation caching for the same user.
+
+    Same language/timezone, different transcripts. The static instruction prefix
+    should be cached after the first call. This is the primary cost-saving scenario:
+    a single user processes many conversations per day.
+    """
+
+    def test_same_language_different_transcripts(self, client):
+        """Same user (en) processes 3 different conversations — instruction prefix should cache."""
+        lang = "en"
+        transcripts = [TRANSCRIPT_A, TRANSCRIPT_B, TRANSCRIPT_C]
+        results = []
+
+        print("\n === Same user (en), different conversations ===")
+        for i, transcript in enumerate(transcripts):
+            msgs = [
+                {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS},
+                {
+                    "role": "system",
+                    "content": f"The content language is {lang}. Use the same language {lang} for your response.\n\nContent:\n{transcript}",
+                },
+            ]
+            result = _call_and_get_cache_info(client, msgs)
+            results.append(result)
+            print(f" Call {i+1}: prompt={result['prompt_tokens']}, cached={result['cached_tokens']}")
+            if i < len(transcripts) - 1:
+                time.sleep(1)
+
+        later_cached = results[1]["cached_tokens"] + results[2]["cached_tokens"]
+        later_prompt = results[1]["prompt_tokens"] + results[2]["prompt_tokens"]
+        if later_cached > 0:
+            pct = later_cached / later_prompt * 100
+            print(f"\n ✅ CROSS-CONVERSATION CACHE HIT: {later_cached} cached tokens ({pct:.1f}% of later calls)")
+        else:
+            print("\n ⚠️ No cache hits (may need warm-up)")
+
+
+class TestCrossUserCaching:
+    """Test cross-user caching with different languages.
+
+    Since {language_code} is now in the context message (not in the instruction prefix),
+    the static instruction prefix should be identical across ALL users regardless of language.
+    This is the key improvement from moving {language_code} out of instructions_text.
+    """
+
+    def test_different_languages_share_instruction_cache(self, client):
+        """Users with different languages should still get instruction prefix cache hits."""
+        # Simulate 3 users with different languages processing different conversations
+        user_calls = [
+            ("en", TRANSCRIPT_A, "English user"),
+            ("es", TRANSCRIPT_B, "Spanish user"),
+            ("ja", TRANSCRIPT_C, "Japanese user"),
+        ]
+        results = []
+
+        print("\n === Cross-user: different languages, different conversations ===")
+        for i, (lang, transcript, label) in enumerate(user_calls):
+            msgs = [
+                {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS},
+                {
+                    "role": "system",
+                    "content": f"The content language is {lang}. Use the same language {lang} for your response.\n\nContent:\n{transcript}",
+                },
+            ]
+            result = _call_and_get_cache_info(client, msgs)
+            results.append(result)
+            print(f" Call {i+1} ({label}): prompt={result['prompt_tokens']}, cached={result['cached_tokens']}")
+            if i < len(user_calls) - 1:
+                time.sleep(1)
+
+        later_cached = results[1]["cached_tokens"] + results[2]["cached_tokens"]
+        later_prompt = results[1]["prompt_tokens"] + results[2]["prompt_tokens"]
+        if later_cached > 0:
+            pct = later_cached / later_prompt * 100
+            print(f"\n ✅ CROSS-USER CACHE HIT: {later_cached} cached tokens ({pct:.1f}% of later calls)")
+            print(" Instruction prefix is shared across languages!")
+        else:
+            print("\n ⚠️ No cross-user cache hits (may need warm-up)")
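+
+    @staticmethod
+    def _prompts_share_instruction_prefix(msgs_a: list, msgs_b: list) -> bool:
+        """Illustrative sketch, not called by the tests here (the helper name is an assumption):
+        cross-user cache hits only require the first system message — the static instruction
+        prefix — to be byte-identical; the per-user context message may differ freely."""
+        return msgs_a[0] == msgs_b[0]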
+ """ + languages = ["en", "es", "ja"] + transcripts = [TRANSCRIPT_A, TRANSCRIPT_B, TRANSCRIPT_C] + + # --- New approach: language in context message (static prefix) --- + print("\n === A: Language in context (static instruction prefix) ===") + static_results = [] + for i, (lang, transcript) in enumerate(zip(languages, transcripts)): + msgs = [ + {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS}, + { + "role": "system", + "content": f"The content language is {lang}. Use the same language {lang} for your response.\n\nContent:\n{transcript}", + }, + ] + result = _call_and_get_cache_info(client, msgs) + static_results.append(result) + print(f" Call {i+1} (lang={lang}): prompt={result['prompt_tokens']}, cached={result['cached_tokens']}") + if i < len(languages) - 1: + time.sleep(1) + + time.sleep(3) + + # --- Old approach: language baked into instructions (dynamic prefix) --- + print("\n === B: Language in instructions (dynamic prefix — old bug) ===") + dynamic_results = [] + for i, (lang, transcript) in enumerate(zip(languages, transcripts)): + # Simulate old code: language_code in instructions + dynamic_instructions = ( + f"You are an expert action item extractor.\n\nThe content language is {lang}. Use the same language {lang} for your response.\n\n" + + ACTION_ITEMS_INSTRUCTIONS[ + len( + "You are an expert action item extractor. Your sole purpose is to identify and extract actionable tasks from the provided content.\n\n" + ) : + ] + ) + msgs = [ + {"role": "system", "content": dynamic_instructions}, + {"role": "system", "content": f"Content:\n{transcript}"}, + ] + result = _call_and_get_cache_info(client, msgs) + dynamic_results.append(result) + print(f" Call {i+1} (lang={lang}): prompt={result['prompt_tokens']}, cached={result['cached_tokens']}") + if i < len(languages) - 1: + time.sleep(1) + + # --- Summary --- + static_cached = sum(r["cached_tokens"] for r in static_results) + static_prompt = sum(r["prompt_tokens"] for r in static_results) + dynamic_cached = sum(r["cached_tokens"] for r in dynamic_results) + dynamic_prompt = sum(r["prompt_tokens"] for r in dynamic_results) + + print(f"\n === RESULTS ===") + print( + f" Static prefix (new): {static_cached}/{static_prompt} cached ({static_cached/max(static_prompt,1)*100:.1f}%)" + ) + print( + f" Dynamic prefix (old): {dynamic_cached}/{dynamic_prompt} cached ({dynamic_cached/max(dynamic_prompt,1)*100:.1f}%)" + ) + + if static_cached > dynamic_cached: + print(f" ✅ Static prefix produces {static_cached - dynamic_cached} MORE cached tokens across languages") + elif static_cached == dynamic_cached == 0: + print(" ⚠️ No cache hits on either (may need warm-up)") + else: + print(" ⚠️ Unexpected: dynamic prefix matched or beat static prefix") diff --git a/backend/tests/unit/test_prompt_caching.py b/backend/tests/unit/test_prompt_caching.py index 4eff88655d..b728c472c9 100644 --- a/backend/tests/unit/test_prompt_caching.py +++ b/backend/tests/unit/test_prompt_caching.py @@ -2,11 +2,13 @@ Verifies that: 1. _build_conversation_context() produces deterministic, identical output for the same inputs -2. get_transcript_structure() and extract_action_items() use a shared context prefix - as the first system message to enable OpenAI prompt caching +2. get_transcript_structure() and extract_action_items() use conversation context + as the second system message (after static instructions) to enable OpenAI prompt caching 3. 
Calendar context is unified (includes meeting_link in both functions) """ +import inspect +import re import sys from datetime import datetime, timezone from unittest.mock import MagicMock @@ -17,7 +19,11 @@ sys.modules.setdefault("utils.llm.clients", _mock_clients) from models.conversation import CalendarMeetingContext, MeetingParticipant, ConversationPhoto -from utils.llm.conversation_processing import _build_conversation_context +from utils.llm.conversation_processing import ( + _build_conversation_context, + extract_action_items, + get_transcript_structure, +) class TestBuildConversationContext: @@ -170,3 +176,73 @@ def test_participant_without_name(self): ) result = _build_conversation_context("test", None, calendar) assert "unknown@co.com" in result + + +class TestPromptMessageOrdering: + """Tests that static instructions come before dynamic content in prompt templates. + + OpenAI prompt caching requires static content as a prefix for cross-conversation + cache hits. These tests verify the message order is [instructions, context] not + [context, instructions]. + """ + + def _get_from_messages_calls(self, func): + """Extract ChatPromptTemplate.from_messages call patterns from function source.""" + source = inspect.getsource(func) + return re.findall(r'from_messages\(\[(.*?)\]\)', source, re.DOTALL) + + def test_get_transcript_structure_instructions_first(self): + """Static instructions must be the first system message for cross-conversation caching.""" + calls = self._get_from_messages_calls(get_transcript_structure) + assert len(calls) == 1, "Expected exactly one from_messages call" + args = calls[0].strip() + # instructions_text should come before context_message + instructions_pos = args.index('instructions_text') + context_pos = args.index('context_message') + assert instructions_pos < context_pos, "instructions_text must come before context_message" + + def test_extract_action_items_instructions_first(self): + """Static instructions must be the first system message for cross-conversation caching.""" + calls = self._get_from_messages_calls(extract_action_items) + assert len(calls) == 1, "Expected exactly one from_messages call" + args = calls[0].strip() + instructions_pos = args.index('instructions_text') + context_pos = args.index('context_message') + assert instructions_pos < context_pos, "instructions_text must come before context_message" + + def test_both_functions_use_two_system_messages(self): + """Both functions must use exactly two system messages.""" + for func in [get_transcript_structure, extract_action_items]: + calls = self._get_from_messages_calls(func) + assert len(calls) == 1, f"{func.__name__}: expected one from_messages call" + # Count 'system' occurrences in the call + system_count = calls[0].count("'system'") + assert system_count == 2, f"{func.__name__}: expected 2 system messages, got {system_count}" + + def test_existing_items_context_not_in_instructions(self): + """existing_items_context must be in the context message, not the instructions.""" + source = inspect.getsource(extract_action_items) + # Find the instructions_text definition + instructions_match = re.search(r"instructions_text\s*=\s*'''(.*?)'''", source, re.DOTALL) + assert instructions_match, "Could not find instructions_text definition" + instructions_content = instructions_match.group(1) + assert ( + '{existing_items_context}' not in instructions_content + ), "existing_items_context should not be in instructions_text (breaks static prefix caching)" + # Verify it IS in the context_message + 
context_match = re.search(r"context_message\s*=\s*['\"](.+?)['\"]", source) + assert context_match, "Could not find context_message definition" + assert 'existing_items_context' in context_match.group(1), "existing_items_context should be in context_message" + + def test_language_code_not_in_instructions(self): + """language_code must be in the context message, not the instructions prefix.""" + source = inspect.getsource(extract_action_items) + instructions_match = re.search(r"instructions_text\s*=\s*'''(.*?)'''", source, re.DOTALL) + assert instructions_match, "Could not find instructions_text definition" + instructions_content = instructions_match.group(1) + assert ( + '{language_code}' not in instructions_content + ), "language_code should not be in instructions_text (breaks static prefix caching for non-English)" + context_match = re.search(r"context_message\s*=\s*['\"](.+?)['\"]", source) + assert context_match, "Could not find context_message definition" + assert 'language_code' in context_match.group(1), "language_code should be in context_message" diff --git a/backend/utils/llm/conversation_processing.py b/backend/utils/llm/conversation_processing.py index f9e674c6a3..29b4fb6123 100644 --- a/backend/utils/llm/conversation_processing.py +++ b/backend/utils/llm/conversation_processing.py @@ -243,8 +243,8 @@ def _build_conversation_context( """Build the conversation context string shared across LLM prompts. Produces a deterministic string from transcript, photos, and calendar context. - When used as the first system message in a prompt, enables OpenAI prompt caching - across sequential calls (e.g. structure + action items) that share the same content. + Used as the second system message (after static instructions) so that the static + instruction prefix enables cross-conversation OpenAI prompt caching. Returns: Formatted context string, or empty string if no content provided. @@ -322,14 +322,10 @@ def extract_action_items( items_list ) - # First system message: shared conversation context (enables OpenAI prompt caching) - context_message = 'Content:\n{conversation_context}' - - # Second system message: task-specific instructions + # First system message: task-specific instructions (static prefix enables cross-conversation caching) + # NOTE: {language_code} is in the context message, not here, to keep this prefix fully static across all languages. instructions_text = '''You are an expert action item extractor. Your sole purpose is to identify and extract actionable tasks from the provided content. - The content language is {language_code}. Use the same language {language_code} for your response. 
- EXPLICIT TASK/REMINDER REQUESTS (HIGHEST PRIORITY) When the primary user OR someone speaking to them uses these patterns, ALWAYS extract the task: @@ -358,10 +354,10 @@ def extract_action_items( - Use participant names in ALL action items (e.g., "Follow up with Sarah" NOT "Follow up with Speaker 0") - Reference the meeting title/context when relevant to the action item - Consider the scheduled meeting time and duration when extracting due dates - - If you cannot confidently match a speaker to a name, use the action description without speaker references{existing_items_context} + - If you cannot confidently match a speaker to a name, use the action description without speaker references CRITICAL DEDUPLICATION RULES (Check BEFORE extracting): - • DO NOT extract action items that are >95% similar to existing ones listed above + • DO NOT extract action items that are >95% similar to existing ones in the content • Check both the description AND the due date/timeframe • Consider semantic similarity, not just exact word matches • Examples of what counts as DUPLICATES (DO NOT extract): @@ -549,7 +545,9 @@ def extract_action_items( ).strip() action_items_parser = PydanticOutputParser(pydantic_object=ActionItemsExtraction) - prompt = ChatPromptTemplate.from_messages([('system', context_message), ('system', instructions_text)]) + # Second system message: conversation context + existing items (dynamic, per-conversation) + context_message = 'The content language is {language_code}. Use the same language {language_code} for your response.\n\nContent:\n{conversation_context}{existing_items_context}' + prompt = ChatPromptTemplate.from_messages([('system', instructions_text), ('system', context_message)]) chain = prompt | llm_medium_experiment | action_items_parser try: @@ -589,10 +587,7 @@ def get_transcript_structure( if not conversation_context: return Structured() # Should be caught by discard logic, but as a safeguard. - # First system message: shared conversation context (enables OpenAI prompt caching) - context_message = 'Content:\n{conversation_context}' - - # Second system message: task-specific instructions + # First system message: task-specific instructions (static prefix enables cross-conversation caching) instructions_text = '''You are an expert content analyzer. Your task is to analyze the provided content (which could be a transcript, a series of photo descriptions from a wearable camera, or both) and provide structure and clarity. The content language is {language_code}. Use the same language {language_code} for your response. @@ -638,7 +633,9 @@ def get_transcript_structure( ' ', '' ).strip() - prompt = ChatPromptTemplate.from_messages([('system', context_message), ('system', instructions_text)]) + # Second system message: conversation context (dynamic, per-conversation) + context_message = 'Content:\n{conversation_context}' + prompt = ChatPromptTemplate.from_messages([('system', instructions_text), ('system', context_message)]) chain = prompt | llm_medium_experiment | parser response = chain.invoke(