diff --git a/backend/tests/integration/test_prompt_caching_integration.py b/backend/tests/integration/test_prompt_caching_integration.py
new file mode 100644
index 0000000000..ec247efff5
--- /dev/null
+++ b/backend/tests/integration/test_prompt_caching_integration.py
@@ -0,0 +1,418 @@
+"""
+Integration tests for OpenAI prompt caching with live API calls.
+
+Tests three caching scenarios on gpt-5.1 (the fix from PR #4670):
+1. Same user, same conversation — identical calls should fully cache
+2. Same user, cross conversation — same language, different transcripts should cache instruction prefix
+3. Cross user — different languages should still cache instruction prefix (language_code in context, not instructions)
+
+Requires:
+  - OPENAI_API_KEY environment variable
+  - Network access to OpenAI API
+
+Run:
+  cd backend
+  PYTHONPATH=. python3 -m pytest tests/integration/test_prompt_caching_integration.py -v -s
+
+Note: These tests make real API calls and cost real money (~$0.10-0.20 per run).
+Cache behavior depends on OpenAI infrastructure; cache hits are not guaranteed
+on every run but should appear consistently when the prefix is stable and >1024 tokens.
+"""
+
+import os
+import time
+
+import pytest
+from openai import OpenAI
+
+# Skip entire module if no API key
+pytestmark = pytest.mark.skipif(
+    not os.environ.get("OPENAI_API_KEY"),
+    reason="OPENAI_API_KEY not set",
+)
+
+MODEL = "gpt-5.1"
+
+# ---------------------------------------------------------------------------
+# Production-length instruction text (must be >1024 tokens for caching)
+# This mirrors the extract_action_items instructions from conversation_processing.py
+# ---------------------------------------------------------------------------
+
+ACTION_ITEMS_INSTRUCTIONS = """You are an expert action item extractor. Your sole purpose is to identify and extract actionable tasks from the provided content.
+
+EXPLICIT TASK/REMINDER REQUESTS (HIGHEST PRIORITY)
+
+When the primary user OR someone speaking to them uses these patterns, ALWAYS extract the task:
+- "Remind me to X" / "Remember to X" → EXTRACT "X"
+- "Don't forget to X" / "Don't let me forget X" → EXTRACT "X"
+- "Add task X" / "Create task X" / "Make a task for X" → EXTRACT "X"
+- "Note to self: X" / "Mental note: X" → EXTRACT "X"
+- "Task: X" / "Todo: X" / "To do: X" → EXTRACT "X"
+- "I need to remember to X" → EXTRACT "X"
+- "Put X on my list" / "Add X to my tasks" → EXTRACT "X"
+- "Set a reminder for X" / "Can you remind me X" → EXTRACT "X"
+- "You need to X" / "You should X" / "Make sure you X" (said TO the user) → EXTRACT "X"
+
+These explicit requests bypass importance/timing filters. If someone explicitly asks for a reminder or task, extract it.
+
+Examples:
+- User says "Remind me to buy milk" → Extract "Buy milk"
+- Someone tells user "Don't forget to call your mom" → Extract "Call mom"
+- User says "Add task pick up dry cleaning" → Extract "Pick up dry cleaning"
+- User says "Note to self, check tire pressure" → Extract "Check tire pressure"
+
+CRITICAL: If CALENDAR MEETING CONTEXT is provided with participant names, you MUST use those names:
+- The conversation DEFINITELY happened between the named participants
+- NEVER use "Speaker 0", "Speaker 1", "Speaker 2", etc. when participant names are available
+- Match transcript speakers to participant names by analyzing the conversation context
+- Use participant names in ALL action items (e.g., "Follow up with Sarah" NOT "Follow up with Speaker 0")
+- Reference the meeting title/context when relevant to the action item
+- Consider the scheduled meeting time and duration when extracting due dates
+- If you cannot confidently match a speaker to a name, use the action description without speaker references
+
+CRITICAL DEDUPLICATION RULES (Check BEFORE extracting):
+- DO NOT extract action items that are >95% similar to existing ones in the content
+- Check both the description AND the due date/timeframe
+- Consider semantic similarity, not just exact word matches
+- Examples of what counts as DUPLICATES (DO NOT extract):
+  - "Call John" vs "Phone John" → DUPLICATE
+  - "Finish report by Friday" (existing) vs "Complete report by end of week" → DUPLICATE
+  - "Buy milk" (existing) vs "Get milk from store" → DUPLICATE
+  - "Email Sarah about meeting" (existing) vs "Send email to Sarah regarding the meeting" → DUPLICATE
+- Examples of what is NOT duplicate (OK to extract):
+  - "Buy groceries" (existing) vs "Buy milk" → NOT duplicate (different scope)
+  - "Call dentist" (existing) vs "Call plumber" → NOT duplicate (different person/service)
+  - "Submit report by March 1st" (existing) vs "Submit report by March 15th" → NOT duplicate (different deadlines)
+- If you're unsure whether something is a duplicate, err on the side of treating it as a duplicate (DON'T extract)
+
+WORKFLOW:
+1. FIRST: Read the ENTIRE conversation carefully to understand the full context
+2. SECOND: Check for EXPLICIT task requests (remind me, add task, don't forget, etc.) - ALWAYS extract these
+3. THIRD: For IMPLICIT tasks - be extremely aggressive with filtering:
+   - Is the user ALREADY doing this? SKIP IT
+   - Is this truly important enough to remind a busy person? If ANY doubt, SKIP IT
+   - Would missing this have real consequences? If not obvious, SKIP IT
+   - Better to extract 0 implicit tasks than flood the user with noise
+4. FOURTH: Extract timing information separately and put it in the due_at field
+5. FIFTH: Clean the description - remove ALL time references and vague words
+6. SIXTH: Final check - description should be timeless and specific
+
+CRITICAL CONTEXT:
+- These action items are primarily for the PRIMARY USER who is having/recording this conversation
+- The user is the person wearing the device or initiating the conversation
+- Focus on tasks the primary user needs to track and act upon
+- Include tasks for OTHER people ONLY if:
+  - The primary user is dependent on that task being completed
+  - It's super crucial for the primary user to track it
+  - The primary user needs to follow up on it
+
+BALANCE QUALITY AND USER INTENT:
+- For EXPLICIT requests (remind me, add task, don't forget, etc.) - ALWAYS extract
+- For IMPLICIT tasks inferred from conversation - be very selective, better to extract 0 than flood the user
+- Think: "Did the user ask for this reminder, or am I guessing they need it?"
+- If the user explicitly asked for a task/reminder, respect their request even if it seems trivial
+
+STRICT FILTERING RULES - Include ONLY tasks that meet ALL these criteria:
+
+1. **Clear Ownership & Relevance to Primary User**:
+   - Identify which speaker is the primary user based on conversational context
+   - Look for cues: who is asking questions, who is receiving advice/tasks, who initiates topics
+   - For tasks assigned to the primary user: phrase them directly (start with verb)
+   - For tasks assigned to others: include them ONLY if primary user is dependent on them or needs to track them
+
+2. **Concrete Action**: The task describes a specific, actionable next step (not vague intentions)
+
+3. **Timing Signal** (NOT required for explicit task requests):
+   - Explicit dates or times
+   - Relative timing ("tomorrow", "next week", "by Friday", "this month")
+   - Urgency markers ("urgent", "ASAP", "high priority")
+
+4. **Real Importance** (NOT required for explicit task requests):
+   - Financial impact (bills, payments, purchases, invoices)
+   - Health/safety concerns (appointments, medications, safety checks)
+   - Hard deadlines (submissions, filings, registrations)
+   - Critical dependencies (primary user blocked without it)
+   - Commitments to other people (meetings, deliverables, promises)
+
+EXCLUDE these types of items (be aggressive about exclusion):
+- Things user is ALREADY doing or actively working on
+- Casual mentions or updates
+- Vague suggestions without commitment
+- General goals without specific next steps
+- Past actions being discussed
+- Hypothetical scenarios
+- Trivial tasks with no real consequences
+- Tasks assigned to others that don't impact the primary user
+
+FORMAT REQUIREMENTS:
+- Keep each action item SHORT and concise (maximum 15 words, strict limit)
+- Use clear, direct language
+- Start with a verb when possible
+- Include only essential details
+- Remove filler words and unnecessary context
+- Merge duplicates
+- Order by: due date, urgency, alphabetical
+
+Respond with JSON: {"action_items": [{"description": "..."}]}"""
+
+
+@pytest.fixture(scope="module")
+def client():
+    return OpenAI()
+
+
+def _call_and_get_cache_info(client: OpenAI, messages: list) -> dict:
+    """Make an API call and return cache-related usage info."""
+    response = client.chat.completions.create(
+        model=MODEL,
+        messages=messages,
+        max_completion_tokens=150,
+    )
+    usage = response.usage
+    result = {
+        "prompt_tokens": usage.prompt_tokens,
+        "completion_tokens": usage.completion_tokens,
+        "cached_tokens": 0,
+    }
+    if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
+        details = usage.prompt_tokens_details
+        if hasattr(details, "cached_tokens"):
+            result["cached_tokens"] = details.cached_tokens or 0
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Sample transcripts (different content, same function)
+# ---------------------------------------------------------------------------
+
+TRANSCRIPT_A = """Speaker 0: Good morning everyone. Let's start our weekly product sync.
+
+Speaker 1: Sure. I've been working on the new onboarding flow. We reduced the number of screens from 7 to 4, and early testing shows a 15% improvement in completion rate.
+
+Speaker 0: That's great progress. When do you think we can ship it?
+
+Speaker 1: I think we can have it ready by next Friday. I just need to finish the animations and get design sign-off from Sarah.
+
+Speaker 0: Perfect. Let me know if you need anything. Also, don't forget to update the analytics events before you ship.
+
+Speaker 2: I wanted to bring up the API latency issue. We're seeing p99 spikes above 2 seconds on the search endpoint. I've identified the root cause - it's the new full-text search query that's not using the index properly.
+
+Speaker 0: That sounds urgent. Can you fix it today?
+
+Speaker 2: I already have a fix ready. Just need to run the migration on staging first. Should be deployed by end of day.
+
+Speaker 0: Good. Remind me to check the latency dashboard tomorrow morning."""
+
+TRANSCRIPT_B = """Speaker 0: Hey, thanks for meeting with me about the budget.
+
+Speaker 1: Of course. Let me walk you through the Q2 projections. We're looking at a 12% increase in infrastructure costs due to the new region expansion.
+
+Speaker 0: That's higher than expected. Can we optimize anywhere?
+
+Speaker 1: Yes, I've identified three areas. First, we can switch to reserved instances for the database cluster - that saves about 30%. Second, we can implement better caching on the API layer. Third, we should audit our unused resources.
+
+Speaker 0: Let's do the reserved instances first since that's the biggest savings. Can you prepare a proposal by Wednesday?
+
+Speaker 1: Sure, I'll have it ready. I'll also include a comparison of different commitment terms.
+
+Speaker 0: Great. Also, remind me to schedule a meeting with the finance team next week to discuss the annual budget review."""
+
+TRANSCRIPT_C = """Speaker 0: I just got back from the dentist. They said I need to come back in two weeks for a follow-up.
+
+Speaker 1: Oh no, is everything okay?
+
+Speaker 0: Yeah, just a routine filling. But I need to remember to call the insurance company to check if the procedure is covered before I go back.
+
+Speaker 1: You should also ask about the pre-authorization process. Sometimes they need 48 hours notice.
+
+Speaker 0: Good point. I'll add that to my list. Also, remind me to pick up the prescription from the pharmacy on the way home tomorrow."""
+
+
+class TestSameUserSameConversation:
+    """Test intra-conversation caching: same user calls structure + action_items on the same transcript.
+
+    In production, each conversation triggers two sequential LLM calls (get_transcript_structure
+    then extract_action_items). Each call starts with its own static instruction prefix, so the
+    two calls do not share a cacheable prefix with each other; the savings come from repeated
+    calls that reuse the same prefix.
+
+    This test covers the simplest case: calling the SAME function twice with identical messages
+    should produce a near-complete cache hit (nearly all prompt tokens cached).
+    """
+
+    def test_same_function_same_transcript_full_cache(self, client):
+        """Two identical calls — the second should cache nearly all prompt tokens."""
+        msgs = [
+            {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS},
+            {
+                "role": "system",
+                "content": f"The content language is en. Use the same language en for your response.\n\nContent:\n{TRANSCRIPT_A}",
+            },
+        ]
+
+        print("\n === Same user, same conversation (identical calls) ===")
+        r1 = _call_and_get_cache_info(client, msgs)
+        print(f" Call 1 (prime): prompt={r1['prompt_tokens']}, cached={r1['cached_tokens']}")
+        time.sleep(1)
+
+        r2 = _call_and_get_cache_info(client, msgs)
+        print(f" Call 2 (repeat): prompt={r2['prompt_tokens']}, cached={r2['cached_tokens']}")
+
+        if r2["cached_tokens"] > 0:
+            pct = r2["cached_tokens"] / r2["prompt_tokens"] * 100
+            print(
+                f"\n ✅ SAME-CONVERSATION CACHE HIT: {r2['cached_tokens']}/{r2['prompt_tokens']} tokens ({pct:.1f}%)"
+            )
+        else:
+            print("\n ⚠️ No cache hit on identical repeat (may need warm-up)")
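+
+
+def _cache_hit_ratio(usage_info: dict) -> float:
+    """Illustrative sketch, not exercised by the tests in this module: reduces the usage
+    dict returned by _call_and_get_cache_info to a single cache-hit fraction. The helper
+    name is an assumption added for clarity, not production code."""
+    prompt = usage_info.get("prompt_tokens", 0)
+    cached = usage_info.get("cached_tokens", 0)
+    return cached / prompt if prompt else 0.0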
+
+
+class TestSameUserCrossConversation:
+    """Test cross-conversation caching for the same user.
+
+    Same language/timezone, different transcripts. The static instruction prefix
+    should be cached after the first call. This is the primary cost-saving scenario:
+    a single user processes many conversations per day.
+    """
+
+    def test_same_language_different_transcripts(self, client):
+        """Same user (en) processes 3 different conversations — instruction prefix should cache."""
+        lang = "en"
+        transcripts = [TRANSCRIPT_A, TRANSCRIPT_B, TRANSCRIPT_C]
+        results = []
+
+        print("\n === Same user (en), different conversations ===")
+        for i, transcript in enumerate(transcripts):
+            msgs = [
+                {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS},
+                {
+                    "role": "system",
+                    "content": f"The content language is {lang}. Use the same language {lang} for your response.\n\nContent:\n{transcript}",
+                },
+            ]
+            result = _call_and_get_cache_info(client, msgs)
+            results.append(result)
+            print(f" Call {i+1}: prompt={result['prompt_tokens']}, cached={result['cached_tokens']}")
+            if i < len(transcripts) - 1:
+                time.sleep(1)
+
+        later_cached = results[1]["cached_tokens"] + results[2]["cached_tokens"]
+        later_prompt = results[1]["prompt_tokens"] + results[2]["prompt_tokens"]
+        if later_cached > 0:
+            pct = later_cached / later_prompt * 100
+            print(f"\n ✅ CROSS-CONVERSATION CACHE HIT: {later_cached} cached tokens ({pct:.1f}% of later calls)")
+        else:
+            print("\n ⚠️ No cache hits (may need warm-up)")
+
+
+class TestCrossUserCaching:
+    """Test cross-user caching with different languages.
+
+    Since {language_code} is now in the context message (not in the instruction prefix),
+    the static instruction prefix should be identical across ALL users regardless of language.
+    This is the key improvement from moving {language_code} out of instructions_text.
+    """
+
+    def test_different_languages_share_instruction_cache(self, client):
+        """Users with different languages should still get instruction prefix cache hits."""
+        # Simulate 3 users with different languages processing different conversations
+        user_calls = [
+            ("en", TRANSCRIPT_A, "English user"),
+            ("es", TRANSCRIPT_B, "Spanish user"),
+            ("ja", TRANSCRIPT_C, "Japanese user"),
+        ]
+        results = []
+
+        print("\n === Cross-user: different languages, different conversations ===")
+        for i, (lang, transcript, label) in enumerate(user_calls):
+            msgs = [
+                {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS},
+                {
+                    "role": "system",
+                    "content": f"The content language is {lang}. Use the same language {lang} for your response.\n\nContent:\n{transcript}",
+                },
+            ]
+            result = _call_and_get_cache_info(client, msgs)
+            results.append(result)
+            print(f" Call {i+1} ({label}): prompt={result['prompt_tokens']}, cached={result['cached_tokens']}")
+            if i < len(user_calls) - 1:
+                time.sleep(1)
+
+        later_cached = results[1]["cached_tokens"] + results[2]["cached_tokens"]
+        later_prompt = results[1]["prompt_tokens"] + results[2]["prompt_tokens"]
+        if later_cached > 0:
+            pct = later_cached / later_prompt * 100
+            print(f"\n ✅ CROSS-USER CACHE HIT: {later_cached} cached tokens ({pct:.1f}% of later calls)")
+            print(" Instruction prefix is shared across languages!")
+        else:
+            print("\n ⚠️ No cross-user cache hits (may need warm-up)")
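+
+    @staticmethod
+    def _prompts_share_instruction_prefix(msgs_a: list, msgs_b: list) -> bool:
+        """Illustrative sketch, not called by the tests here (the helper name is an assumption):
+        cross-user cache hits only require the first system message — the static instruction
+        prefix — to be byte-identical; the per-user context message may differ freely."""
+        return msgs_a[0] == msgs_b[0]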
+ """ + languages = ["en", "es", "ja"] + transcripts = [TRANSCRIPT_A, TRANSCRIPT_B, TRANSCRIPT_C] + + # --- New approach: language in context message (static prefix) --- + print("\n === A: Language in context (static instruction prefix) ===") + static_results = [] + for i, (lang, transcript) in enumerate(zip(languages, transcripts)): + msgs = [ + {"role": "system", "content": ACTION_ITEMS_INSTRUCTIONS}, + { + "role": "system", + "content": f"The content language is {lang}. Use the same language {lang} for your response.\n\nContent:\n{transcript}", + }, + ] + result = _call_and_get_cache_info(client, msgs) + static_results.append(result) + print(f" Call {i+1} (lang={lang}): prompt={result['prompt_tokens']}, cached={result['cached_tokens']}") + if i < len(languages) - 1: + time.sleep(1) + + time.sleep(3) + + # --- Old approach: language baked into instructions (dynamic prefix) --- + print("\n === B: Language in instructions (dynamic prefix — old bug) ===") + dynamic_results = [] + for i, (lang, transcript) in enumerate(zip(languages, transcripts)): + # Simulate old code: language_code in instructions + dynamic_instructions = ( + f"You are an expert action item extractor.\n\nThe content language is {lang}. Use the same language {lang} for your response.\n\n" + + ACTION_ITEMS_INSTRUCTIONS[ + len( + "You are an expert action item extractor. Your sole purpose is to identify and extract actionable tasks from the provided content.\n\n" + ) : + ] + ) + msgs = [ + {"role": "system", "content": dynamic_instructions}, + {"role": "system", "content": f"Content:\n{transcript}"}, + ] + result = _call_and_get_cache_info(client, msgs) + dynamic_results.append(result) + print(f" Call {i+1} (lang={lang}): prompt={result['prompt_tokens']}, cached={result['cached_tokens']}") + if i < len(languages) - 1: + time.sleep(1) + + # --- Summary --- + static_cached = sum(r["cached_tokens"] for r in static_results) + static_prompt = sum(r["prompt_tokens"] for r in static_results) + dynamic_cached = sum(r["cached_tokens"] for r in dynamic_results) + dynamic_prompt = sum(r["prompt_tokens"] for r in dynamic_results) + + print(f"\n === RESULTS ===") + print( + f" Static prefix (new): {static_cached}/{static_prompt} cached ({static_cached/max(static_prompt,1)*100:.1f}%)" + ) + print( + f" Dynamic prefix (old): {dynamic_cached}/{dynamic_prompt} cached ({dynamic_cached/max(dynamic_prompt,1)*100:.1f}%)" + ) + + if static_cached > dynamic_cached: + print(f" ✅ Static prefix produces {static_cached - dynamic_cached} MORE cached tokens across languages") + elif static_cached == dynamic_cached == 0: + print(" ⚠️ No cache hits on either (may need warm-up)") + else: + print(" ⚠️ Unexpected: dynamic prefix matched or beat static prefix") diff --git a/backend/tests/unit/test_prompt_caching.py b/backend/tests/unit/test_prompt_caching.py index 4eff88655d..b728c472c9 100644 --- a/backend/tests/unit/test_prompt_caching.py +++ b/backend/tests/unit/test_prompt_caching.py @@ -2,11 +2,13 @@ Verifies that: 1. _build_conversation_context() produces deterministic, identical output for the same inputs -2. get_transcript_structure() and extract_action_items() use a shared context prefix - as the first system message to enable OpenAI prompt caching +2. get_transcript_structure() and extract_action_items() use conversation context + as the second system message (after static instructions) to enable OpenAI prompt caching 3. 
Calendar context is unified (includes meeting_link in both functions) """ +import inspect +import re import sys from datetime import datetime, timezone from unittest.mock import MagicMock @@ -17,7 +19,11 @@ sys.modules.setdefault("utils.llm.clients", _mock_clients) from models.conversation import CalendarMeetingContext, MeetingParticipant, ConversationPhoto -from utils.llm.conversation_processing import _build_conversation_context +from utils.llm.conversation_processing import ( + _build_conversation_context, + extract_action_items, + get_transcript_structure, +) class TestBuildConversationContext: @@ -170,3 +176,73 @@ def test_participant_without_name(self): ) result = _build_conversation_context("test", None, calendar) assert "unknown@co.com" in result + + +class TestPromptMessageOrdering: + """Tests that static instructions come before dynamic content in prompt templates. + + OpenAI prompt caching requires static content as a prefix for cross-conversation + cache hits. These tests verify the message order is [instructions, context] not + [context, instructions]. + """ + + def _get_from_messages_calls(self, func): + """Extract ChatPromptTemplate.from_messages call patterns from function source.""" + source = inspect.getsource(func) + return re.findall(r'from_messages\(\[(.*?)\]\)', source, re.DOTALL) + + def test_get_transcript_structure_instructions_first(self): + """Static instructions must be the first system message for cross-conversation caching.""" + calls = self._get_from_messages_calls(get_transcript_structure) + assert len(calls) == 1, "Expected exactly one from_messages call" + args = calls[0].strip() + # instructions_text should come before context_message + instructions_pos = args.index('instructions_text') + context_pos = args.index('context_message') + assert instructions_pos < context_pos, "instructions_text must come before context_message" + + def test_extract_action_items_instructions_first(self): + """Static instructions must be the first system message for cross-conversation caching.""" + calls = self._get_from_messages_calls(extract_action_items) + assert len(calls) == 1, "Expected exactly one from_messages call" + args = calls[0].strip() + instructions_pos = args.index('instructions_text') + context_pos = args.index('context_message') + assert instructions_pos < context_pos, "instructions_text must come before context_message" + + def test_both_functions_use_two_system_messages(self): + """Both functions must use exactly two system messages.""" + for func in [get_transcript_structure, extract_action_items]: + calls = self._get_from_messages_calls(func) + assert len(calls) == 1, f"{func.__name__}: expected one from_messages call" + # Count 'system' occurrences in the call + system_count = calls[0].count("'system'") + assert system_count == 2, f"{func.__name__}: expected 2 system messages, got {system_count}" + + def test_existing_items_context_not_in_instructions(self): + """existing_items_context must be in the context message, not the instructions.""" + source = inspect.getsource(extract_action_items) + # Find the instructions_text definition + instructions_match = re.search(r"instructions_text\s*=\s*'''(.*?)'''", source, re.DOTALL) + assert instructions_match, "Could not find instructions_text definition" + instructions_content = instructions_match.group(1) + assert ( + '{existing_items_context}' not in instructions_content + ), "existing_items_context should not be in instructions_text (breaks static prefix caching)" + # Verify it IS in the context_message + 
context_match = re.search(r"context_message\s*=\s*['\"](.+?)['\"]", source) + assert context_match, "Could not find context_message definition" + assert 'existing_items_context' in context_match.group(1), "existing_items_context should be in context_message" + + def test_language_code_not_in_instructions(self): + """language_code must be in the context message, not the instructions prefix.""" + source = inspect.getsource(extract_action_items) + instructions_match = re.search(r"instructions_text\s*=\s*'''(.*?)'''", source, re.DOTALL) + assert instructions_match, "Could not find instructions_text definition" + instructions_content = instructions_match.group(1) + assert ( + '{language_code}' not in instructions_content + ), "language_code should not be in instructions_text (breaks static prefix caching for non-English)" + context_match = re.search(r"context_message\s*=\s*['\"](.+?)['\"]", source) + assert context_match, "Could not find context_message definition" + assert 'language_code' in context_match.group(1), "language_code should be in context_message" diff --git a/backend/utils/llm/conversation_processing.py b/backend/utils/llm/conversation_processing.py index f9e674c6a3..29b4fb6123 100644 --- a/backend/utils/llm/conversation_processing.py +++ b/backend/utils/llm/conversation_processing.py @@ -243,8 +243,8 @@ def _build_conversation_context( """Build the conversation context string shared across LLM prompts. Produces a deterministic string from transcript, photos, and calendar context. - When used as the first system message in a prompt, enables OpenAI prompt caching - across sequential calls (e.g. structure + action items) that share the same content. + Used as the second system message (after static instructions) so that the static + instruction prefix enables cross-conversation OpenAI prompt caching. Returns: Formatted context string, or empty string if no content provided. @@ -322,14 +322,10 @@ def extract_action_items( items_list ) - # First system message: shared conversation context (enables OpenAI prompt caching) - context_message = 'Content:\n{conversation_context}' - - # Second system message: task-specific instructions + # First system message: task-specific instructions (static prefix enables cross-conversation caching) + # NOTE: {language_code} is in the context message, not here, to keep this prefix fully static across all languages. instructions_text = '''You are an expert action item extractor. Your sole purpose is to identify and extract actionable tasks from the provided content. - The content language is {language_code}. Use the same language {language_code} for your response. 
- EXPLICIT TASK/REMINDER REQUESTS (HIGHEST PRIORITY) When the primary user OR someone speaking to them uses these patterns, ALWAYS extract the task: @@ -358,10 +354,10 @@ def extract_action_items( - Use participant names in ALL action items (e.g., "Follow up with Sarah" NOT "Follow up with Speaker 0") - Reference the meeting title/context when relevant to the action item - Consider the scheduled meeting time and duration when extracting due dates - - If you cannot confidently match a speaker to a name, use the action description without speaker references{existing_items_context} + - If you cannot confidently match a speaker to a name, use the action description without speaker references CRITICAL DEDUPLICATION RULES (Check BEFORE extracting): - • DO NOT extract action items that are >95% similar to existing ones listed above + • DO NOT extract action items that are >95% similar to existing ones in the content • Check both the description AND the due date/timeframe • Consider semantic similarity, not just exact word matches • Examples of what counts as DUPLICATES (DO NOT extract): @@ -549,7 +545,9 @@ def extract_action_items( ).strip() action_items_parser = PydanticOutputParser(pydantic_object=ActionItemsExtraction) - prompt = ChatPromptTemplate.from_messages([('system', context_message), ('system', instructions_text)]) + # Second system message: conversation context + existing items (dynamic, per-conversation) + context_message = 'The content language is {language_code}. Use the same language {language_code} for your response.\n\nContent:\n{conversation_context}{existing_items_context}' + prompt = ChatPromptTemplate.from_messages([('system', instructions_text), ('system', context_message)]) chain = prompt | llm_medium_experiment | action_items_parser try: @@ -589,10 +587,7 @@ def get_transcript_structure( if not conversation_context: return Structured() # Should be caught by discard logic, but as a safeguard. - # First system message: shared conversation context (enables OpenAI prompt caching) - context_message = 'Content:\n{conversation_context}' - - # Second system message: task-specific instructions + # First system message: task-specific instructions (static prefix enables cross-conversation caching) instructions_text = '''You are an expert content analyzer. Your task is to analyze the provided content (which could be a transcript, a series of photo descriptions from a wearable camera, or both) and provide structure and clarity. The content language is {language_code}. Use the same language {language_code} for your response. @@ -638,7 +633,9 @@ def get_transcript_structure( ' ', '' ).strip() - prompt = ChatPromptTemplate.from_messages([('system', context_message), ('system', instructions_text)]) + # Second system message: conversation context (dynamic, per-conversation) + context_message = 'Content:\n{conversation_context}' + prompt = ChatPromptTemplate.from_messages([('system', instructions_text), ('system', context_message)]) chain = prompt | llm_medium_experiment | parser response = chain.invoke(