From d4904100f7185ca3b27c74ee28babaf40bbee61d Mon Sep 17 00:00:00 2001 From: Vignesh Subbiah Date: Mon, 15 Jun 2026 13:51:29 -0700 Subject: [PATCH] hooks: ship each tool_use once per turn (fix Stop-event duplicate counts) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Claude Code (and Codex) fire multiple Stop events per user turn — one per agent turn. The Stop handler rebuilds the exchange window from the persisted audit log, resetting on each UserPromptSubmit and appending every event after. Commit 16d1ffe removed the post-send prune of already-shipped events (it had broken the get_recent_user_prompts_for_session lookback by wiping prior prompts). With the prune gone, every Stop re-shipped ALL PostToolUse events accumulated since the last UserPromptSubmit, so a single file edit was counted 1 + 2 + ... + N times across N Stops — the root cause of inflated policy-match counts (one edit seen 14-35x). Fix — restore the prune safely (mark, don't delete): - claude-code: skip PostToolUse audit entries already flagged SHIPPED_FLAG when building the window; after a successful send, stamp the shipped tool entries (and the turn's UserPromptSubmit anchor) and persist. Entries stay in the log so the user-prompt lookback keeps working. A tool-free conversation turn still ships exactly once via the prompt anchor. - codex: ships tool_uses from the transcript per turn; mark the turn's UserPromptSubmit anchor shipped after a successful send and skip resend. - cursor: groups by generation_id; mark the generation's stop entry shipped and skip if a stop fires twice for the same generation. _persist_*_flags re-read the log before stamping to avoid clobbering entries a concurrent hook appended. Tests: new test_stop_dedup.py in each hook dir reproduces the bug (multiple Stops in one turn) and asserts each tool_use ships exactly once, plus that the get_recent_user_prompts_for_session lookback still returns prior prompts (what 16d1ffe protected). Verified red without the fix, green with it. Co-Authored-By: Claude Opus 4.8 (1M context) --- claude-code/hooks/test_stop_dedup.py | 232 +++++++++++++++++++++++++++ claude-code/hooks/unbound.py | 92 ++++++++++- codex/hooks/test_stop_dedup.py | 115 +++++++++++++ codex/hooks/unbound.py | 53 +++++- cursor/test_stop_dedup.py | 103 ++++++++++++ cursor/unbound.py | 53 ++++-- 6 files changed, 628 insertions(+), 20 deletions(-) create mode 100644 claude-code/hooks/test_stop_dedup.py create mode 100644 codex/hooks/test_stop_dedup.py create mode 100644 cursor/test_stop_dedup.py diff --git a/claude-code/hooks/test_stop_dedup.py b/claude-code/hooks/test_stop_dedup.py new file mode 100644 index 00000000..f90f8f45 --- /dev/null +++ b/claude-code/hooks/test_stop_dedup.py @@ -0,0 +1,232 @@ +""" +Regression tests for the Stop-event duplicate-count bug in the claude-code hook. + +Background +---------- +Claude Code fires multiple `Stop` events per user turn (one per agent turn). +`process_stop_event` rebuilds the exchange window from the persisted audit log: +it resets the window on each `UserPromptSubmit` and appends every event after. +Commit 16d1ffe removed the post-send prune of already-shipped events (to fix a +`get_recent_user_prompts_for_session` lookback regression). With the prune gone, +every Stop re-shipped ALL PostToolUse events accumulated since the last +UserPromptSubmit, so one file edit got counted 1 + 2 + ... + N times across N +Stops. + +These tests pin both halves of the contract: + - each PostToolUse tool_use is shipped exactly once across multiple Stops + (idempotent), and + - the user-prompt lookback still returns prior prompts (what 16d1ffe protected). +""" + +import json +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +import unbound + + +def _entry(session_id, hook_event_name, timestamp, **event_fields): + event = {'session_id': session_id, 'hook_event_name': hook_event_name} + event.update(event_fields) + return {'timestamp': timestamp, 'session_id': session_id, 'event': event} + + +class StopDedupTestBase(unittest.TestCase): + def setUp(self): + self.tmp = Path(tempfile.mkdtemp()) + self.audit_log = self.tmp / "agent-audit.log" + self._audit_patcher = patch.object(unbound, "AUDIT_LOG", self.audit_log) + self._audit_patcher.start() + self.addCleanup(self._audit_patcher.stop) + + # Capture every exchange handed to send_to_api; pretend the send + # succeeded so the shipped-marker path runs. + self.sent_exchanges = [] + + def _fake_send(exchange, api_key): + self.sent_exchanges.append(exchange) + return True + + self._send_patcher = patch.object(unbound, "send_to_api", _fake_send) + self._send_patcher.start() + self.addCleanup(self._send_patcher.stop) + + # Identity probe shells out / reads config; stub it. + self._identity_patcher = patch.object( + unbound, "build_account_identity", lambda probe=False: {} + ) + self._identity_patcher.start() + self.addCleanup(self._identity_patcher.stop) + + def _write_log(self, entries): + unbound.save_logs(entries) + + def _all_shipped_tool_paths(self): + """Flatten every tool_use file_path shipped across all sends.""" + paths = [] + for exchange in self.sent_exchanges: + for msg in exchange.get('messages', []): + for tu in msg.get('tool_use', []) or []: + paths.append(tu.get('tool_input', {}).get('file_path')) + return paths + + +class TestStopEventIdempotent(StopDedupTestBase): + """Multiple Stops in one turn must ship each tool_use exactly once.""" + + def _turn_with_two_edits(self): + sid = "sess-1" + return [ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', + prompt='please edit two files'), + _entry(sid, 'PostToolUse', '2026-06-15T00:00:01Z', + tool_name='Edit', tool_input={'file_path': '/a.py'}, + tool_response={'ok': True}), + _entry(sid, 'PostToolUse', '2026-06-15T00:00:02Z', + tool_name='Edit', tool_input={'file_path': '/b.py'}, + tool_response={'ok': True}), + ] + + def _stop_event(self, sid="sess-1", msg="done"): + return { + 'session_id': sid, + 'hook_event_name': 'Stop', + 'transcript_path': 'undefined', + 'last_assistant_message': msg, + } + + def test_single_stop_ships_each_tool_once(self): + self._write_log(self._turn_with_two_edits()) + + unbound.process_stop_event(self._stop_event(), api_key='k') + + paths = self._all_shipped_tool_paths() + self.assertEqual(sorted(paths), ['/a.py', '/b.py']) + + def test_repeated_stops_do_not_reship(self): + # The bug: 2nd/3rd Stop in the same turn re-ship the same tool_uses. + self._write_log(self._turn_with_two_edits()) + + for _ in range(3): + unbound.process_stop_event(self._stop_event(), api_key='k') + + paths = self._all_shipped_tool_paths() + # Exactly one tool_use per edited file across ALL Stops — no inflation. + self.assertEqual(sorted(paths), ['/a.py', '/b.py']) + + def test_new_tool_after_first_stop_is_shipped_once(self): + # First Stop ships /a.py. Then a new edit lands and a second Stop fires: + # /b.py must ship, /a.py must NOT ship again. + sid = "sess-1" + self._write_log([ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', prompt='go'), + _entry(sid, 'PostToolUse', '2026-06-15T00:00:01Z', + tool_name='Edit', tool_input={'file_path': '/a.py'}, + tool_response={'ok': True}), + ]) + unbound.process_stop_event(self._stop_event(), api_key='k') + + # A new PostToolUse arrives (as main() would append it), then Stop again. + unbound.append_to_audit_log( + _entry(sid, 'PostToolUse', '2026-06-15T00:00:03Z', + tool_name='Edit', tool_input={'file_path': '/b.py'}, + tool_response={'ok': True}) + ) + unbound.process_stop_event(self._stop_event(), api_key='k') + + paths = self._all_shipped_tool_paths() + self.assertEqual(sorted(paths), ['/a.py', '/b.py']) + + def test_stop_with_nothing_new_does_not_send(self): + # A turn with only a prompt and an already-shipped edit: a redundant + # Stop must not produce any send at all. + self._write_log(self._turn_with_two_edits()) + unbound.process_stop_event(self._stop_event(), api_key='k') + sends_after_first = len(self.sent_exchanges) + + unbound.process_stop_event(self._stop_event(), api_key='k') + self.assertEqual(len(self.sent_exchanges), sends_after_first) + + def test_tool_free_turn_ships_exactly_once(self): + # A pure-conversation turn (no PostToolUse) must still ship once — and + # only once — across multiple Stops. Guards against over-narrowing the + # send to tool-bearing turns only. + sid = "sess-1" + self._write_log([ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', + prompt='just answer, no tools'), + ]) + + for _ in range(3): + unbound.process_stop_event(self._stop_event(msg='here is the answer'), + api_key='k') + + self.assertEqual(len(self.sent_exchanges), 1) + + +class TestLookbackStillWorks(StopDedupTestBase): + """The thing 16d1ffe protected: prior user prompts remain in the log and are + returned by get_recent_user_prompts_for_session after Stops are processed.""" + + def test_prior_prompts_survive_stop_processing(self): + sid = "sess-1" + self._write_log([ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', prompt='first prompt'), + _entry(sid, 'PostToolUse', '2026-06-15T00:00:01Z', + tool_name='Edit', tool_input={'file_path': '/a.py'}, + tool_response={'ok': True}), + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:01:00Z', prompt='second prompt'), + _entry(sid, 'PostToolUse', '2026-06-15T00:01:01Z', + tool_name='Edit', tool_input={'file_path': '/b.py'}, + tool_response={'ok': True}), + ]) + + stop = { + 'session_id': sid, + 'hook_event_name': 'Stop', + 'transcript_path': 'undefined', + 'last_assistant_message': 'done', + } + # Process a couple of Stops, which is when the old code wiped the log. + unbound.process_stop_event(stop, api_key='k') + unbound.process_stop_event(stop, api_key='k') + + prompts = unbound.get_recent_user_prompts_for_session(sid, n=5) + self.assertEqual(prompts, ['first prompt', 'second prompt']) + + def test_shipped_marker_does_not_drop_postooluse_entries(self): + # Marking entries shipped must keep them in the log (size unchanged), + # only annotate them — otherwise rotation/lookback assumptions break. + sid = "sess-1" + self._write_log([ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', prompt='go'), + _entry(sid, 'PostToolUse', '2026-06-15T00:00:01Z', + tool_name='Edit', tool_input={'file_path': '/a.py'}, + tool_response={'ok': True}), + ]) + before = len(unbound.load_existing_logs()) + + stop = { + 'session_id': sid, + 'hook_event_name': 'Stop', + 'transcript_path': 'undefined', + 'last_assistant_message': 'done', + } + unbound.process_stop_event(stop, api_key='k') + + after_logs = unbound.load_existing_logs() + # Nothing is dropped — entries are annotated in place, not pruned. + self.assertEqual(len(after_logs), before) + # The PostToolUse entry is now flagged shipped. + flagged_tools = [ + l for l in after_logs + if l.get(unbound.SHIPPED_FLAG) + and l.get('event', {}).get('hook_event_name') == 'PostToolUse' + ] + self.assertEqual(len(flagged_tools), 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/claude-code/hooks/unbound.py b/claude-code/hooks/unbound.py index 331abbf3..8f5ae9ce 100644 --- a/claude-code/hooks/unbound.py +++ b/claude-code/hooks/unbound.py @@ -31,6 +31,13 @@ PRETOOL_USER_MESSAGES_LIMIT = 5 AUDIT_LOG_TOTAL_LIMIT = 100 +# Per-entry flag set on PostToolUse audit-log entries once their tool_use has +# been shipped to the proxy on a Stop event. Claude Code fires multiple Stop +# events per turn; this flag lets a later Stop skip already-shipped tool_uses +# instead of re-sending them, while the entries stay in the log so the +# user-prompt lookback (get_recent_user_prompts_for_session) keeps working. +SHIPPED_FLAG = 'unbound_shipped' + APPROVAL_TIMEOUT = 4 * 60 * 60 DISCOVERY_DEBOUNCE_SECONDS = 24 * 3600 @@ -1126,29 +1133,85 @@ def cleanup_old_logs(): save_logs(logs[-AUDIT_LOG_TOTAL_LIMIT:]) +def _event_name_of(log: Dict) -> Optional[str]: + if 'event' in log: + return log.get('event', {}).get('hook_event_name') + return log.get('hook_event_name') + + +def _log_identity(log: Dict) -> tuple: + """Stable identity for an audit-log entry, used to re-find the entries this + Stop shipped after a fresh re-read (a concurrent Stop may have appended new + lines since we loaded the log).""" + event = log.get('event', {}) if 'event' in log else log + return ( + log.get('timestamp'), + log.get('session_id') or event.get('session_id'), + event.get('hook_event_name'), + event.get('tool_name'), + ) + + +def _persist_shipped_flags(session_id: Optional[str], shipped_logs: List[Dict]) -> None: + """Re-read the audit log and stamp SHIPPED_FLAG on the entries that were + just shipped, then rewrite it. Re-reading (instead of saving our stale + in-memory copy) avoids clobbering entries a concurrent hook appended after + we loaded the log.""" + if not shipped_logs: + return + shipped_ids = {_log_identity(log) for log in shipped_logs} + fresh_logs = load_existing_logs() + changed = False + for log in fresh_logs: + if log.get(SHIPPED_FLAG): + continue + if _log_identity(log) in shipped_ids: + log[SHIPPED_FLAG] = True + changed = True + if changed: + save_logs(fresh_logs) + + def process_stop_event(event: Dict, api_key: str): session_id = event.get('session_id') transcript_path = event.get('transcript_path') last_assistant_message = event.get('last_assistant_message') logs = load_existing_logs() - + session_events = [] + # PostToolUse entries (the same dict objects held in `logs`) that this Stop + # is about to ship. Marked shipped only after send_to_api succeeds. + pending_tool_logs = [] + # The current turn's UserPromptSubmit entry. Doubles as the per-turn shipped + # anchor so a no-tool conversation turn ships exactly once, not once per Stop. + prompt_log = None current_conversation_started = False user_prompt_timestamp = None - + for log in logs: log_session_id = log.get('session_id') or log.get('event', {}).get('session_id') - + if log_session_id == session_id: - event_name = log.get('event', {}).get('hook_event_name') if 'event' in log else log.get('hook_event_name') - + event_name = _event_name_of(log) + if event_name == 'UserPromptSubmit': + # Reset the window on each new prompt. The anchor entries + # (UserPromptSubmit, SessionStart) stay in the log untouched so + # the user-prompt lookback keeps finding prior prompts. session_events = [log] + pending_tool_logs = [] + prompt_log = log current_conversation_started = True user_prompt_timestamp = log.get('timestamp') elif current_conversation_started: + # Skip PostToolUse entries already shipped by an earlier Stop in + # this same turn — re-shipping them is the duplicate-count bug. + if event_name == 'PostToolUse' and log.get(SHIPPED_FLAG): + continue session_events.append(log) + if event_name == 'PostToolUse': + pending_tool_logs.append(log) transcript_assistant_messages = [] transcript_usage = None @@ -1166,6 +1229,15 @@ def process_stop_event(event: Dict, api_key: str): # the cached session model is wrong). Fall back to the audit log otherwise. session_model = transcript_model or _extract_session_model(logs, session_id) or 'auto' + # Idempotency guard: Claude Code fires a Stop per agent turn, so a single + # user turn produces several Stop events. Re-sending already-shipped + # tool_uses is the root cause of inflated policy-match counts. Send only + # when this Stop has something new: an unshipped PostToolUse, or — for a + # tool-free conversation turn — a turn whose prompt anchor hasn't shipped yet. + turn_already_shipped = prompt_log is not None and prompt_log.get(SHIPPED_FLAG) + if not pending_tool_logs and turn_already_shipped: + return + exchange = build_llm_exchange( session_events, stop_assistant_message=last_assistant_message, @@ -1174,8 +1246,14 @@ def process_stop_event(event: Dict, api_key: str): usage=transcript_usage, ) - if exchange: - send_to_api(exchange, api_key) + if exchange and send_to_api(exchange, api_key): + # Persist the shipped markers so later Stops in this turn skip what was + # already sent. The entries stay in the log for the user-prompt lookback. + for log in pending_tool_logs: + log[SHIPPED_FLAG] = True + if prompt_log is not None: + prompt_log[SHIPPED_FLAG] = True + _persist_shipped_flags(session_id, pending_tool_logs + ([prompt_log] if prompt_log else [])) def get_api_key(): diff --git a/codex/hooks/test_stop_dedup.py b/codex/hooks/test_stop_dedup.py new file mode 100644 index 00000000..8ab1439d --- /dev/null +++ b/codex/hooks/test_stop_dedup.py @@ -0,0 +1,115 @@ +""" +Regression tests for the Stop-event duplicate-count bug in the codex hook. + +Like the claude-code hook, codex fires multiple `Stop` events per user turn and +`process_stop_event` rebuilds the exchange each time. Codex sources its +tool_uses from the transcript (scoped by the turn's user-prompt timestamp), so a +second Stop in the same turn re-ships the same tool_uses. Commit 16d1ffe removed +the post-send prune, so nothing stopped the resend. + +These tests pin the contract: + - a turn is shipped exactly once across multiple Stops (idempotent), and + - prior user prompts remain in the audit log for the lookback. +""" + +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +import unbound + + +def _entry(session_id, hook_event_name, timestamp, **event_fields): + event = {'session_id': session_id, 'hook_event_name': hook_event_name} + event.update(event_fields) + return {'timestamp': timestamp, 'session_id': session_id, 'event': event} + + +class CodexStopDedupBase(unittest.TestCase): + def setUp(self): + self.tmp = Path(tempfile.mkdtemp()) + self.audit_log = self.tmp / "agent-audit.log" + self._audit_patcher = patch.object(unbound, "AUDIT_LOG", self.audit_log) + self._audit_patcher.start() + self.addCleanup(self._audit_patcher.stop) + + self.sent_exchanges = [] + self._send_patcher = patch.object( + unbound, "send_to_api", + lambda exchange, api_key: self.sent_exchanges.append(exchange) or True, + ) + self._send_patcher.start() + self.addCleanup(self._send_patcher.stop) + + # Codex sources tool_uses from the transcript — stub it to a fixed set. + self._tools_patcher = patch.object( + unbound, "parse_codex_transcript_for_tools", + lambda path, ts=None: [ + {'type': 'PostToolUse', 'tool_name': 'apply_patch', + 'tool_input': {'file_path': '/a.py'}, 'tool_response': {}}, + ], + ) + self._tools_patcher.start() + self.addCleanup(self._tools_patcher.stop) + + self._usage_patcher = patch.object( + unbound, "parse_codex_transcript_for_usage", lambda path, ts=None: None + ) + self._usage_patcher.start() + self.addCleanup(self._usage_patcher.stop) + + def _stop(self, sid="sess-1"): + return { + 'session_id': sid, + 'hook_event_name': 'Stop', + 'transcript_path': '/transcript.jsonl', + 'last_assistant_message': 'done', + } + + +class TestCodexStopIdempotent(CodexStopDedupBase): + def test_repeated_stops_ship_turn_once(self): + sid = "sess-1" + unbound.save_logs([ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', prompt='go'), + ]) + + for _ in range(3): + unbound.process_stop_event(self._stop(), api_key='k') + + # The turn is shipped exactly once across all three Stops. + self.assertEqual(len(self.sent_exchanges), 1) + + def test_new_turn_ships_again(self): + sid = "sess-1" + unbound.save_logs([ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', prompt='first'), + ]) + unbound.process_stop_event(self._stop(), api_key='k') + + # A new prompt opens a new turn — it must ship. + unbound.append_to_audit_log( + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:01:00Z', prompt='second') + ) + unbound.process_stop_event(self._stop(), api_key='k') + + self.assertEqual(len(self.sent_exchanges), 2) + + +class TestCodexLookback(CodexStopDedupBase): + def test_prior_prompts_survive(self): + sid = "sess-1" + unbound.save_logs([ + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:00:00Z', prompt='first'), + _entry(sid, 'UserPromptSubmit', '2026-06-15T00:01:00Z', prompt='second'), + ]) + unbound.process_stop_event(self._stop(), api_key='k') + unbound.process_stop_event(self._stop(), api_key='k') + + prompts = unbound.get_recent_user_prompts_for_session(sid, n=5) + self.assertEqual(prompts, ['first', 'second']) + + +if __name__ == '__main__': + unittest.main() diff --git a/codex/hooks/unbound.py b/codex/hooks/unbound.py index a1c99b71..89be9ebb 100644 --- a/codex/hooks/unbound.py +++ b/codex/hooks/unbound.py @@ -32,6 +32,14 @@ PRETOOL_USER_MESSAGES_LIMIT = 5 AUDIT_LOG_TOTAL_LIMIT = 100 +# Per-entry flag set on the turn's UserPromptSubmit audit-log entry once that +# turn's exchange has been shipped to the proxy on a Stop event. Codex fires +# multiple Stop events per turn and rebuilds the tool_uses from the transcript +# each time; this flag lets a later Stop skip an already-shipped turn instead +# of re-sending the same tool_uses, while the entry stays in the log so the +# user-prompt lookback keeps working. +SHIPPED_FLAG = 'unbound_shipped' + APPROVAL_TIMEOUT = 4 * 60 * 60 DISCOVERY_DEBOUNCE_SECONDS = 24 * 3600 @@ -1067,6 +1075,34 @@ def parse_codex_transcript_for_usage(transcript_path: str, user_prompt_timestamp } +def _log_identity(log: Dict) -> tuple: + """Stable identity for an audit-log entry, used to re-find the entry this + Stop shipped after a fresh re-read (a concurrent Stop may have appended new + lines since we loaded the log).""" + event = log.get('event', {}) if 'event' in log else log + return ( + log.get('timestamp'), + log.get('session_id') or event.get('session_id'), + event.get('hook_event_name'), + ) + + +def _persist_shipped_flag(prompt_log: Dict) -> None: + """Re-read the audit log and stamp SHIPPED_FLAG on the turn's + UserPromptSubmit entry, then rewrite it. Re-reading (instead of saving a + stale in-memory copy) avoids clobbering entries a concurrent hook appended + after we loaded the log.""" + target_id = _log_identity(prompt_log) + fresh_logs = load_existing_logs() + changed = False + for log in fresh_logs: + if not log.get(SHIPPED_FLAG) and _log_identity(log) == target_id: + log[SHIPPED_FLAG] = True + changed = True + if changed: + save_logs(fresh_logs) + + def process_stop_event(event: Dict, api_key: str): session_id = event.get('session_id') transcript_path = event.get('transcript_path') @@ -1074,10 +1110,11 @@ def process_stop_event(event: Dict, api_key: str): logs = load_existing_logs() - # Find the UserPromptSubmit for this session + # Find the UserPromptSubmit for this session (latest wins) user_prompt = None user_prompt_timestamp = None permission_mode = None + prompt_log = None for log in logs: log_session_id = log.get('session_id') or log.get('event', {}).get('session_id') @@ -1090,10 +1127,18 @@ def process_stop_event(event: Dict, api_key: str): user_prompt = log_event.get('prompt') user_prompt_timestamp = log.get('timestamp') permission_mode = log_event.get('permission_mode', 'default') + prompt_log = log if not user_prompt: return + # Idempotency guard: Codex fires a Stop per agent turn, so a single user + # turn produces several Stop events. Once this turn's exchange has shipped, + # re-sending would re-count the same tool_uses (root cause of inflated + # policy-match counts), so skip it. + if prompt_log is not None and prompt_log.get(SHIPPED_FLAG): + return + messages = [{'role': 'user', 'content': user_prompt}] # Parse tool uses from Codex transcript (function_call/function_call_output pairs) @@ -1118,7 +1163,11 @@ def process_stop_event(event: Dict, api_key: str): if usage: exchange['usage'] = usage - send_to_api(exchange, api_key) + if send_to_api(exchange, api_key) and prompt_log is not None: + # Persist the shipped marker so later Stops in this turn skip the send. + # The entry stays in the log for the user-prompt lookback. + prompt_log[SHIPPED_FLAG] = True + _persist_shipped_flag(prompt_log) _GATEWAY_URL_RE = re.compile(r'^https?://[A-Za-z0-9._\-]+(:\d+)?(/[A-Za-z0-9._/\-]*)?$') diff --git a/cursor/test_stop_dedup.py b/cursor/test_stop_dedup.py new file mode 100644 index 00000000..2500f7b4 --- /dev/null +++ b/cursor/test_stop_dedup.py @@ -0,0 +1,103 @@ +""" +Regression tests for the Stop-event duplicate-count guard in the cursor hook. + +Cursor groups audit events by generation_id and process_stop_event ships only +the generation matching the incoming stop. That isolates turns better than the +claude-code hook, but if a generation_id ever receives more than one `stop` +event the exchange (and its tool_uses) would be re-shipped — the same +duplicate-count class fixed in 16d1ffe's wake. + +These tests pin the contract: + - a generation is shipped exactly once even if `stop` fires twice for it, and + - prior user prompts remain in the audit log for the lookback. +""" + +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +import unbound + + +def _entry(conversation_id, generation_id, hook_event_name, timestamp, **fields): + event = { + 'conversation_id': conversation_id, + 'generation_id': generation_id, + 'hook_event_name': hook_event_name, + } + event.update(fields) + return {'timestamp': timestamp, 'event': event} + + +class CursorStopDedupBase(unittest.TestCase): + def setUp(self): + self.tmp = Path(tempfile.mkdtemp()) + self.audit_log = self.tmp / "agent-audit.log" + self._audit_patcher = patch.object(unbound, "AUDIT_LOG", self.audit_log) + self._audit_patcher.start() + self.addCleanup(self._audit_patcher.stop) + + self.sent_exchanges = [] + self._send_patcher = patch.object( + unbound, "send_to_api", + lambda exchange, api_key=None: self.sent_exchanges.append(exchange) or True, + ) + self._send_patcher.start() + self.addCleanup(self._send_patcher.stop) + + self._serial_patcher = patch.object( + unbound, "_device_serial", lambda probe=False: None + ) + self._serial_patcher.start() + self.addCleanup(self._serial_patcher.stop) + + def _generation(self, conv="conv-1", gen="gen-1"): + return [ + _entry(conv, gen, 'beforeSubmitPrompt', '2026-06-15T00:00:00Z', prompt='go'), + _entry(conv, gen, 'afterFileEdit', '2026-06-15T00:00:01Z', + file_path='/a.py', edits=[{'old': 'x', 'new': 'y'}]), + _entry(conv, gen, 'afterAgentResponse', '2026-06-15T00:00:02Z', text='done'), + _entry(conv, gen, 'stop', '2026-06-15T00:00:03Z'), + ] + + +class TestCursorStopIdempotent(CursorStopDedupBase): + def test_repeated_stops_for_same_generation_ship_once(self): + unbound.save_logs(self._generation()) + + unbound.process_stop_event('gen-1', api_key='k') + unbound.process_stop_event('gen-1', api_key='k') + + self.assertEqual(len(self.sent_exchanges), 1) + + def test_new_generation_ships_again(self): + unbound.save_logs(self._generation(gen='gen-1')) + unbound.process_stop_event('gen-1', api_key='k') + + for entry in self._generation(gen='gen-2'): + unbound.append_to_audit_log(entry) + unbound.process_stop_event('gen-2', api_key='k') + + self.assertEqual(len(self.sent_exchanges), 2) + + +class TestCursorLookback(CursorStopDedupBase): + def test_prior_prompts_survive(self): + logs = self._generation(gen='gen-1') + logs += [ + _entry('conv-1', 'gen-2', 'beforeSubmitPrompt', '2026-06-15T00:01:00Z', + prompt='second'), + _entry('conv-1', 'gen-2', 'stop', '2026-06-15T00:01:01Z'), + ] + unbound.save_logs(logs) + + unbound.process_stop_event('gen-1', api_key='k') + unbound.process_stop_event('gen-1', api_key='k') + + prompts = unbound.get_recent_user_prompts_for_session('conv-1', 5) + self.assertEqual(prompts, ['go', 'second']) + + +if __name__ == '__main__': + unittest.main() diff --git a/cursor/unbound.py b/cursor/unbound.py index 6a455186..2e42bf04 100644 --- a/cursor/unbound.py +++ b/cursor/unbound.py @@ -79,6 +79,13 @@ PRETOOL_USER_MESSAGES_LIMIT = 5 AUDIT_LOG_TOTAL_LIMIT = 100 +# Per-entry flag set on a generation's stop audit-log entry once that +# generation's exchange has been shipped to the proxy. If a generation_id ever +# receives more than one stop event, this flag lets the later stop skip the +# resend instead of re-counting the same tool_uses, while the entries stay in +# the log so the user-prompt lookback keeps working. +SHIPPED_FLAG = 'unbound_shipped' + # Ensure log directory exists try: LOG_DIR.mkdir(parents=True, exist_ok=True) @@ -1154,28 +1161,52 @@ def cleanup_old_logs(): save_logs(logs[-AUDIT_LOG_TOTAL_LIMIT:]) +def _persist_generation_shipped(generation_id): + """Re-read the audit log and stamp SHIPPED_FLAG on this generation's stop + entries, then rewrite it. Re-reading (instead of saving a stale in-memory + copy) avoids clobbering entries a concurrent hook appended after we loaded + the log.""" + fresh_logs = load_existing_logs() + changed = False + for log in fresh_logs: + event = log.get('event', {}) + if (event.get('generation_id') == generation_id + and event.get('hook_event_name') == 'stop' + and not log.get(SHIPPED_FLAG)): + log[SHIPPED_FLAG] = True + changed = True + if changed: + save_logs(fresh_logs) + + def process_stop_event(generation_id, api_key=None): """Process stop event: convert to LLM format and send to API.""" logs = load_existing_logs() - + # Group events grouped = group_events_by_generation(logs) - + # Find and process the generation with stop event for conversation_id, generations in grouped.items(): if generation_id in generations: events = generations[generation_id] - + # Check if this generation has a stop event - has_stop = any( - log.get('event', {}).get('hook_event_name') == 'stop' - for log in events - ) - - if has_stop: + stop_logs = [ + log for log in events + if log.get('event', {}).get('hook_event_name') == 'stop' + ] + + if stop_logs: + # Idempotency guard: if this generation was already shipped by an + # earlier stop, re-sending would re-count the same tool_uses + # (root cause of inflated policy-match counts). Skip it. + if any(log.get(SHIPPED_FLAG) for log in stop_logs): + break + exchange = build_llm_exchange(events, api_key) - if exchange: - send_to_api(exchange, api_key) + if exchange and send_to_api(exchange, api_key): + _persist_generation_shipped(generation_id) break