From e933d30dd15bcf158e268c510f3da62cd66ca59f Mon Sep 17 00:00:00 2001 From: Ryan Orban Date: Thu, 5 Mar 2026 13:57:30 -0800 Subject: [PATCH 1/6] apply review findings: drop ID column, collapse dead staleness branch, add negative tests --- hooks/scripts/pre_compact.py | 38 +++++++----------------------------- hooks/scripts/test_hooks.py | 37 ++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/hooks/scripts/pre_compact.py b/hooks/scripts/pre_compact.py index 1450757..30c4d62 100644 --- a/hooks/scripts/pre_compact.py +++ b/hooks/scripts/pre_compact.py @@ -151,11 +151,11 @@ def build_reload_snippet(state: dict, ws: Path | None = None) -> str: artifacts = state.get("artifacts", {}) working = state.get("working_set", []) if working: - lines.append("| ID | Kind | Path |") - lines.append("|----|------|------|") + lines.append("| Kind | Path |") + lines.append("|------|------|") for aid in working: art = artifacts.get(aid, {}) - lines.append(f"| {aid} | {art.get('kind', '?')} | {art.get('path', '?')} |") + lines.append(f"| {art.get('kind', '?')} | {art.get('path', '?')} |") else: lines.append("_Empty._") @@ -172,54 +172,30 @@ def build_reload_snippet(state: dict, ws: Path | None = None) -> str: for entry in recently_read: lines.append(f"- `{entry.get('path', '?')}`") - # Git context + # Git context (branch only — experiment showed dirty artifacts list unused) git_ctx = state.get("git") if git_ctx: lines += ["", "## Git Context", ""] lines.append(f"- **Branch:** `{git_ctx.get('branch', '?')}`") - dirty = git_ctx.get("dirty_artifacts", []) - if dirty: - lines.append(f"- **Uncommitted working-set files:** {', '.join(f'`{p}`' for p in dirty)}") - else: - lines.append("- All working-set files are committed.") - - # Reference file snapshots if workspace is available - if ws is not None and (ws / "derived" / "file_snapshots.md").exists(): - lines += [ - "", - "## File Snapshots", - "", - "See 
`.agent-workspace/derived/file_snapshots.md` for first lines of working set files.", - ] lines += [ "", "## After Resuming", "", "1. Read `.agent-workspace/TASKLOG.md` for what happened recently", - "2. Read `.agent-workspace/derived/file_snapshots.md` for code context (if it exists)", - "3. Run `/relay sync` to verify and update objective, next_actions, and open_questions", - "4. Read `.agent-workspace/STATE.json` only if you need full artifact details", + "2. Run `/relay sync` to verify and update objective, next_actions, and open_questions", ] # Add staleness info - current_turn = state.get("turn_count", 0) - semantic_turn = state.get("semantic_turn", 0) - turns_stale = max(0, current_turn - semantic_turn) + turns_stale = max(0, state.get("turn_count", 0) - state.get("semantic_turn", 0)) stale_at_compaction = state.get("stale_at_compaction", False) - if stale_at_compaction: + if stale_at_compaction or turns_stale > 5: lines += [ "", f"**STATE WAS STALE AT COMPACTION ({turns_stale} turns).** " "The fields above may NOT reflect reality.", "Run `/relay sync` immediately to update workspace state.", ] - elif turns_stale > 5: - lines += [ - "", - f"**WARNING: Workspace state was {turns_stale} turns stale at compaction time.**", - "The fields above likely do NOT reflect reality. 
Run `/relay sync` immediately.", - ] else: lines += [ "", diff --git a/hooks/scripts/test_hooks.py b/hooks/scripts/test_hooks.py index 8eedd3f..cb003a2 100644 --- a/hooks/scripts/test_hooks.py +++ b/hooks/scripts/test_hooks.py @@ -326,7 +326,8 @@ def test_stale_warning(self, state): state["turn_count"] = 15 state["semantic_turn"] = 2 snippet = build_reload_snippet(state) - assert "13 turns stale" in snippet + assert "STATE WAS STALE" in snippet + assert "13 turns" in snippet def test_fresh_state_no_stale_warning(self, state): state["turn_count"] = 3 @@ -348,7 +349,7 @@ def test_recently_read_absent_when_empty(self, state): snippet = build_reload_snippet(state) assert "## Recently Consulted Files" not in snippet - def test_git_context_in_snippet(self, state): + def test_git_context_branch_only(self, state): state["git"] = { "branch": "main", "dirty_artifacts": ["src/app.py", "lib/utils.py"], @@ -357,9 +358,10 @@ def test_git_context_in_snippet(self, state): snippet = build_reload_snippet(state) assert "## Git Context" in snippet assert "`main`" in snippet - assert "`src/app.py`" in snippet + assert "`src/app.py`" not in snippet + assert "Uncommitted" not in snippet - def test_git_context_clean_in_snippet(self, state): + def test_git_context_no_committed_msg(self, state): state["git"] = { "branch": "develop", "dirty_artifacts": [], @@ -367,13 +369,38 @@ def test_git_context_clean_in_snippet(self, state): } snippet = build_reload_snippet(state) assert "## Git Context" in snippet - assert "All working-set files are committed" in snippet + assert "`develop`" in snippet + assert "All working-set files are committed" not in snippet def test_git_context_absent_when_none(self, state): state["git"] = None snippet = build_reload_snippet(state) assert "## Git Context" not in snippet + def test_dropped_sections_absent(self, state, workspace): + """Sections removed per experiment results should not appear in RELOAD.md.""" + state["git"] = { + "branch": "main", + 
"dirty_artifacts": ["src/app.py"], + "captured_at": "2026-03-01T12:05:00", + } + # Write file_snapshots.md to disk — should NOT be referenced + (workspace / "derived" / "file_snapshots.md").write_text("# File Snapshots\n") + snippet = build_reload_snippet(state, ws=workspace) + assert "file_snapshots.md" not in snippet + assert "File Snapshots" not in snippet + assert "Uncommitted" not in snippet + assert "dirty" not in snippet.lower() + + def test_working_set_table_has_no_id_column(self, state): + state["artifacts"] = { + "a_abc123": {"kind": "code", "path": "src/app.py"}, + } + state["working_set"] = ["a_abc123"] + snippet = build_reload_snippet(state) + assert "| Kind | Path |" in snippet + assert "a_abc123" not in snippet + # ── build_summary ───────────────────────────────────────── From 79f861770289a8b74c6a5a8afe52f91c8a9fa4a4 Mon Sep 17 00:00:00 2001 From: Ryan Orban Date: Thu, 5 Mar 2026 13:59:14 -0800 Subject: [PATCH 2/6] soften n=1 conclusions in experiment results, note simulated vs real compaction gap --- docs/plans/experiment-results.md | 139 +++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 docs/plans/experiment-results.md diff --git a/docs/plans/experiment-results.md b/docs/plans/experiment-results.md new file mode 100644 index 0000000..282e5d6 --- /dev/null +++ b/docs/plans/experiment-results.md @@ -0,0 +1,139 @@ +--- +title: Artifact tracking A/B experiment results +date: 2026-03-04 +--- + +# Artifact tracking A/B experiment results + +## Experiment design + +Two-phase experiment testing whether artifact tracking in RELOAD.md helps agents recover after compaction. + +### Phase 1: Baseline (no compaction) + +Both conditions implement a bookmark feature from scratch. Measures whether artifacts affect normal task performance. + +### Phase 2: Simulated post-compaction recovery + +Both conditions start from identical state (bookmarks done, 317 tests passing) with a RELOAD.md injected. 
Must implement a second feature (likes). Measures re-orientation behavior. + +## Setup + +- **Codebase:** `/tmp/relay-stress-test` (288 tests, 3,400 lines across models/services/routes/tests) +- **Model:** Sonnet (`claude --print --model sonnet --dangerously-skip-permissions`) +- **Output:** `stream-json` for tool call analysis +- **Condition A:** Full RELOAD.md (Working Set table, Recently Consulted Files, File Snapshots ref, dirty artifacts, 4 After Resuming steps) +- **Condition B:** Stripped RELOAD.md (only objective, hypothesis, next actions, history, branch, 2 After Resuming steps) + +## Phase 1 results: Baseline (bookmark feature, no compaction) + +| Metric | A (full) | B (stripped) | Delta | +|--------|----------|--------------|-------| +| Total tool calls | 48 | 42 | A +6 | +| Read calls | 23 | 21 | A +2 | +| Glob/Grep | 1 | 1 | 0 | +| Write calls | 4 | 4 | 0 | +| Edit calls | 10 | 7 | A +3 | +| Bash calls | 1 | 1 | 0 | +| Tests passing | 320 (288+32) | 314 (288+26) | A +6 | +| Read order | app -> models -> services -> routes -> tests -> utils | same | identical | +| Compaction triggered | No | No | — | + +**Phase 1 finding:** No meaningful difference in a single run. Both conditions explored identically. Differences (A +6 tests, +3 edits) are within expected LLM variance for n=1, though it's also possible richer context nudged A toward more thorough output. + +## Phase 2 results: Simulated post-compaction recovery (likes feature) + +Starting state: Bookmarks fully implemented, 317 tests passing. RELOAD.md placed in `.agent-workspace/derived/`. Prompt says "you were compacted, read RELOAD.md, continue with likes." + +### Re-orientation behavior + +| Metric | A (full) | B (stripped) | Delta | +|--------|----------|--------------|-------| +| Total tool calls | 38 | 34 | A +4 | +| Read calls | 10 | 12 | **B +2** | +| Glob calls | 1 | 0 | A +1 | +| Grep calls | 0 | 0 | 0 | +| Write calls | 4 | 4 | 0 | +| Edit calls | 10 | 5 | A +5 | +| Bash calls (incl. 
grep/ls) | 4 | 5 | **B +1** | +| **Reads before first edit** | **10** | **12** | **B +2** | +| First edited file | like.py | like.py | same | +| All tests pass | 349 | 353 | both pass | + +### Tool call sequences (first 15) + +**Condition A (full RELOAD.md):** +``` +1. Read RELOAD.md +2. Read bookmark.py ← from Working Set +3. Read bookmark_service.py ← from Working Set +4. Read bookmarks.py ← from Working Set +5. Read test_bookmarks.py ← from Working Set +6. Read app.py ← from Working Set +7. Read post_routes.py ← from Recently Consulted +8. Read conftest.py ← from Recently Consulted +9. Read post_service.py ← from Recently Consulted +10. Bash(grep "like") ← search existing code +11. Glob tests/*.py +12-13. Bash(grep) +14-15. TodoWrite ← start implementing +``` + +**Condition B (stripped RELOAD.md):** +``` +1. Read RELOAD.md +2. Read bookmark.py ← knew from prompt context +3. Read bookmark_service.py +4. Read bookmarks.py +5. Read test_bookmarks.py +6. Read app.py +7. Read db.py ← EXTRA: exploring unknown file +8. Read post_routes.py +9. Bash(ls) ← EXTRA: directory listing to orient +10. Bash(ls utils/ && ls) ← EXTRA: more exploration +11. Bash(grep "bookmark") ← EXTRA: searching for patterns +12. Read post_routes.py ← RE-READ (already read at step 8) +13. Read conftest.py +14. Read post_service.py +15. Bash(grep "like") +``` + +### Key observations + +All observations below are from a single run per condition. They're directional signals, not conclusions. + +1. **Both conditions found the same files.** Neither got "lost." The prompt mentioned bookmarks were done, so both knew to look at bookmark files. This hint gave Condition B a significant leg up it wouldn't have in a real compaction scenario. + +2. **B did more exploratory work.** Two extra `ls` calls (steps 9-10), one `grep` for bookmark patterns (step 11), and a re-read of `post_routes.py` (step 12). 
This is consistent with the re-orientation cost of not having the Working Set and Recently Consulted lists, though a single run can't rule out normal variance. + +3. **B read a file A didn't.** B read `db.py` (step 7), presumably trying to understand the data layer since it had no artifact context telling it which files mattered. + +4. **A went straight from reads to implementation.** After reading the 9 files it knew about (from Working Set + Recently Consulted), A did targeted grep searches and started writing. B needed directory listings first. + +5. **A used more Edits (10 vs 5).** A was more confident editing existing files, while B created more from scratch. This might indicate artifact context gave A better knowledge of existing file structure, or it might be normal LLM variance. + +6. **Delta: 2 extra reads + 2 extra bash calls for B.** Falls below the experiment's threshold of "3+ more turns" or "5+ more search calls." The difference is real but small. With n=1, this delta could easily swing to 0 or 5 in another run. + +## Decision + +**Modify.** Based on this single-run experiment, artifact tracking appears to help slightly but not enough to justify the full ~900-byte overhead. These are informed bets, not proven conclusions. 
+ +Recommendations (confidence noted): +- **Keep:** Working Set file paths — A's direct navigation vs B's exploratory `ls` calls is the strongest signal in the data (medium confidence) +- **Keep:** Recently Consulted Files — same reasoning, prevents redundant exploration (medium confidence) +- **Drop:** File Snapshots reference — neither condition used it; agents Read files directly (high confidence, consistent across both phases) +- **Drop:** Dirty artifacts list in Git Context — neither condition ran `git status` post-compaction (medium confidence, n=1) +- **Simplify:** After Resuming to 2 steps, TASKLOG + relay sync (high confidence, the other steps were never observed in use) + +Estimated savings: ~400 bytes of the ~900-byte overhead. Keep the useful path lists, drop the rarely-used snapshot references. + +These recommendations could be wrong. The experiment likely underestimates artifact tracking's value because simulated compaction is easier than real compaction (see caveats). If anything, err toward keeping more context rather than less. + +## Caveats + +These caveats all push in the same direction: the experiment likely underestimates the value of artifact tracking. Every limitation made it easier for Condition B to succeed without artifact context. + +1. **N=1 per condition.** LLM non-determinism means these are directional signals, not statistically significant results. The observed delta of 2 extra reads could easily be 0 or 5 in another run. +2. **Simulated, not real, compaction.** The agent was told "you were compacted" but never actually had in-context history to lose. Real compaction means the agent loses working memory it built up across dozens of turns — a harder recovery problem than starting fresh with a RELOAD.md. This is the biggest limitation: the experiment tests "does a file list help a fresh agent?" not "does a file list help an agent that just lost its memory?" +3. 
**Task prompt provided hints.** The prompt said "bookmarks are done, do likes" which told both conditions exactly which prior feature to reference as a pattern. Both immediately navigated to `bookmark.py`. Without that hint, Condition B would have needed to discover the codebase structure from scratch — exactly the scenario artifact tracking is designed for. +4. **Small codebase.** At 3,400 lines, the entire codebase fits easily in context. For larger codebases where you can't read everything, artifact context would matter more. From 741f1c070644f5f59e10a270d7e4eeed79a2f02a Mon Sep 17 00:00:00 2001 From: Ryan Orban Date: Thu, 5 Mar 2026 14:01:10 -0800 Subject: [PATCH 3/6] suppress compaction warning when workspace state is fresh --- hooks/scripts/pack.py | 12 +++++++++++- hooks/scripts/test_hooks.py | 16 ++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/hooks/scripts/pack.py b/hooks/scripts/pack.py index 482f1e4..bbc61c7 100644 --- a/hooks/scripts/pack.py +++ b/hooks/scripts/pack.py @@ -98,8 +98,18 @@ def predict_next_compaction(history: list[int]) -> int: return sorted_h[-1] + max(avg_interval, 10) # floor of 10 turns +COMPACTION_FRESH_THRESHOLD = 3 + + def compaction_warning(state: dict) -> str | None: - """Return a compaction warning if approaching predicted compaction, else None.""" + """Return a compaction warning if approaching predicted compaction, else None. + + Suppressed when state was recently synced (staleness <= COMPACTION_FRESH_THRESHOLD). 
+ """ + turns_stale = max(0, state.get("turn_count", 0) - state.get("semantic_turn", 0)) + if turns_stale <= COMPACTION_FRESH_THRESHOLD: + return None + history = state.get("compaction_history", []) predicted = predict_next_compaction(history) current = state.get("turn_count", 0) diff --git a/hooks/scripts/test_hooks.py b/hooks/scripts/test_hooks.py index cb003a2..184cba7 100644 --- a/hooks/scripts/test_hooks.py +++ b/hooks/scripts/test_hooks.py @@ -895,6 +895,22 @@ def test_no_warning_with_no_history_early(self, state): state["turn_count"] = 10 assert compaction_warning(state) is None + def test_suppressed_when_state_fresh(self, state): + # Within warn buffer but recently synced — should not warn + state["compaction_history"] = [80] + state["turn_count"] = 146 + state["semantic_turn"] = 144 # 2 turns stale, below threshold + assert compaction_warning(state) is None + + def test_not_suppressed_when_stale(self, state): + # Within warn buffer and stale — should warn + state["compaction_history"] = [80] + state["turn_count"] = 146 + state["semantic_turn"] = 130 # 16 turns stale + result = compaction_warning(state) + assert result is not None + assert "approaching" in result + class TestCompactionHistoryRecording: def test_records_turn_count(self, workspace, state): From f490d9ce28cb0dd7acb54beab4fdcc131b240651 Mon Sep 17 00:00:00 2001 From: Ryan Orban Date: Thu, 5 Mar 2026 14:13:27 -0800 Subject: [PATCH 4/6] remove dead file_snapshots generation, add configurable thresholds via STATE.json config key --- hooks/scripts/init_workspace.py | 24 +++++++++++ hooks/scripts/nudge.py | 6 +-- hooks/scripts/pack.py | 31 +++++++------- hooks/scripts/post_tool_use.py | 12 +++--- hooks/scripts/pre_compact.py | 62 ---------------------------- hooks/scripts/test_hooks.py | 73 +++++++++++---------------------- 6 files changed, 71 insertions(+), 137 deletions(-) diff --git a/hooks/scripts/init_workspace.py b/hooks/scripts/init_workspace.py index f861ae2..c91268b 100644 --- 
a/hooks/scripts/init_workspace.py +++ b/hooks/scripts/init_workspace.py @@ -146,6 +146,30 @@ def read_hook_stdin() -> dict: return {} +CONFIG_DEFAULTS = { + "stale_session_threshold": 5, + "staleness_mild": 6, + "staleness_loud": 16, + "sync_reminder_interval": 8, + "compaction_warn_buffer": 15, + "compaction_urgent_buffer": 5, + "compaction_fresh_threshold": 3, + "extraction_interval": 5, + "max_recently_read": 20, +} + + +def get_config(state: dict) -> dict: + """Return config with user overrides merged over defaults. + + Users can set overrides in STATE.json under the "config" key, e.g.: + {"config": {"stale_session_threshold": 10}} + """ + overrides = state.get("config", {}) + merged = {**CONFIG_DEFAULTS, **overrides} + return merged + + MAX_ERROR_LOG_ENTRIES = 50 diff --git a/hooks/scripts/nudge.py b/hooks/scripts/nudge.py index 739b293..fc85a53 100644 --- a/hooks/scripts/nudge.py +++ b/hooks/scripts/nudge.py @@ -11,10 +11,9 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import load_state, read_hook_stdin +from init_workspace import load_state, read_hook_stdin, get_config WORKSPACE_DIR = ".agent-workspace" -STALE_SESSION_THRESHOLD = 5 def build_nudge_message(state: dict | None) -> str | None: @@ -22,6 +21,7 @@ def build_nudge_message(state: dict | None) -> str | None: if state is None: return None + cfg = get_config(state) wk = state.get("workspace", {}) has_objective = bool(wk.get("objective")) has_artifacts = len(state.get("artifacts", {})) > 0 @@ -30,7 +30,7 @@ def build_nudge_message(state: dict | None) -> str | None: turns_stale = max(0, turn_count - semantic_turn) # Check staleness (even if objective is set) - if has_objective and turns_stale > STALE_SESSION_THRESHOLD: + if has_objective and turns_stale > cfg["stale_session_threshold"]: return ( f"Relay: workspace state is {turns_stale} turns stale from a previous session. " "Run /relay sync to update objective, next_actions, and open_questions." 
diff --git a/hooks/scripts/pack.py b/hooks/scripts/pack.py index bbc61c7..3167d1c 100644 --- a/hooks/scripts/pack.py +++ b/hooks/scripts/pack.py @@ -16,7 +16,7 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import ensure_workspace, load_state, read_hook_stdin, hash_workspace +from init_workspace import ensure_workspace, load_state, read_hook_stdin, hash_workspace, get_config try: from entire_bridge import detect_entire, write_relay_context @@ -55,19 +55,20 @@ def rebuild_index(ws: Path, state: dict): (ws / "INDEX.md").write_text("\n".join(lines)) -STALENESS_MILD = 6 # turns before mild nudge -STALENESS_LOUD = 16 # turns before strong warning -SYNC_REMINDER_INTERVAL = 8 # emit hookSpecificOutput every N turns of staleness +STALENESS_MILD = 6 # kept for test imports +STALENESS_LOUD = 16 # kept for test imports def build_sync_reminder(state: dict) -> str | None: """Return a sync reminder string if staleness hits an interval boundary, else None.""" + cfg = get_config(state) + interval = cfg["sync_reminder_interval"] current_turn = state.get("turn_count", 0) semantic_turn = state.get("semantic_turn", 0) turns_stale = max(0, current_turn - semantic_turn) - if turns_stale < SYNC_REMINDER_INTERVAL: + if turns_stale < interval: return None - if turns_stale % SYNC_REMINDER_INTERVAL != 0: + if turns_stale % interval != 0: return None return ( f"Relay: workspace state is {turns_stale} turns stale. " @@ -76,8 +77,6 @@ def build_sync_reminder(state: dict) -> str | None: DEFAULT_COMPACTION_TURN = 80 -COMPACTION_WARN_BUFFER = 15 -COMPACTION_URGENT_BUFFER = 5 MAX_COMPACTION_HISTORY = 10 @@ -98,16 +97,14 @@ def predict_next_compaction(history: list[int]) -> int: return sorted_h[-1] + max(avg_interval, 10) # floor of 10 turns -COMPACTION_FRESH_THRESHOLD = 3 - - def compaction_warning(state: dict) -> str | None: """Return a compaction warning if approaching predicted compaction, else None. 
- Suppressed when state was recently synced (staleness <= COMPACTION_FRESH_THRESHOLD). + Suppressed when state was recently synced (staleness <= compaction_fresh_threshold). """ + cfg = get_config(state) turns_stale = max(0, state.get("turn_count", 0) - state.get("semantic_turn", 0)) - if turns_stale <= COMPACTION_FRESH_THRESHOLD: + if turns_stale <= cfg["compaction_fresh_threshold"]: return None history = state.get("compaction_history", []) @@ -115,12 +112,12 @@ def compaction_warning(state: dict) -> str | None: current = state.get("turn_count", 0) remaining = predicted - current - if remaining <= COMPACTION_URGENT_BUFFER: + if remaining <= cfg["compaction_urgent_buffer"]: return ( f"Relay: compaction imminent (~{max(0, remaining)} turns). " "Run /relay sync NOW to preserve workspace state." ) - if remaining <= COMPACTION_WARN_BUFFER: + if remaining <= cfg["compaction_warn_buffer"]: return ( f"Relay: compaction approaching (~{remaining} turns). " "Consider running /relay sync to update workspace state." 
@@ -452,7 +449,6 @@ def apply_markers(state: dict, markers: dict) -> bool: SUGGESTION_TTL_TURNS = 3 -EXTRACTION_INTERVAL = 5 def check_pending_suggestions(ws: Path, state: dict) -> str | None: @@ -524,7 +520,8 @@ def spawn_extraction(ws: Path, state: dict, hook_data: dict): current_turn = state.get("turn_count", 0) last_extraction = state.get("last_extraction_turn", 0) - if current_turn - last_extraction < EXTRACTION_INTERVAL: + cfg = get_config(state) + if current_turn - last_extraction < cfg["extraction_interval"]: return message = hook_data.get("last_assistant_message", "") diff --git a/hooks/scripts/post_tool_use.py b/hooks/scripts/post_tool_use.py index b86812d..c74c7d1 100644 --- a/hooks/scripts/post_tool_use.py +++ b/hooks/scripts/post_tool_use.py @@ -15,7 +15,7 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import ensure_workspace, load_state, save_state, read_hook_stdin +from init_workspace import ensure_workspace, load_state, save_state, read_hook_stdin, get_config # File extensions -> artifact kind mapping KIND_MAP = { @@ -50,19 +50,17 @@ } -MAX_RECENTLY_READ = 20 - - def track_read(state: dict, rel_path: str, timestamp: str): """Add or update a file in the recently_read list.""" + cfg = get_config(state) + max_recent = cfg["max_recently_read"] reading = state.setdefault("recently_read", []) # Remove existing entry for this path (dedup) reading[:] = [e for e in reading if e["path"] != rel_path] # Append at end with updated timestamp reading.append({"path": rel_path, "last_read": timestamp}) - # Cap at MAX_RECENTLY_READ - if len(reading) > MAX_RECENTLY_READ: - state["recently_read"] = reading[-MAX_RECENTLY_READ:] + if len(reading) > max_recent: + state["recently_read"] = reading[-max_recent:] def classify_file(path: str) -> str: diff --git a/hooks/scripts/pre_compact.py b/hooks/scripts/pre_compact.py index 30c4d62..3ebb79c 100644 --- a/hooks/scripts/pre_compact.py +++ b/hooks/scripts/pre_compact.py @@ 
-59,63 +59,6 @@ def mark_staleness_at_compaction(state: dict): state["stale_at_compaction"] = turns_stale > STALE_AT_COMPACTION_THRESHOLD -SNAPSHOT_MAX_LINES = 10 -SNAPSHOT_MAX_BYTES = 2048 -SNAPSHOT_LINE_CAP = 120 - - -def build_file_snapshots(cwd: str, state: dict, max_bytes: int = SNAPSHOT_MAX_BYTES) -> str: - """Build a markdown string with first N lines of each working set file.""" - artifacts = state.get("artifacts", {}) - working = state.get("working_set", []) - if not working: - return "" - - lines = ["# File Snapshots", "", "> First lines of working set files at compaction time.", ""] - total_bytes = 0 - - for aid in working: - art = artifacts.get(aid, {}) - path = art.get("path", "?") - full = Path(cwd) / path if not Path(path).is_absolute() else Path(path) - - header = f"## `{path}` ({art.get('kind', '?')})" - lines.append(header) - lines.append("") - - if not full.exists(): - lines.append("_File not found on disk._") - lines.append("") - continue - - try: - file_lines = full.read_text(errors="replace").split("\n")[:SNAPSHOT_MAX_LINES] - snippet = [] - for fl in file_lines: - if len(fl) > SNAPSHOT_LINE_CAP: - fl = fl[:SNAPSHOT_LINE_CAP] + "..." 
- snippet.append(fl) - block = "\n".join(snippet) - - # Check size cap - block_bytes = len(block.encode()) - if total_bytes + block_bytes > max_bytes: - lines.append("_Skipped: snapshot size limit reached._") - lines.append("") - break - - total_bytes += block_bytes - lines.append("```") - lines.append(block) - lines.append("```") - lines.append("") - except Exception: - lines.append("_Could not read file._") - lines.append("") - - return "\n".join(lines) - - def build_reload_snippet(state: dict, ws: Path | None = None) -> str: """Build a minimal markdown snippet the agent can read after compaction.""" wk = state.get("workspace", {}) @@ -247,11 +190,6 @@ def main(): # Bump compaction count state["compaction_count"] = state.get("compaction_count", 0) + 1 - # Write file snapshots - snapshots = build_file_snapshots(cwd, state) - if snapshots: - (ws / "derived" / "file_snapshots.md").write_text(snapshots) - # Mark staleness BEFORE building reload snippet so RELOAD.md # reflects the current compaction's staleness, not the previous one. 
mark_staleness_at_compaction(state) diff --git a/hooks/scripts/test_hooks.py b/hooks/scripts/test_hooks.py index 184cba7..8b70bd7 100644 --- a/hooks/scripts/test_hooks.py +++ b/hooks/scripts/test_hooks.py @@ -23,11 +23,10 @@ log_error, save_state, ) +from init_workspace import get_config from pack import ( STALENESS_LOUD, STALENESS_MILD, - SUGGESTION_TTL_TURNS, - EXTRACTION_INTERVAL, append_tasklog, apply_markers, build_sync_reminder, @@ -45,7 +44,7 @@ staleness_warning, ) from nudge import build_nudge_message -from pre_compact import build_reload_snippet, build_summary, extract_recent_tasklog, mark_staleness_at_compaction, build_file_snapshots +from pre_compact import build_reload_snippet, build_summary, extract_recent_tasklog, mark_staleness_at_compaction # ── Fixtures ────────────────────────────────────────────── @@ -703,51 +702,6 @@ def test_negative_staleness_clamped(self, state): assert build_nudge_message(state) is None -class TestBuildFileSnapshots: - def test_empty_working_set(self, tmp_path, state): - result = build_file_snapshots(str(tmp_path), state) - assert result == "" - - def test_captures_first_lines(self, tmp_path, state): - (tmp_path / "main.py").write_text("line1\nline2\nline3\nline4\nline5\n") - state["artifacts"] = {"a_1": {"path": "main.py", "kind": "code"}} - state["working_set"] = ["a_1"] - - result = build_file_snapshots(str(tmp_path), state) - assert "main.py" in result - assert "line1" in result - assert "line5" in result - - def test_skips_missing_files(self, tmp_path, state): - state["artifacts"] = {"a_1": {"path": "gone.py", "kind": "code"}} - state["working_set"] = ["a_1"] - - result = build_file_snapshots(str(tmp_path), state) - assert "gone.py" in result - assert "not found" in result.lower() or result.count("```") == 0 - - def test_caps_total_size(self, tmp_path, state): - big_content = "x" * 500 + "\n" - for i in range(10): - (tmp_path / f"file{i}.py").write_text(big_content * 20) - state["artifacts"][f"a_{i}"] = {"path": 
f"file{i}.py", "kind": "code"} - state["working_set"].append(f"a_{i}") - - result = build_file_snapshots(str(tmp_path), state, max_bytes=2048) - assert len(result.encode()) <= 2048 + 200 # allow some header overhead - - def test_truncates_long_lines(self, tmp_path, state): - long_line = "x" * 300 + "\n" - (tmp_path / "wide.py").write_text(long_line * 5) - state["artifacts"] = {"a_1": {"path": "wide.py", "kind": "code"}} - state["working_set"] = ["a_1"] - - result = build_file_snapshots(str(tmp_path), state) - # No single line should be longer than ~120 chars + truncation marker - for line in result.split("\n"): - assert len(line) < 200 - - class TestReloadInstructions: def test_instructions_are_numbered(self, state): snippet = build_reload_snippet(state) @@ -912,6 +866,29 @@ def test_not_suppressed_when_stale(self, state): assert "approaching" in result +class TestConfigOverrides: + def test_defaults_used_when_no_config(self, state): + cfg = get_config(state) + assert cfg["stale_session_threshold"] == 5 + assert cfg["compaction_warn_buffer"] == 15 + + def test_overrides_applied(self, state): + state["config"] = {"stale_session_threshold": 10, "compaction_warn_buffer": 20} + cfg = get_config(state) + assert cfg["stale_session_threshold"] == 10 + assert cfg["compaction_warn_buffer"] == 20 + # Non-overridden keys still have defaults + assert cfg["extraction_interval"] == 5 + + def test_compaction_warning_respects_config(self, state): + # Normally turn 146 with history [80] warns (remaining=14, within default 15) + # But with compaction_warn_buffer=10, remaining=14 is outside buffer + state["compaction_history"] = [80] + state["turn_count"] = 146 + state["config"] = {"compaction_warn_buffer": 10} + assert compaction_warning(state) is None + + class TestCompactionHistoryRecording: def test_records_turn_count(self, workspace, state): state["turn_count"] = 85 From 229a6ecfe08b4fb2631757b501403abe020275de Mon Sep 17 00:00:00 2001 From: Ryan Orban Date: Thu, 5 Mar 2026 
14:51:20 -0800 Subject: [PATCH 5/6] add config override tests for nudge threshold and track_read cap --- hooks/scripts/test_hooks.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/hooks/scripts/test_hooks.py b/hooks/scripts/test_hooks.py index 8b70bd7..c9d241e 100644 --- a/hooks/scripts/test_hooks.py +++ b/hooks/scripts/test_hooks.py @@ -701,6 +701,16 @@ def test_negative_staleness_clamped(self, state): state["semantic_turn"] = 10 assert build_nudge_message(state) is None + def test_nudge_respects_config_threshold(self, state): + state["workspace"]["objective"] = "goal" + state["turn_count"] = 6 + state["semantic_turn"] = 0 + # Default threshold is 5, so 6 turns stale triggers nudge + assert build_nudge_message(state) is not None + # Raise threshold to 10 — same staleness should no longer trigger + state["config"] = {"stale_session_threshold": 10} + assert build_nudge_message(state) is None + class TestReloadInstructions: def test_instructions_are_numbered(self, state): @@ -789,6 +799,15 @@ def test_order_preserved_for_new_entries(self, state): paths = [e["path"] for e in state["recently_read"]] assert paths == ["file0.py", "file1.py", "file2.py"] + def test_track_read_respects_config_cap(self, state): + state["config"] = {"max_recently_read": 3} + for i in range(5): + track_read(state, f"file{i}.py", f"2026-03-01T0{i}:00:00") + assert len(state["recently_read"]) == 3 + # Should keep the 3 most recent + paths = [e["path"] for e in state["recently_read"]] + assert paths == ["file2.py", "file3.py", "file4.py"] + class TestPredictNextCompaction: def test_default_with_no_history(self): From f8b65dcbbb1d0ca86bde3ead944998adb7ab146f Mon Sep 17 00:00:00 2001 From: Ryan Orban Date: Thu, 2 Apr 2026 22:43:53 -0700 Subject: [PATCH 6/6] feat(relay): add component cost attribution estimator Nightshift-Task: cost-attribution Nightshift-Ref: https://github.com/marcus/nightshift --- README.md | 26 +++++++ commands/relay.md | 18 ++++- 
hooks/scripts/init_workspace.py | 89 ++++++++++++++++++++++- hooks/scripts/nudge.py | 4 +- hooks/scripts/pack.py | 122 +++++++++++++++++++++++++++++++- hooks/scripts/post_tool_use.py | 11 ++- hooks/scripts/pre_compact.py | 3 +- hooks/scripts/reload.py | 16 ++--- hooks/scripts/test_hooks.py | 83 +++++++++++++++++++++- 9 files changed, 355 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index ca207bb..c0e715d 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ PreCompact (pre_compact.py) SessionStart (reload.py, nudge.py) - Turn count, compaction count, compaction history - Git context — branch, status, recent commits - Read tracking — last 5 unique file reads +- Hook telemetry — per-component invocation counters and extraction attempt/run counts **Semi-automatically** (via markers in assistant output): - Objective, hypothesis, next actions, open questions — parsed from `` HTML comments at the end of responses @@ -84,10 +85,35 @@ All commands are accessed via `/relay`: | `/relay` or `/relay status` | Show current workspace state | | `/relay sync` | Update all semantic fields (objective, status, hypothesis, next_actions, open_questions) | | `/relay pack` | Manually trigger a pack cycle | +| `/relay costs` | Show per-component estimated cost attribution from invocation telemetry | | `/relay objective ` | Set or show the workspace objective | | `/relay forget ` | Remove an artifact from the registry and working set | | `/relay reset` | Delete `.agent-workspace/` and start fresh | +## Cost attribution estimator + +Relay records invocation counters for `post_tool_use`, `pack`, `pre_compact`, `reload`, `nudge`, and `extract_state` under `STATE.json.metrics.component_invocations`. Each pack cycle computes `metrics.last_cost_report` using per-invocation rates from `config.cost_model`. 
+ +Example interpretation: +- `subtotal = invocations * unit_cost` per component +- `total_estimated_cost = sum(subtotals)` +- extraction attempts/runs are tracked separately to show how often extraction was considered vs actually spawned + +You can tune rates in `STATE.json`: + +```json +{ + "config": { + "cost_model": { + "pack": 0.005, + "extract_state": 0.02 + } + } +} +``` + +These values are modeled internal estimates for relative attribution, not measured cloud billing data. + ## Hooks Defined in `hooks/hooks.json`: diff --git a/commands/relay.md b/commands/relay.md index b9a2c19..159d0ab 100644 --- a/commands/relay.md +++ b/commands/relay.md @@ -1,7 +1,7 @@ --- name: relay description: View and manage the relay workspace state (.agent-workspace/) -argument-hint: "[status|sync|reset|pack|forget|objective]" +argument-hint: "[status|sync|reset|pack|costs|forget|objective]" --- # /relay — Workspace state manager @@ -58,6 +58,22 @@ Do this by running: `python3 ${CLAUDE_PLUGIN_ROOT}/hooks/scripts/pack.py` with a After packing, show: "Packed. Turn [N], [X] artifacts, [Y] in working set." +## /relay costs + +Show estimated operational cost attribution by relay component. + +1. Read `.agent-workspace/STATE.json` +2. If `metrics.last_cost_report` exists, use it; otherwise compute a report by applying `config.cost_model` rates to `metrics.component_invocations`. +3. Present a compact table with columns: + - Component + - Invocations + - Unit cost + - Subtotal +4. Show total estimated cost and extraction attempts/runs (`metrics.extract_state_attempts` / `metrics.extract_state_runs`). +5. Include this assumption line: "Modeled per-invocation estimates; not measured billing data." + +If workspace doesn't exist, say: "No relay workspace in this project. One will be created automatically when you start working." + ## /relay forget Remove an artifact from the working set (and optionally from the registry). 
diff --git a/hooks/scripts/init_workspace.py b/hooks/scripts/init_workspace.py index c91268b..4a1cab8 100644 --- a/hooks/scripts/init_workspace.py +++ b/hooks/scripts/init_workspace.py @@ -36,6 +36,29 @@ "turn_count": 0, "last_marker_turn": 0, "last_extraction_turn": 0, + "metrics": { + "component_invocations": { + "post_tool_use": 0, + "pack": 0, + "pre_compact": 0, + "reload": 0, + "nudge": 0, + "extract_state": 0, + }, + "extract_state_attempts": 0, + "extract_state_runs": 0, + "last_cost_report": None, + }, + "config": { + "cost_model": { + "post_tool_use": 0.001, + "pack": 0.003, + "pre_compact": 0.002, + "reload": 0.001, + "nudge": 0.001, + "extract_state": 0.01, + } + }, } EXEC_PACKET_TEMPLATE = """\ @@ -122,8 +145,8 @@ def ensure_workspace(cwd: str) -> Path: def load_state(ws: Path) -> dict: state_file = ws / "STATE.json" if state_file.exists(): - return json.loads(state_file.read_text()) - return copy.deepcopy(INITIAL_STATE) + return ensure_state_schema(json.loads(state_file.read_text())) + return ensure_state_schema(copy.deepcopy(INITIAL_STATE)) def save_state(ws: Path, state: dict): @@ -156,9 +179,54 @@ def read_hook_stdin() -> dict: "compaction_fresh_threshold": 3, "extraction_interval": 5, "max_recently_read": 20, + "cost_model": { + "post_tool_use": 0.001, + "pack": 0.003, + "pre_compact": 0.002, + "reload": 0.001, + "nudge": 0.001, + "extract_state": 0.01, + }, } +def ensure_state_schema(state: dict) -> dict: + """Backfill required keys for older STATE.json shapes.""" + if not isinstance(state.get("workspace"), dict): + state["workspace"] = copy.deepcopy(INITIAL_STATE["workspace"]) + + metrics = state.get("metrics") + if not isinstance(metrics, dict): + metrics = {} + state["metrics"] = metrics + + invocations = metrics.get("component_invocations") + if not isinstance(invocations, dict): + invocations = {} + metrics["component_invocations"] = invocations + + for component, default_count in INITIAL_STATE["metrics"]["component_invocations"].items(): + 
invocations.setdefault(component, default_count) + + metrics.setdefault("extract_state_attempts", 0) + metrics.setdefault("extract_state_runs", 0) + metrics.setdefault("last_cost_report", None) + + config = state.get("config") + if not isinstance(config, dict): + config = {} + state["config"] = config + + cost_model = config.get("cost_model") + if not isinstance(cost_model, dict): + cost_model = {} + config["cost_model"] = cost_model + for component, default_rate in CONFIG_DEFAULTS["cost_model"].items(): + cost_model.setdefault(component, default_rate) + + return state + + def get_config(state: dict) -> dict: """Return config with user overrides merged over defaults. @@ -167,9 +235,26 @@ def get_config(state: dict) -> dict: """ overrides = state.get("config", {}) merged = {**CONFIG_DEFAULTS, **overrides} + merged["cost_model"] = { + **CONFIG_DEFAULTS["cost_model"], + **(overrides.get("cost_model", {}) if isinstance(overrides, dict) else {}), + } return merged +def record_component_invocation(state: dict, component: str) -> int: + """Increment a component invocation counter and return the updated count.""" + ensure_state_schema(state) + metrics = state["metrics"] + invocations = metrics["component_invocations"] + try: + current = int(invocations.get(component, 0)) + except (TypeError, ValueError): + current = 0 + invocations[component] = current + 1 + return invocations[component] + + MAX_ERROR_LOG_ENTRIES = 50 diff --git a/hooks/scripts/nudge.py b/hooks/scripts/nudge.py index fc85a53..5956f73 100644 --- a/hooks/scripts/nudge.py +++ b/hooks/scripts/nudge.py @@ -11,7 +11,7 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import load_state, read_hook_stdin, get_config +from init_workspace import load_state, save_state, read_hook_stdin, get_config, record_component_invocation WORKSPACE_DIR = ".agent-workspace" @@ -74,6 +74,7 @@ def main(): try: state = load_state(ws) + record_component_invocation(state, "nudge") msg = 
build_nudge_message(state) marker_tip = ( @@ -98,6 +99,7 @@ def main(): "additionalContext": context, } } + save_state(ws, state) print(json.dumps(output)) except Exception as exc: try: diff --git a/hooks/scripts/pack.py b/hooks/scripts/pack.py index 3167d1c..eae4f3d 100644 --- a/hooks/scripts/pack.py +++ b/hooks/scripts/pack.py @@ -16,7 +16,15 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import ensure_workspace, load_state, read_hook_stdin, hash_workspace, get_config +from init_workspace import ( + ensure_workspace, + load_state, + read_hook_stdin, + hash_workspace, + get_config, + ensure_state_schema, + record_component_invocation, +) try: from entire_bridge import detect_entire, write_relay_context @@ -80,6 +88,108 @@ def build_sync_reminder(state: dict) -> str | None: MAX_COMPACTION_HISTORY = 10 +def _to_int(value: object, default: int = 0) -> int: + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _to_float(value: object, default: float = 0.0) -> float: + try: + return float(value) + except (TypeError, ValueError): + return default + + +def normalize_component_invocations(state: dict) -> dict[str, int]: + """Return sanitized invocation counts for all configured cost components.""" + ensure_state_schema(state) + cfg = get_config(state) + known_components = set(cfg.get("cost_model", {}).keys()) + raw = state.get("metrics", {}).get("component_invocations", {}) + if isinstance(raw, dict): + known_components.update(raw.keys()) + + normalized = {} + for component in sorted(known_components): + count = _to_int(raw.get(component, 0) if isinstance(raw, dict) else 0, 0) + normalized[component] = max(0, count) + return normalized + + +def resolve_cost_model(state: dict) -> dict[str, float]: + """Resolve cost rates with defaults and sanitize malformed values.""" + cfg = get_config(state) + model = cfg.get("cost_model", {}) + if not isinstance(model, dict): + model = {} + + resolved = {} 
+ for component in sorted(model.keys()): + resolved[component] = max(0.0, _to_float(model.get(component), 0.0)) + return resolved + + +def estimate_costs(state: dict, generated_at: str | None = None) -> dict: + """Compute deterministic per-component cost attribution report.""" + invocations = normalize_component_invocations(state) + cost_model = resolve_cost_model(state) + + components = [] + total = 0.0 + for component in sorted(cost_model.keys()): + count = invocations.get(component, 0) + unit_cost = cost_model.get(component, 0.0) + subtotal = count * unit_cost + total += subtotal + components.append( + { + "component": component, + "invocations": count, + "unit_cost": round(unit_cost, 6), + "subtotal": round(subtotal, 6), + } + ) + + metrics = state.get("metrics", {}) + return { + "generated_at": generated_at or datetime.now().isoformat(), + "assumptions": "Modeled per-invocation estimates; not measured billing data.", + "currency": "estimated_units", + "components": components, + "total_estimated_cost": round(total, 6), + "counters": { + "extract_state_attempts": max(0, _to_int(metrics.get("extract_state_attempts"), 0)), + "extract_state_runs": max(0, _to_int(metrics.get("extract_state_runs"), 0)), + }, + } + + +def render_cost_report(report: dict) -> str: + """Render a compact markdown table for `/relay costs` output.""" + lines = [ + "### Relay Cost Attribution (Estimated)", + "", + "| Component | Invocations | Unit Cost | Subtotal |", + "|---|---:|---:|---:|", + ] + for row in report.get("components", []): + lines.append( + f"| `{row.get('component', '?')}` | {row.get('invocations', 0)} " + f"| {row.get('unit_cost', 0.0):.6f} | {row.get('subtotal', 0.0):.6f} |" + ) + lines.extend( + [ + "", + f"**Total estimated cost:** {report.get('total_estimated_cost', 0.0):.6f} {report.get('currency', 'estimated_units')}", + f"Extraction attempts/runs: {report.get('counters', {}).get('extract_state_attempts', 0)}/{report.get('counters', {}).get('extract_state_runs', 
0)}", + f"Assumption: {report.get('assumptions', '')}", + ] + ) + return "\n".join(lines) + + def predict_next_compaction(history: list[int]) -> int: """Predict the next compaction turn from history of absolute turn numbers. @@ -517,6 +627,7 @@ def check_pending_suggestions(ws: Path, state: dict) -> str | None: def spawn_extraction(ws: Path, state: dict, hook_data: dict): """Spawn background extraction process if conditions are met.""" + ensure_state_schema(state) current_turn = state.get("turn_count", 0) last_extraction = state.get("last_extraction_turn", 0) @@ -535,6 +646,8 @@ def spawn_extraction(ws: Path, state: dict, hook_data: dict): } context_file = ws / "derived" / "extraction_context.json" output_file = ws / "derived" / "pending_suggestions.json" + metrics = state.setdefault("metrics", {}) + metrics["extract_state_attempts"] = _to_int(metrics.get("extract_state_attempts"), 0) + 1 try: context_file.write_text(json.dumps(context, indent=2)) @@ -545,6 +658,8 @@ def spawn_extraction(ws: Path, state: dict, hook_data: dict): stderr=subprocess.DEVNULL, ) state["last_extraction_turn"] = current_turn + metrics["extract_state_runs"] = _to_int(metrics.get("extract_state_runs"), 0) + 1 + record_component_invocation(state, "extract_state") except (OSError, FileNotFoundError): pass @@ -591,6 +706,9 @@ def main(): if state["last_updated"] <= state["last_pack_turn"]: sys.exit(0) + # Increment only after skip checks to avoid unpersisted counts on early exit. 
+ record_component_invocation(state, "pack") + # Update semantic tracking if workspace_changed: state["last_semantic_update"] = datetime.now().isoformat() @@ -667,6 +785,8 @@ def main(): if is_main_stop and not markers_applied: spawn_extraction(ws, state, hook_data) + state.setdefault("metrics", {})["last_cost_report"] = estimate_costs(state) + # Set last_updated and last_pack_turn to the SAME timestamp so the # skip guard works: next turn, if post_tool_use doesn't fire (bumping # last_updated ahead), last_updated == last_pack_turn → skip. diff --git a/hooks/scripts/post_tool_use.py b/hooks/scripts/post_tool_use.py index c74c7d1..adf833e 100644 --- a/hooks/scripts/post_tool_use.py +++ b/hooks/scripts/post_tool_use.py @@ -15,7 +15,14 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import ensure_workspace, load_state, save_state, read_hook_stdin, get_config +from init_workspace import ( + ensure_workspace, + load_state, + save_state, + read_hook_stdin, + get_config, + record_component_invocation, +) # File extensions -> artifact kind mapping KIND_MAP = { @@ -102,6 +109,7 @@ def main(): try: ws = ensure_workspace(cwd) state = load_state(ws) + record_component_invocation(state, "post_tool_use") now = datetime.now().isoformat() try: rel_path = str(Path(file_path).relative_to(cwd)) @@ -120,6 +128,7 @@ def main(): try: ws = ensure_workspace(cwd) state = load_state(ws) + record_component_invocation(state, "post_tool_use") now = datetime.now().isoformat() aid = make_artifact_id(file_path, now) diff --git a/hooks/scripts/pre_compact.py b/hooks/scripts/pre_compact.py index 3ebb79c..d187d11 100644 --- a/hooks/scripts/pre_compact.py +++ b/hooks/scripts/pre_compact.py @@ -14,7 +14,7 @@ # Add parent to path so we can import the shared module sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import ensure_workspace, load_state, save_state, read_hook_stdin +from init_workspace import ensure_workspace, load_state, 
save_state, read_hook_stdin, record_component_invocation try: from entire_bridge import detect_entire, create_checkpoint, write_relay_context @@ -180,6 +180,7 @@ def main(): try: ws = ensure_workspace(cwd) state = load_state(ws) + record_component_invocation(state, "pre_compact") # Record this compaction's turn count for prediction history = state.setdefault("compaction_history", []) diff --git a/hooks/scripts/reload.py b/hooks/scripts/reload.py index e17dbd9..077793c 100644 --- a/hooks/scripts/reload.py +++ b/hooks/scripts/reload.py @@ -10,7 +10,7 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) -from init_workspace import read_hook_stdin +from init_workspace import ensure_workspace, load_state, save_state, read_hook_stdin, record_component_invocation WORKSPACE_DIR = ".agent-workspace" @@ -32,17 +32,14 @@ def main(): sys.exit(0) try: + ws = ensure_workspace(cwd) + state = load_state(ws) + record_component_invocation(state, "reload") + content = reload_file.read_text().strip() if content: # Check if state was stale at compaction - state_file = Path(cwd) / WORKSPACE_DIR / "STATE.json" - stale_flag = False - if state_file.exists(): - try: - st = json.loads(state_file.read_text()) - stale_flag = st.get("stale_at_compaction", False) - except Exception: - pass + stale_flag = state.get("stale_at_compaction", False) urgency = "" if stale_flag: @@ -66,6 +63,7 @@ def main(): } } print(json.dumps(output)) + save_state(ws, state) except Exception as exc: try: from init_workspace import log_error diff --git a/hooks/scripts/test_hooks.py b/hooks/scripts/test_hooks.py index c9d241e..67f61a9 100644 --- a/hooks/scripts/test_hooks.py +++ b/hooks/scripts/test_hooks.py @@ -21,6 +21,7 @@ hash_workspace, load_state, log_error, + record_component_invocation, save_state, ) from init_workspace import get_config @@ -34,10 +35,13 @@ check_pending_suggestions, cleanup_stale_artifacts, compaction_warning, + estimate_costs, infer_objective, + 
normalize_component_invocations, parse_markers, predict_next_compaction, prune_tasklog, + render_cost_report, rebuild_exec_packet, rebuild_index, spawn_extraction, @@ -442,6 +446,9 @@ def test_load_state_returns_initial(self, workspace): assert state["recently_read"] == [] assert state["git"] is None assert state["compaction_history"] == [] + assert state["metrics"]["component_invocations"]["pack"] == 0 + assert state["metrics"]["last_cost_report"] is None + assert state["config"]["cost_model"]["pack"] > 0 def test_load_state_fallback_is_deep_copy(self, tmp_path): # Simulate missing STATE.json @@ -467,6 +474,20 @@ def test_save_roundtrip(self, workspace, state): assert reloaded["workspace"]["objective"] == "test roundtrip" assert reloaded["turn_count"] == 42 + def test_load_backfills_missing_cost_fields(self, workspace): + legacy = { + "version": "1.0.0", + "workspace": {"objective": None, "hypothesis": None, "open_questions": [], "next_actions": [], "status": "idle"}, + "artifacts": {}, + "working_set": [], + "turn_count": 0, + } + (workspace / "STATE.json").write_text(json.dumps(legacy, indent=2) + "\n") + reloaded = load_state(workspace) + assert reloaded["metrics"]["component_invocations"]["pack"] == 0 + assert reloaded["metrics"]["extract_state_attempts"] == 0 + assert reloaded["config"]["cost_model"]["extract_state"] > 0 + # ── build_sync_reminder ────────────────────────────────── @@ -908,6 +929,61 @@ def test_compaction_warning_respects_config(self, state): assert compaction_warning(state) is None +class TestCostAttribution: + def test_record_component_invocation_increments(self, state): + assert state["metrics"]["component_invocations"]["pack"] == 0 + record_component_invocation(state, "pack") + record_component_invocation(state, "pack") + assert state["metrics"]["component_invocations"]["pack"] == 2 + + def test_estimate_costs_deterministic_math(self, state): + state["metrics"]["component_invocations"]["pack"] = 4 + 
state["metrics"]["component_invocations"]["post_tool_use"] = 5 + state["config"] = { + "cost_model": { + "pack": 0.25, + "post_tool_use": 0.1, + } + } + report = estimate_costs(state, generated_at="2026-04-03T00:00:00") + assert report["generated_at"] == "2026-04-03T00:00:00" + assert report["total_estimated_cost"] == 1.5 + by_component = {r["component"]: r for r in report["components"]} + assert by_component["pack"]["subtotal"] == 1.0 + assert by_component["post_tool_use"]["subtotal"] == 0.5 + + def test_estimate_costs_handles_malformed_counters(self, state): + state["metrics"]["component_invocations"]["extract_state"] = "bogus" + state["metrics"]["extract_state_attempts"] = "oops" + state["metrics"]["extract_state_runs"] = None + report = estimate_costs(state) + by_component = {r["component"]: r for r in report["components"]} + assert by_component["extract_state"]["invocations"] == 0 + assert report["counters"]["extract_state_attempts"] == 0 + assert report["counters"]["extract_state_runs"] == 0 + + def test_estimate_costs_fallback_when_config_missing(self): + bare_state = {"metrics": {"component_invocations": {"pack": 3}}} + report = estimate_costs(bare_state) + by_component = {r["component"]: r for r in report["components"]} + assert by_component["pack"]["invocations"] == 3 + assert report["total_estimated_cost"] > 0 + + def test_normalize_component_invocations_with_invalid_shape(self): + weird_state = {"metrics": {"component_invocations": ["bad"]}} + normalized = normalize_component_invocations(weird_state) + assert normalized["pack"] == 0 + + def test_render_cost_report_format(self, state): + state["metrics"]["component_invocations"]["pack"] = 2 + report = estimate_costs(state, generated_at="2026-04-03T00:00:00") + text = render_cost_report(report) + assert "Relay Cost Attribution (Estimated)" in text + assert "| Component | Invocations | Unit Cost | Subtotal |" in text + assert "**Total estimated cost:**" in text + assert "Modeled per-invocation estimates; 
not measured billing data." in text + + class TestCompactionHistoryRecording: def test_records_turn_count(self, workspace, state): state["turn_count"] = 85 @@ -1316,6 +1392,7 @@ def test_skips_when_interval_not_reached(self, workspace, state): with patch("pack.subprocess.Popen") as mock_popen: spawn_extraction(workspace, state, hook_data) mock_popen.assert_not_called() + assert state["metrics"]["extract_state_attempts"] == 0 def test_spawns_at_interval(self, workspace, state): state["turn_count"] = 5 @@ -1325,6 +1402,9 @@ def test_spawns_at_interval(self, workspace, state): spawn_extraction(workspace, state, hook_data) mock_popen.assert_called_once() assert state["last_extraction_turn"] == 5 + assert state["metrics"]["extract_state_attempts"] == 1 + assert state["metrics"]["extract_state_runs"] == 1 + assert state["metrics"]["component_invocations"]["extract_state"] == 1 def test_skips_empty_message(self, workspace, state): state["turn_count"] = 10 @@ -1353,7 +1433,8 @@ def test_handles_popen_failure(self, workspace, state): hook_data = {"last_assistant_message": "test"} with patch("pack.subprocess.Popen", side_effect=FileNotFoundError): spawn_extraction(workspace, state, hook_data) - # Should not raise + assert state["metrics"]["extract_state_attempts"] == 1 + assert state["metrics"]["extract_state_runs"] == 0 # ── nudge marker tip ─────────────────────────────────────