diff --git a/CLAUDE.md b/CLAUDE.md index 342da1b..125a87f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -119,8 +119,10 @@ multi-hunk unified-diff apply, atomic per call, approval-gated), toolset, which is otherwise on by default), `JARVIS_NO_PROJECT_CONTEXT` (any value disables auto-loading `AGENTS.md` / `CLAUDE.md` / `AGENT.md` from the workspace into the -system prompt; defaults to loading them, capped at 32 KiB), -`JARVIS_PROJECT_CONTEXT_BYTES` (override the default 32 KiB cap), +system prompt; defaults to loading them, capped at 8 KiB), +`JARVIS_PROJECT_CONTEXT_BYTES` (override the default 8 KiB cap; +truncation logs a startup WARN naming this env var so operators with +larger instruction files can opt back to a higher cap explicitly), `JARVIS_SHELL_TIMEOUT_MS` (default `30000`, per-call default for `shell.exec`), `JARVIS_MCP_SERVERS` (comma-separated `prefix=command args...` list of external MCP servers to spawn and adapt into Tools), @@ -968,9 +970,13 @@ binary appends the workspace's `AGENTS.md` / `CLAUDE.md` / `harness_tools::workspace::load_instructions(root, max_bytes)`. Each file is wrapped in a `=== project context: ===` header so the model can tell injected guidance from its own template. -Combined output is capped at 32 KiB (override: +Combined output is capped at 8 KiB (override: `JARVIS_PROJECT_CONTEXT_BYTES`); overflow is truncated with a -`[... project context truncated at N bytes ...]` marker. Disable +`[... project context truncated at N bytes ...]` marker, and +`apps/jarvis::serve` logs a startup WARN whenever truncation fires so +operators with larger instruction files notice and can raise the cap +deliberately. The lower default (was 32 KiB) keeps the system prompt +focused so prior tool results stay in the model's attention window. Disable entirely via `JARVIS_NO_PROJECT_CONTEXT=1` or `[agent].include_project_context = false`. `jarvis-cli` honours the same env vars plus a `--no-project-context` CLI flag. Deliberately diff --git a/apps/jarvis/src/serve.rs b/apps/jarvis/src/serve.rs index 510ca36..5a16be2 100644 --- a/apps/jarvis/src/serve.rs +++ b/apps/jarvis/src/serve.rs @@ -376,8 +376,22 @@ pub async fn run( if let Some(extra) = load_instructions_bounded(workspace_root.clone(), project_ctx_cap) { info!( bytes = extra.len(), + cap = project_ctx_cap, "loaded project instructions (AGENTS.md / JARVIS.md / CLAUDE.md / .jarvis)" ); + // The loader truncates at the cap and tags the suffix. When + // it fired, surface a WARN so operators with larger + // instruction files notice and can opt back to a higher cap + // via `JARVIS_PROJECT_CONTEXT_BYTES` rather than silently + // shipping truncated guidance to the model. + if extra.contains("project context truncated at") { + warn!( + cap = project_ctx_cap, + "project instructions exceeded cap — output truncated. \ + Set JARVIS_PROJECT_CONTEXT_BYTES= or [agent].project_context_max_bytes \ + to raise (default lowered to 8 KiB to keep system prompt focused)." + ); + } system_prompt.push_str("\n\n"); system_prompt.push_str(&extra); project_context_loaded = true; @@ -2018,7 +2032,18 @@ use ask.text instead of guessing."; /// checks, and end with a change report. const CODING_SYSTEM_PROMPT: &str = "You are Jarvis, a coding agent working in the user's repository. \ -Before editing, call workspace.context to orient yourself, then inspect git status. \ +\ +Reuse what you already gathered. Before calling any tool, scan earlier turns of THIS \ +conversation: if a recent tool result already answers the user's current question \ +(workspace.context, todo.list, requirement.list, triage.scan_candidates, fs.read of \ +the same path, code.grep of the same pattern, etc.), refer back to it instead of \ +re-running the tool. Re-running orientation tools on every user turn wastes the \ +user's tokens and dilutes your own attention. Only re-run when you have a specific \ +reason to believe the underlying state changed (a write happened, the user said the \ +file changed, enough turns have passed that staleness matters). \ +\ +If you don't already have the workspace layout in this conversation, call workspace.context \ +once to orient yourself, and inspect git status before editing. \ Do not overwrite user changes you did not make. \ Prefer code.grep, fs.read, fs.list, git.status, and git.diff before reaching for shell.exec. \ When you need a human decision, missing information, or a choice among acceptable options, \ @@ -2027,7 +2052,7 @@ Use fs.edit (uniqueness-checked single replace) or fs.patch (unified-diff multi- reviewable edits; reach for fs.write only to create new files. \ When you run checks (tests, lints, builds), keep them focused on the change rather than the \ whole repo. \ -At the start of a fresh session, call todo.list to see persistent project follow-ups; \ +Once per conversation (not per turn), call todo.list to see persistent project follow-ups; \ record new follow-ups via todo.add (not plan.update — that's for the current turn only) \ and mark them completed/blocked as you go. \ If your previous turn asked a yes/no question about a specific entity (\"标记为 Done?\", \ @@ -2049,7 +2074,8 @@ the final status report. \ Spec → Project workflow: when the user describes new work — either inline (\"add a user-avatar \ upload\") or by pointing at a doc (\"read docs/feature-x.md and lay out the work\") — drive the \ following sequence: \ -(1) call workspace.context, plus fs.read on any doc the user named; \ +(1) make sure you have the workspace context (call workspace.context only if you don't already have \ +it from earlier in this conversation), and fs.read any doc the user named; \ (2) call plan.update with the proposed breakdown (titles only) and let the user confirm or edit; \ (3) once confirmed, resolve the project (project.list / project.get; project.create only if it \ genuinely does not exist), then call requirement.create per item — populate `verification_plan.commands` \ @@ -2077,14 +2103,20 @@ fn include_project_context(cfg: &Config) -> bool { } /// Cap on the total bytes of project context appended to the system -/// prompt. Defaults to 32 KiB — enough for realistic agent -/// instruction files, far short of blowing a small-context model. +/// prompt. Defaults to 8 KiB — large enough for realistic AGENTS.md / +/// CLAUDE.md / .jarvis/rules files, but tight enough that the system +/// prompt doesn't drown out mid-conversation tool results in the +/// model's attention window. Override with `JARVIS_PROJECT_CONTEXT_BYTES` +/// or `[agent].project_context_max_bytes` when you have larger +/// instruction files you genuinely want to inline. The previous default +/// (32 KiB) regularly produced 30-KiB system blocks that pushed prior +/// tool output too deep for the model to reuse. fn project_context_max_bytes(cfg: &Config) -> usize { std::env::var("JARVIS_PROJECT_CONTEXT_BYTES") .ok() .and_then(|s| s.parse().ok()) .or(cfg.agent.project_context_max_bytes) - .unwrap_or(32 * 1024) + .unwrap_or(8 * 1024) } /// Resolve the file-based project memory switch. `Some(true)` means diff --git a/crates/harness-core/src/agent.rs b/crates/harness-core/src/agent.rs index e64dc0c..fd749e3 100644 --- a/crates/harness-core/src/agent.rs +++ b/crates/harness-core/src/agent.rs @@ -92,6 +92,15 @@ pub struct AgentConfig { /// dispatch is harmless because the LLM only emits one call at /// a time. pub parallel_tool_calls: bool, + /// When `true` (default), `ensure_system_prompt` will *replace* the + /// first `System` message of a loaded conversation when its content + /// no longer matches the configured `system_prompt`. The historical + /// behaviour was insert-if-missing only, which silently locked old + /// conversations into whatever prompt was active at creation — + /// updates to the binary's prompt template never reached resumed + /// sessions. Set to `false` for workflows that deliberately persist + /// per-conversation custom prompts. + pub refresh_system_prompt_on_resume: bool, } impl AgentConfig { @@ -108,6 +117,7 @@ impl AgentConfig { tool_filter: None, session_workspace: None, parallel_tool_calls: false, + refresh_system_prompt_on_resume: true, } } @@ -170,6 +180,13 @@ impl AgentConfig { self.parallel_tool_calls = enabled; self } + + /// Toggle resume-time refresh of the system prompt. See + /// [`AgentConfig::refresh_system_prompt_on_resume`]. + pub fn with_refresh_system_prompt_on_resume(mut self, enabled: bool) -> Self { + self.refresh_system_prompt_on_resume = enabled; + self + } } /// What ended a `run` invocation. @@ -375,7 +392,11 @@ impl Agent { } async fn run_inner(&self, conversation: &mut Conversation) -> Result<(RunOutcome, Usage)> { - Self::ensure_system_prompt(conversation, self.config.system_prompt.as_deref()); + Self::ensure_system_prompt( + conversation, + self.config.system_prompt.as_deref(), + self.config.refresh_system_prompt_on_resume, + ); let mut total_usage = Usage::default(); let run_span = Span::current(); @@ -519,7 +540,11 @@ impl Agent { jarvis.transport = "stream", ); let inner = stream! { - Self::ensure_system_prompt(&mut conversation, agent.config.system_prompt.as_deref()); + Self::ensure_system_prompt( + &mut conversation, + agent.config.system_prompt.as_deref(), + agent.config.refresh_system_prompt_on_resume, + ); for iter in 1..=agent.config.max_iterations { let req = match agent.build_request(&conversation).await { @@ -1145,15 +1170,28 @@ impl Agent { }) } - fn ensure_system_prompt(conv: &mut Conversation, prompt: Option<&str>) { + fn ensure_system_prompt(conv: &mut Conversation, prompt: Option<&str>, refresh: bool) { let Some(prompt) = prompt else { return }; - let has_system = conv - .messages - .first() - .map(|m| matches!(m, Message::System { .. })) - .unwrap_or(false); - if !has_system { - conv.messages.insert(0, Message::system(prompt)); + match conv.messages.first() { + Some(Message::System { content, .. }) if content == prompt => { + // Already in sync — nothing to do. + } + Some(Message::System { .. }) if refresh => { + // First message is a stale System (different content from + // the configured prompt) and the refresh policy is on: + // replace it. This is what unblocks resumed conversations + // from staying stuck on whatever prompt was active when + // they were originally created. + conv.messages[0] = Message::system(prompt); + } + Some(Message::System { .. }) => { + // Stale but refresh is off — historical behaviour, leave + // the persisted custom prompt alone. + } + _ => { + // No leading System at all — insert one. + conv.messages.insert(0, Message::system(prompt)); + } } } @@ -1883,4 +1921,53 @@ mod tests { let req = futures::executor::block_on(agent.build_request(&conv)).unwrap(); assert_eq!(req.parallel_tool_calls, None); } + + #[test] + fn ensure_system_prompt_inserts_when_missing() { + let mut conv = Conversation::new(); + conv.push(Message::user("hi")); + Agent::ensure_system_prompt(&mut conv, Some("PROMPT"), true); + assert!(matches!(conv.messages[0], Message::System { ref content, .. } if content == "PROMPT")); + assert_eq!(conv.messages.len(), 2); + } + + #[test] + fn ensure_system_prompt_skips_when_already_in_sync() { + let mut conv = Conversation::new(); + conv.push(Message::system("PROMPT")); + conv.push(Message::user("hi")); + let before_len = conv.messages.len(); + Agent::ensure_system_prompt(&mut conv, Some("PROMPT"), true); + assert_eq!(conv.messages.len(), before_len); + assert!(matches!(conv.messages[0], Message::System { ref content, .. } if content == "PROMPT")); + } + + #[test] + fn ensure_system_prompt_replaces_stale_when_refresh_on() { + let mut conv = Conversation::new(); + conv.push(Message::system("STALE")); + conv.push(Message::user("hi")); + Agent::ensure_system_prompt(&mut conv, Some("FRESH"), true); + assert_eq!(conv.messages.len(), 2); + assert!(matches!(conv.messages[0], Message::System { ref content, .. } if content == "FRESH")); + } + + #[test] + fn ensure_system_prompt_keeps_stale_when_refresh_off() { + let mut conv = Conversation::new(); + conv.push(Message::system("STALE")); + conv.push(Message::user("hi")); + Agent::ensure_system_prompt(&mut conv, Some("FRESH"), false); + assert_eq!(conv.messages.len(), 2); + assert!(matches!(conv.messages[0], Message::System { ref content, .. } if content == "STALE")); + } + + #[test] + fn ensure_system_prompt_no_op_when_no_prompt_configured() { + let mut conv = Conversation::new(); + conv.push(Message::system("KEEP")); + conv.push(Message::user("hi")); + Agent::ensure_system_prompt(&mut conv, None, true); + assert!(matches!(conv.messages[0], Message::System { ref content, .. } if content == "KEEP")); + } } diff --git a/crates/harness-core/src/lib.rs b/crates/harness-core/src/lib.rs index 6106379..7b51154 100644 --- a/crates/harness-core/src/lib.rs +++ b/crates/harness-core/src/lib.rs @@ -66,7 +66,7 @@ pub use label::{ pub use llm::{ChatRequest, ChatResponse, FinishReason, LlmChunk, LlmProvider, LlmStream, Usage}; pub use memory::{ cache_breakpoint_indices, default_estimator, estimate_tokens, estimate_total_tokens, - CharRatioEstimator, Memory, TokenEstimator, + CharRatioEstimator, JsonAwareEstimator, Memory, TokenEstimator, }; pub use message::{CacheHint, Message, ToolCall}; pub use observability::{ diff --git a/crates/harness-core/src/memory.rs b/crates/harness-core/src/memory.rs index 844a716..7fe5cb5 100644 --- a/crates/harness-core/src/memory.rs +++ b/crates/harness-core/src/memory.rs @@ -76,6 +76,53 @@ impl TokenEstimator for CharRatioEstimator { } } +/// Variant estimator that keeps `CharRatioEstimator`'s numbers for +/// natural-language messages but uses a tighter ratio (`chars * 2 / 7`, +/// i.e. roughly `chars / 3.5`) plus a larger per-message overhead for +/// `Message::Tool` payloads, which are typically dense JSON output from +/// agent tools (escapes, brackets, quoted keys) and which the +/// `chars / 4 + 4` heuristic systematically underestimates by ~15-25 %. +/// +/// Use this when a memory backend is wired without a provider-supplied +/// tokeniser (e.g. tests, jarvis-cli pipe-mode runs) and the +/// conversation involves substantial tool output. Production paths +/// already plug in `TiktokenEstimator` via `LlmProvider::estimator`, +/// which is more accurate still — `JsonAwareEstimator` only matters +/// where the provider doesn't override. +#[derive(Debug, Default, Clone, Copy)] +pub struct JsonAwareEstimator; + +impl JsonAwareEstimator { + pub const fn new() -> Self { + Self + } +} + +impl TokenEstimator for JsonAwareEstimator { + fn estimate_message(&self, message: &Message) -> usize { + match message { + Message::Tool { + tool_call_id, + content, + .. + } => { + let chars = tool_call_id.chars().count() + content.chars().count(); + // chars / 3.5 == chars * 2 / 7. Rounded up to be safe. + (chars * 2).div_ceil(7) + 8 + } + other => estimate_tokens(other), + } + } + + fn estimate_text(&self, text: &str) -> usize { + // Plain text doesn't carry the JSON-overhead penalty — fall back + // to the cheap ratio. This stays consistent with how + // `SummarizingMemory` measures the synthetic summary string, + // which is prose, not JSON. + text.chars().count().div_ceil(4) + } +} + /// Cheap, provider-agnostic token estimate. Roughly `chars / 4` plus a /// fixed per-message overhead for role/separator tokens. This is *not* a /// precise tiktoken count — it's a heuristic that lets memory backends @@ -221,4 +268,46 @@ mod tests { let msgs = vec![Message::system("sys"), Message::user("hi")]; assert!(cache_breakpoint_indices(&msgs).is_empty()); } + + #[test] + fn json_aware_estimates_tool_higher_than_char_ratio() { + let json_payload = r#"{"branch":"main","head":"abc1234","dirty":false,"instructions":[],"manifests":[{"path":"Cargo.toml","kind":"rust"}]}"#; + let tool_msg = Message::Tool { + tool_call_id: "call_workspace_ctx".into(), + content: json_payload.to_string(), + cache: None, + }; + let char_ratio = CharRatioEstimator; + let json_aware = JsonAwareEstimator; + let cr = char_ratio.estimate_message(&tool_msg); + let ja = json_aware.estimate_message(&tool_msg); + assert!( + ja > cr, + "json-aware estimate ({ja}) should exceed char-ratio ({cr}) for dense JSON" + ); + // Sanity: not absurdly higher (ballpark 15-30 % over). + assert!(ja < cr * 2, "json-aware ({ja}) blew past 2x char-ratio ({cr})"); + } + + #[test] + fn json_aware_matches_char_ratio_for_non_tool_messages() { + let json_aware = JsonAwareEstimator; + let char_ratio = CharRatioEstimator; + for m in [ + Message::system("you are jarvis"), + Message::user("hello"), + Message::assistant_text("hi there"), + ] { + assert_eq!(json_aware.estimate_message(&m), char_ratio.estimate_message(&m)); + } + } + + #[test] + fn json_aware_text_matches_char_ratio() { + let json_aware = JsonAwareEstimator; + let char_ratio = CharRatioEstimator; + for s in ["", "hi", "the quick brown fox", "0123456789abcdef"] { + assert_eq!(json_aware.estimate_text(s), char_ratio.estimate_text(s)); + } + } } diff --git a/crates/harness-memory/src/summarizing.rs b/crates/harness-memory/src/summarizing.rs index ac1e743..63a90f1 100644 --- a/crates/harness-memory/src/summarizing.rs +++ b/crates/harness-memory/src/summarizing.rs @@ -62,7 +62,17 @@ pub const DEFAULT_SUMMARY_PROMPT: &str = "\ You are a conversation summariser. Compress the supplied excerpt into a \ short paragraph. Preserve concrete facts, decisions, file paths, names, \ numbers, and any in-flight tool results that later turns may rely on. \ -Do not invent details, do not editorialise, and do not add a preamble."; +Do not invent details, do not editorialise, and do not add a preamble.\n\n\ +BAD (do not do this):\n\ +\"The user wants me to summarise the conversation. I need to compress it into a short \ +paragraph. Key facts: ...\"\n\n\ +GOOD:\n\ +\"User asked for a kanban review of the jarvis-roadmap project. Agent ran \ +requirement.list (30 items, 12 in_progress) and triage.scan_candidates (4 todo \ +markers). Decided to focus on Web UI experience cluster; user approved.\"\n\n\ +Output ONLY the summary content. No \"The user wants me to...\", no \"I'll summarise...\", \ +no \"Here is a summary...\", no \"This excerpt...\" — start directly with the substantive \ +fact, decision, or state."; /// Reserved budget (in estimated tokens) carved out for the summary /// itself when planning what to keep recent. Keeps us from packing the @@ -354,7 +364,7 @@ impl SummarizingMemory { // ended up persisting empty summaries and the agent saw no // prior context, hallucinating "对话历史被压缩了" on every // follow-up. - let text = match resp.message { + let raw = match resp.message { Message::Assistant { content, reasoning_content, @@ -376,6 +386,14 @@ impl SummarizingMemory { _ => return Err("summariser returned no assistant text".into()), }; + // Belt-and-braces: even with the BAD/GOOD-example prompt, smaller + // models often leak "The user wants me to summarise..." preambles + // that eat 100-300 chars of the cache slot before the actual + // facts start. We strip a conservative set of openers here so + // downstream callers (and operators reading the persisted cache) + // get the substantive content directly. + let text = strip_summary_preamble(&raw); + // Populate both cache tiers. self.cache_set(&fp, &text); if let Some(store) = &self.persistence { @@ -418,6 +436,58 @@ fn persist_key(fingerprint: &str) -> String { format!("{PERSIST_KEY_PREFIX}{fingerprint}") } +/// Conservative preamble stripper for summariser output. +/// +/// Smaller models often disregard the prompt's "no preamble" rule and +/// emit something like *"The user wants me to summarise the conversation +/// excerpt. Key facts: ..."*. The first sentence (or paragraph) is +/// dead weight that eats the cache slot before the substantive content +/// starts. This function looks for a known opener and, if found, skips +/// past the first paragraph break (`\n\n`). When no clear preamble is +/// present, or no paragraph break exists within the budget, it returns +/// the input unchanged — better to keep a borderline-OK first sentence +/// than to risk amputating real content. +/// +/// Returns `String` (not `&str`) because the borrowed slice is rooted +/// in the input; callers wanted ownership anyway. +fn strip_summary_preamble(text: &str) -> String { + // Cheap guard: if it doesn't start with a known opener, return as-is. + // We compare on a leading-window prefix (lower-cased once) rather than + // case-folding the whole string. + const OPENERS: &[&str] = &[ + "the user wants me to", + "the user is asking", + "the user has asked", + "the user asked me to", + "i'll summari", + "i will summari", + "let me summari", + "here is a summary", + "here's a summary", + "this excerpt", + "this conversation excerpt", + "the conversation excerpt", + ]; + // Window large enough for the opener but cheap to lowercase. + let window: String = text.chars().take(40).collect::().to_lowercase(); + if !OPENERS.iter().any(|opener| window.starts_with(opener)) { + return text.to_string(); + } + // Find the first paragraph break within a generous budget. If none, + // bail out to avoid eating the whole answer. + const SCAN_BUDGET: usize = 800; + let scan_end = text.char_indices().nth(SCAN_BUDGET).map(|(i, _)| i).unwrap_or(text.len()); + let head = &text[..scan_end]; + if let Some(rel) = head.find("\n\n") { + let after = &text[rel + 2..]; + let trimmed = after.trim_start(); + if !trimmed.is_empty() { + return trimmed.to_string(); + } + } + text.to_string() +} + fn extract_summary(conv: &Conversation) -> Option { conv.messages.iter().find_map(|m| match m { // Reject blank cached summaries — early kimi-coding runs wrote @@ -946,4 +1016,43 @@ mod tests { .iter() .any(|m| matches!(m, Message::System { content, .. } if content.contains("SUMMARY")))); } + + #[test] + fn strip_preamble_removes_common_meta_opener() { + let raw = "The user wants me to summarise the conversation excerpt. \ +I need to compress it into a short paragraph.\n\n\ +Key facts: agent ran requirement.list (30 items, 12 in_progress)."; + let out = strip_summary_preamble(raw); + assert!(out.starts_with("Key facts:"), "got: {out:?}"); + assert!(!out.contains("The user wants me to")); + } + + #[test] + fn strip_preamble_handles_summarize_us_spelling() { + let raw = "The user wants me to summarize the conversation. The excerpt is in Chinese.\n\n\ +Decision: focus on Web UI experience."; + let out = strip_summary_preamble(raw); + assert!(out.starts_with("Decision:")); + } + + #[test] + fn strip_preamble_leaves_clean_summary_alone() { + let raw = "User asked for kanban review. Agent ran requirement.list."; + assert_eq!(strip_summary_preamble(raw), raw); + } + + #[test] + fn strip_preamble_bails_when_no_paragraph_break() { + // Preamble pattern present but no \n\n boundary — leave as-is rather + // than risk amputating the only content. + let raw = "The user wants me to summarise something. Key facts inline."; + assert_eq!(strip_summary_preamble(raw), raw); + } + + #[test] + fn strip_preamble_handles_lets_summarise() { + let raw = "Let me summarise what happened.\n\nUser approved the kanban triage."; + let out = strip_summary_preamble(raw); + assert!(out.starts_with("User approved")); + } }