diff --git a/README.md b/README.md index 74157aa7..11581f79 100644 --- a/README.md +++ b/README.md @@ -85,9 +85,22 @@ bluebox-agent --model claude-opus-4-5 - Falls back to an AI browser agent for tasks without predefined routines - Post-processes outputs using Python (CSV, JSON, etc.) - Saves generated files to a local workspace +- Generates reusable **context files** to replay successful sessions instantly Ask it anything: *"Run a price analysis on Rolex Sea Dweller 16600"* — the agent automatically selects the right routine, runs it, and delivers structured results. +### Context (session replay) + +After a successful session, run `/generate_context` to save a snapshot of what worked — the goal, routines called (with exact parameters), any Python post-processing code, and output descriptions. Context files are saved to the workspace `context/` directory in both JSON and Markdown formats. + +When the agent starts a new session, it automatically loads the most recent context file from the workspace `context/` directory (JSON preferred, Markdown as a fallback) and injects it into the system prompt. This lets the agent **skip trial and error** and directly replay the known-good path, adjusting parameters as needed for the new request. + +You can also load a specific context file (`.json` or `.md`) explicitly: + +```bash +bluebox-agent --context-file path/to/agent_context.json +``` + ## Create your own routines To learn about the core technology powering BlueBox, see [routine_discovery.md](routine_discovery.md). 
diff --git a/bluebox/agents/bluebox_agent.py b/bluebox/agents/bluebox_agent.py index 82fce628..1fb3ab71 100644 --- a/bluebox/agents/bluebox_agent.py +++ b/bluebox/agents/bluebox_agent.py @@ -13,6 +13,7 @@ import json from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime +from pathlib import Path from textwrap import dedent from typing import Any, Callable @@ -21,6 +22,7 @@ from bluebox.agents.abstract_agent import AbstractAgent, AgentCard, agent_tool from bluebox.agents.workspace import AgentWorkspace, LocalWorkspace from bluebox.config import Config +from bluebox.data_models.agents.context import BlueBoxAgentContext, UsedRoutine from bluebox.data_models.browser_agent import ( BrowserAgentDoneEvent, BrowserAgentErrorEvent, @@ -50,6 +52,9 @@ logger = get_logger(name=__name__) + + + class BlueBoxAgent(AbstractAgent): """ BlueBoxAgent that searches and executes web automation routines. @@ -79,6 +84,7 @@ class BlueBoxAgent(AbstractAgent): Your workspace has the following structure: - `raw/` — routine result JSON files, saved automatically when routines execute - `outputs/` — write all your generated output files here (CSV, JSON, JSONL, etc.) + - `context/` — context files (JSON + Markdown) saved by `generate_context`, used for session replay **Pre-loaded variables in `run_python_code`:** - `routine_results` — list of dicts, one per JSON file in raw/ @@ -128,6 +134,7 @@ class BlueBoxAgent(AbstractAgent): - When using `execute_browser_task`, write a specific, step-by-step task description so the browser agent knows exactly what to do. - If your first search returns no results, try rephrasing the task description before giving up. - Be concise in responses. + - Be thorough and persistent — keep iterating until the output is correct. 
""").strip() ## Magic methods @@ -144,6 +151,7 @@ def __init__( workspace: AgentWorkspace | None = None, auth_headers_provider: Callable[[], dict[str, str]] | None = None, on_llm_response: Callable[[LLMChatResponse], None] | None = None, + context_file: str | None = None, ) -> None: """ Initialize the BlueBox Agent. @@ -160,6 +168,9 @@ def __init__( auth_headers_provider: Optional callback that returns auth headers for downstream API calls. If not provided, falls back to Config.VECTORLY_SERVICE_TOKEN. on_llm_response: Optional callback invoked after each LLM call with the response (for token tracking). + context_file: Optional path to a context file (.json or .md) from a previous + session. If not provided, auto-discovers the most recent context file from + the workspace's context/ directory. """ # Validate required config self._auth_headers_provider = auth_headers_provider @@ -169,6 +180,9 @@ def __init__( self._workspace = workspace or LocalWorkspace() self._routine_cache: dict[str, RoutineInfo] = {} + # Load context from explicit path or auto-discover from workspace + self._agent_context: BlueBoxAgentContext | None = self._load_context(context_file) + super().__init__( emit_message_callable=emit_message_callable, persist_chat_callable=persist_chat_callable, @@ -186,12 +200,20 @@ def __init__( self._is_blocklist_mode = self._sandbox_mode == "blocklist" logger.debug( - "BlueBoxAgent initialized with model: %s, chat_thread_id: %s, sandbox_mode: %s", + "BlueBoxAgent initialized with model: %s, chat_thread_id: %s, sandbox_mode: %s, has_context: %s", llm_model, self._thread.id, self._sandbox_mode, + self._agent_context is not None, ) + ## Properties + + @property + def loaded_context(self) -> BlueBoxAgentContext | None: + """The context loaded on init, if any.""" + return self._agent_context + ## Auth def _get_auth_headers(self) -> dict[str, str]: @@ -210,6 +232,8 @@ def _get_system_prompt(self) -> str: prompt = self.SYSTEM_PROMPT + time_info if 
self._is_blocklist_mode: prompt += self._get_blocklist_sandbox_prompt_section() + if self._agent_context: + prompt += self._get_context_prompt_section() return prompt def _get_blocklist_sandbox_prompt_section(self) -> str: @@ -281,6 +305,106 @@ def _validate_routine_params(self, routine_id: str, params: dict[str, Any]) -> s ) return None + ## Context loading + + _CONTEXT_PROMPT_MAX_CHARS: int = 20_000 + + def _load_context(self, context_file: str | None) -> BlueBoxAgentContext | None: + """Load context from an explicit path or auto-discover from workspace context/ dir. + + Resolution order for context_file: + 1. Absolute path + 2. Relative to workspace root + + If context_file is None, auto-discovers the most recent .json file in context/. + """ + if context_file: + return self._load_context_from_path(context_file) + return self._auto_discover_context() + + def _load_context_from_path(self, context_file: str) -> BlueBoxAgentContext | None: + """Load a context file from an explicit path (absolute or workspace-relative).""" + path = Path(context_file) + if not path.is_file() and not path.is_absolute(): + path = self._workspace.root_path / context_file + if not path.is_file(): + logger.warning("Context file not found: %s", path) + return None + try: + raw = path.read_text(encoding="utf-8") + if path.suffix == ".md": + ctx = BlueBoxAgentContext.from_markdown(raw) + else: + ctx = BlueBoxAgentContext.model_validate_json(raw) + logger.info("Loaded agent context from %s", path) + return ctx + except Exception as e: + logger.warning("Failed to load context file %s: %s", path, e) + return None + + def _auto_discover_context(self) -> BlueBoxAgentContext | None: + """Find and load the most recent context file from workspace context/ dir. + + Prefers .json files over .md when both exist. Falls back to .md if no + JSON context files are present. 
+ """ + context_dir = self._workspace.root_path / "context" + if not context_dir.is_dir(): + return None + # Prefer JSON, fall back to Markdown + for ext in ("*.json", "*.md"): + files = sorted(context_dir.glob(ext), key=lambda p: p.stat().st_mtime, reverse=True) + if files: + return self._load_context_from_path(str(files[0])) + return None + + def _get_context_prompt_section(self) -> str: + """Build a system prompt section from a loaded BlueBoxAgentContext.""" + ctx = self._agent_context + if not ctx: + return "" + + section = ( + "\n\n## Prior Context\n" + "A previous session already solved a similar task. Use this as a starting point.\n" + "Replicate this path if the user's goal matches. " + "Adjust parameters for the new request. Skip trial and error.\n\n" + + ctx.to_markdown() + ) + + if len(section) > self._CONTEXT_PROMPT_MAX_CHARS: + section = section[:self._CONTEXT_PROMPT_MAX_CHARS] + ( + "\n\n... (context truncated — use `read_workspace_file` to read " + "the full context files in `context/` for more detail)" + ) + + return section + + def _extract_routines_from_raw(self) -> list[UsedRoutine]: + """Extract routine info from raw/ execution result files. + + Each raw JSON file contains routine_id, routine_name, parameters, + and status from a previous execution. Returns deduplicated list + of successfully executed routines. 
+ """ + raw_results = self._workspace.load_raw_json() + seen: set[str] = set() + routines: list[UsedRoutine] = [] + for rr in raw_results: + rid = rr.get("routine_id") + if not rid or rid in seen: + continue + # Only include completed executions + if rr.get("status") != "completed": + continue + seen.add(rid) + routines.append(UsedRoutine.from_dict_params( + routine_id=rid, + routine_name=rr.get("routine_name", rid), + parameters=rr.get("parameters", {}), + )) + return routines + ## Tool handlers @agent_tool() @@ -340,8 +464,9 @@ def _execute_routines_in_parallel( def save_result(result: dict[str, Any]) -> dict[str, Any]: """Save a single routine result to a JSON file in raw/.""" try: + ts = datetime.now().strftime("%y-%m-%d-%H%M%S") save_info = self._workspace.save_file( - "raw", "routine_result", + "raw", f"{ts}-routine_result.json", json.dumps(result, indent=2, default=str), ) result.update(save_info) @@ -481,8 +606,9 @@ def _execute_browser_task( final_result = result.get("final_result") if final_result: try: + ts = datetime.now().strftime("%y-%m-%d-%H%M%S") save_info = self._workspace.save_file( - "outputs", "browser_agent", final_result, extension=".md", + "outputs", f"{ts}-browser_agent.md", final_result, ) result.update(save_info) except Exception as e: @@ -661,3 +787,73 @@ def _read_workspace_file( end_line: Optional 1-based end line number (inclusive). Omit to read to the end. """ return self._workspace.read_file(path, start_line=start_line, end_line=end_line) + + ## Context generation (structured output, called by TUI slash command) + + def generate_context(self, focus: str | None = None) -> BlueBoxAgentContext: + """Generate a context file from the current session using structured output. + + Makes a direct LLM call with response_model=BlueBoxAgentContext to get + a validated Pydantic model back. Saves both JSON and Markdown files to + the workspace context/ directory. + + Args: + focus: Optional user-provided focus prompt to guide context generation. 
+ + Returns: + The validated BlueBoxAgentContext. + + Raises: + ValueError: If the LLM fails to produce a valid context. + """ + raw_routines = self._extract_routines_from_raw() + + system_prompt = ( + "You are analyzing a BlueBox Agent conversation to extract a reusable context file. " + "Fill in every field of the BlueBoxAgentContext schema based on the conversation.\n\n" + "CRITICAL: routines_used must include every routine that was executed with exact " + "routine_id, routine_name, and parameter values.\n" + "Include the final working python_code snippet if post-processing was done.\n" + "Include output_files with relative paths of files written to outputs/.\n" + ) + if raw_routines: + system_prompt += "\nRoutines found in execution results:\n" + for r in raw_routines: + system_prompt += f"- {r.routine_name} ({r.routine_id}): {json.dumps(r.parameters_as_dict(), default=str)}\n" + if focus: + system_prompt += f"\nUser focus: {focus}\n" + + # One-off structured output call that sees the full conversation via + # OpenAI's response chaining (previous_response_id reconstructs the + # thread server-side). We don't update self._previous_response_id + # afterward so this call doesn't affect the agent loop. 
+ response = self.llm_client.call_sync( + input="Generate a reusable context file from this conversation.", + system_prompt=system_prompt, + response_model=BlueBoxAgentContext, + previous_response_id=self._previous_response_id, + ) + context = response.parsed + if context is None: + raise ValueError("LLM failed to produce a valid BlueBoxAgentContext") + + # Safety net: merge raw routines if LLM left routines_used empty + if not context.routines_used and raw_routines: + context.routines_used = raw_routines + logger.info( + "Auto-populated %d routine(s) from raw/ execution results", + len(raw_routines), + ) + + # Save canonical JSON + json_save = self._workspace.save_file( + "context", "agent_context.json", context.model_dump_json(indent=2), + ) + + # Save companion Markdown + md_save = self._workspace.save_file( + "context", "agent_context.md", context.to_markdown(), + ) + + logger.info("Context files saved: %s, %s", json_save["output_file"], md_save["output_file"]) + return context diff --git a/bluebox/agents/workspace.py b/bluebox/agents/workspace.py index 999c0db8..869d3bdb 100644 --- a/bluebox/agents/workspace.py +++ b/bluebox/agents/workspace.py @@ -11,9 +11,7 @@ from __future__ import annotations import json -import threading from abc import ABC, abstractmethod -from datetime import datetime from pathlib import Path from typing import Any @@ -27,9 +25,10 @@ class AgentWorkspace(ABC): """ Abstract workspace that agents use for file I/O. 
- A workspace has two logical subdirectories: + A workspace has three logical subdirectories: - raw/ : input data (e.g., routine results saved automatically) - outputs/: agent-generated output files (e.g., CSVs, processed JSON) + - context/: reusable context files from successful sessions """ @property @@ -41,17 +40,15 @@ def root_path(self) -> Path: def save_file( self, subdirectory: str, - filename_prefix: str, + filename: str, content: str, - extension: str = ".json", ) -> dict[str, str]: - """Save content with a unique timestamped filename. + """Save content to a file in the workspace. Args: - subdirectory: Logical subdirectory ("raw" or "outputs"). - filename_prefix: Prefix for the generated filename. + subdirectory: Logical subdirectory ("raw", "outputs", or "context"). + filename: The filename to use (e.g. "routine_result_1.json"). content: File content to write. - extension: File extension including the dot. Returns: Dict with at least "output_file" key (the saved path). @@ -113,7 +110,7 @@ def diff_outputs(self, before: dict[str, float]) -> list[str]: @abstractmethod def ensure_dirs(self) -> None: - """Ensure the workspace directory structure exists (raw/, outputs/).""" + """Ensure the workspace directory structure exists (raw/, outputs/, context/).""" class LocalWorkspace(AgentWorkspace): @@ -123,8 +120,8 @@ def __init__(self, workspace_dir: str = "./bluebox_workspace") -> None: self._workspace_dir = Path(workspace_dir) self._raw_dir = self._workspace_dir / "raw" self._outputs_dir = self._workspace_dir / "outputs" - self._execution_counter: int = 0 - self._counter_lock = threading.Lock() + self._context_dir = self._workspace_dir / "context" + self.ensure_dirs() @property def root_path(self) -> Path: @@ -133,17 +130,11 @@ def root_path(self) -> Path: def save_file( self, subdirectory: str, - filename_prefix: str, + filename: str, content: str, - extension: str = ".json", ) -> dict[str, str]: directory = self._workspace_dir / subdirectory 
directory.mkdir(parents=True, exist_ok=True) - with self._counter_lock: - self._execution_counter += 1 - idx = self._execution_counter - timestamp = datetime.now().strftime("%y-%m-%d-%H%M%S") - filename = f"{timestamp}-{filename_prefix}_{idx}{extension}" output_path = directory / filename output_path.write_text(content) logger.info("Result saved to %s", output_path) @@ -230,3 +221,4 @@ def diff_outputs(self, before: dict[str, float]) -> list[str]: def ensure_dirs(self) -> None: self._raw_dir.mkdir(parents=True, exist_ok=True) self._outputs_dir.mkdir(parents=True, exist_ok=True) + self._context_dir.mkdir(parents=True, exist_ok=True) diff --git a/bluebox/data_models/agents/__init__.py b/bluebox/data_models/agents/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bluebox/data_models/agents/context.py b/bluebox/data_models/agents/context.py new file mode 100644 index 00000000..f5f2f329 --- /dev/null +++ b/bluebox/data_models/agents/context.py @@ -0,0 +1,268 @@ +""" +bluebox/data_models/agents/context.py + +Data model for BlueBoxAgent context files. + +A context file captures the successful path through a BlueBoxAgent +conversation so a new agent instance can replay it without trial and error. + +Supports dual format: canonical JSON (Pydantic) and human-readable Markdown, +with round-trip parsing between both. 
+""" + +from __future__ import annotations + +import json +import re +from datetime import datetime, timezone +from typing import Any + +from pydantic import BaseModel, Field + + +class UsedRoutineParameter(BaseModel): + """A single parameter key-value pair used in a routine execution.""" + + key: str = Field(..., description="Parameter name") + value: str | bool | int | float = Field(..., description="Parameter value") + + +class UsedRoutine(BaseModel): + """One routine that was successfully executed during the session.""" + + routine_id: str = Field(..., description="Routine ID from search_routines results") + routine_name: str = Field(..., description="Human-readable routine name") + parameters: list[UsedRoutineParameter] = Field( + default_factory=list, + description="Parameter key-value pairs that produced correct results", + ) + + def parameters_as_dict(self) -> dict[str, str | bool | int | float]: + """Convert parameters list to a dict for convenience.""" + return {p.key: p.value for p in self.parameters} + + @classmethod + def from_dict_params( + cls, routine_id: str, routine_name: str, parameters: dict[str, Any], + ) -> UsedRoutine: + """Convenience constructor that accepts a dict of parameters.""" + return cls( + routine_id=routine_id, + routine_name=routine_name, + parameters=[UsedRoutineParameter(key=k, value=v) for k, v in parameters.items()], + ) + + +class BlueBoxAgentContext(BaseModel): + """ + Structured snapshot of a successful BlueBoxAgent session. + + Serialized to JSON and saved to context/. Consumed by a new + BlueBoxAgent instance via system prompt injection. 
+ """ + + version: int = Field(default=1, description="Schema version for forward compatibility") + goal: str = Field(..., description="The user's original request, in their own words") + routines_used: list[UsedRoutine] = Field( + default_factory=list, + description="Routines that produced useful results, in execution order", + ) + python_code: str | None = Field( + default=None, + description="The final working Python post-processing snippet", + ) + output_files: list[str] = Field( + default_factory=list, + description="Relative paths of output files written to outputs/", + ) + output_description: str = Field( + ..., + description="Prose description of the output: format, key fields, row count if known", + ) + summary: str = Field( + ..., + description="1-2 sentence human-readable summary of what was accomplished", + ) + generated_at: datetime = Field( + default_factory=lambda: datetime.now(tz=timezone.utc), + description="When this context was generated", + ) + + # ── Markdown serialization ─────────────────────────────────────────── + + def to_markdown(self) -> str: + """Render as structured Markdown with fenced sections for round-tripping.""" + lines: list[str] = [] + lines.append("# BlueBox Agent Context") + lines.append("") + lines.append(f"**Version:** {self.version}") + lines.append(f"**Generated:** {self.generated_at.isoformat()}") + lines.append("") + + lines.append("## Goal") + lines.append("") + lines.append(self.goal) + lines.append("") + + lines.append("## Summary") + lines.append("") + lines.append(self.summary) + lines.append("") + + if self.routines_used: + lines.append("## Routines Used") + lines.append("") + for r in self.routines_used: + lines.append(f"### {r.routine_name} (`{r.routine_id}`)") + lines.append("") + if r.parameters: + lines.append("**Parameters:**") + lines.append("```json") + lines.append(json.dumps(r.parameters_as_dict(), indent=2, default=str)) + lines.append("```") + else: + lines.append("No parameters.") + lines.append("") + 
+ if self.python_code: + lines.append("## Python Code") + lines.append("") + lines.append("```python") + lines.append(self.python_code) + lines.append("```") + lines.append("") + + if self.output_files: + lines.append("## Output Files") + lines.append("") + for f in self.output_files: + lines.append(f"- `{f}`") + lines.append("") + + lines.append("## Output Description") + lines.append("") + lines.append(self.output_description) + lines.append("") + + return "\n".join(lines) + + @classmethod + def from_markdown(cls, text: str) -> BlueBoxAgentContext: + """Parse structured Markdown back into BlueBoxAgentContext.""" + sections = _split_markdown_sections(text) + + # Version and generated_at from header + version = 1 + generated_at = datetime.now(tz=timezone.utc) + header = sections.get("BlueBox Agent Context", "") + version_match = re.search(r"\*\*Version:\*\*\s*(\d+)", header) + if version_match: + version = int(version_match.group(1)) + generated_match = re.search(r"\*\*Generated:\*\*\s*(.+)", header) + if generated_match: + try: + generated_at = datetime.fromisoformat(generated_match.group(1).strip()) + except ValueError: + pass + + goal = sections.get("Goal", "").strip() + summary = sections.get("Summary", "").strip() + output_description = sections.get("Output Description", "").strip() + + # Parse routines from subsections + routines_used = _parse_routines_section(sections.get("Routines Used", "")) + + # Parse python code from fenced block + python_code = _extract_fenced_block(sections.get("Python Code", ""), "python") + + # Parse output files + output_files: list[str] = [] + for line in sections.get("Output Files", "").splitlines(): + match = re.match(r"^-\s*`(.+)`", line.strip()) + if match: + output_files.append(match.group(1)) + + return cls( + version=version, + goal=goal, + summary=summary, + output_description=output_description, + routines_used=routines_used, + python_code=python_code, + output_files=output_files, + generated_at=generated_at, + ) + + +# 
── Markdown parsing helpers ───────────────────────────────────────────── + + +def _split_markdown_sections(text: str) -> dict[str, str]: + """Split Markdown into {heading: body} pairs. Handles H1 and H2 levels.""" + sections: dict[str, str] = {} + current_heading: str | None = None + current_lines: list[str] = [] + + for line in text.splitlines(): + heading_match = re.match(r"^#{1,2}\s+(.+)$", line) + if heading_match: + if current_heading is not None: + sections[current_heading] = "\n".join(current_lines) + current_heading = heading_match.group(1).strip() + current_lines = [] + else: + current_lines.append(line) + + if current_heading is not None: + sections[current_heading] = "\n".join(current_lines) + + return sections + + +def _extract_fenced_block(text: str, language: str | None = None) -> str | None: + """Extract the first fenced code block from text, optionally matching language.""" + if language: + pattern = rf"```{re.escape(language)}\n(.*?)```" + else: + pattern = r"```\w*\n(.*?)```" + match = re.search(pattern, text, re.DOTALL) + if match: + return match.group(1).rstrip("\n") + return None + + +def _parse_routines_section(text: str) -> list[UsedRoutine]: + """Parse the Routines Used section into UsedRoutine objects.""" + routines: list[UsedRoutine] = [] + if not text.strip(): + return routines + + # Split on H3 headers: ### RoutineName (`routine_id`) + parts = re.split(r"^###\s+", text, flags=re.MULTILINE) + for part in parts: + if not part.strip(): + continue + # Parse header: "RoutineName (`routine_id`)" + header_match = re.match(r"^(.+?)\s*\(`([^`]+)`\)", part) + if not header_match: + continue + routine_name = header_match.group(1).strip() + routine_id = header_match.group(2).strip() + + # Parse parameters from JSON code block + param_list: list[UsedRoutineParameter] = [] + params_json = _extract_fenced_block(part, "json") + if params_json: + try: + params_dict = json.loads(params_json) + param_list = [UsedRoutineParameter(key=k, value=v) for k, v 
in params_dict.items()] + except (json.JSONDecodeError, TypeError): + pass + + routines.append(UsedRoutine( + routine_id=routine_id, + routine_name=routine_name, + parameters=param_list, + )) + + return routines diff --git a/bluebox/scripts/run_bluebox_agent.py b/bluebox/scripts/run_bluebox_agent.py index 015b9a1e..093a4667 100644 --- a/bluebox/scripts/run_bluebox_agent.py +++ b/bluebox/scripts/run_bluebox_agent.py @@ -22,20 +22,22 @@ import argparse import shutil +import sys from datetime import datetime from pathlib import Path from typing import TYPE_CHECKING, Any -import sys from bluebox.utils.code_execution_sandbox import is_docker_available from bluebox.utils.terminal_utils import ask_yes_no, print_colored, YELLOW from rich.console import Console from rich.text import Text +from textual import work from textual.widgets import RichLog from bluebox.agents.bluebox_agent import BlueBoxAgent from bluebox.agents.workspace import LocalWorkspace +from bluebox.data_models.agents.context import BlueBoxAgentContext from bluebox.config import Config from bluebox.data_models.llms.vendors import LLMModel from bluebox.utils.cli_utils import add_model_argument, resolve_model @@ -50,17 +52,25 @@ class BlueBoxAgentTUI(AbstractAgentTUI): """Multi-pane TUI for the BlueBox Agent.""" TITLE = "BlueBox Agent" - SLASH_COMMANDS = BASE_SLASH_COMMANDS - HELP_TEXT = BASE_HELP_TEXT + SLASH_COMMANDS = { + **BASE_SLASH_COMMANDS, + "/generate_context": "Save a reusable context file (optionally with a focus prompt)", + } + HELP_TEXT = BASE_HELP_TEXT + ( + "\n [cyan]/generate_context[/cyan] Save a reusable context file from this session" + "\n Optionally add a focus: [cyan]/generate_context focus on the flight search part[/cyan]\n" + ) SHOW_SAVED_FILES_PANE = True def __init__( self, llm_model: LLMModel, workspace_dir: str = "./bluebox_workspace", + context_file: str | None = None, ) -> None: super().__init__(llm_model, working_dir=workspace_dir) self._workspace_dir = workspace_dir + 
self._context_file = context_file # ── Abstract implementations ───────────────────────────────────────── @@ -70,6 +80,7 @@ def _create_agent(self) -> AbstractAgent: stream_chunk_callable=self._handle_stream_chunk, llm_model=self._llm_model, workspace=LocalWorkspace(self._workspace_dir), + context_file=self._context_file, ) def _print_welcome(self) -> None: @@ -83,6 +94,10 @@ def _print_welcome(self) -> None: lines = [ f"[dim]Model:[/dim] {self._llm_model.value}", ] + if isinstance(self._agent, BlueBoxAgent) and self._agent.loaded_context: + ctx = self._agent.loaded_context + lines.append(f"[dim]Context:[/dim] [green]loaded[/green] — {ctx.goal[:60]}") + lines.append(f"[dim] {len(ctx.routines_used)} routine(s), {len(ctx.output_files)} output file(s)[/dim]") chat.write(Text.from_markup("\n".join(lines))) chat.write("") @@ -129,6 +144,61 @@ def _add(p: str) -> None: _add(r.get("output_file", "")) return paths + # ── Custom slash commands ───────────────────────────────────────── + + def _handle_custom_command(self, cmd: str, raw_input: str) -> bool: + if raw_input.lower().startswith("/generate_context"): + chat = self.query_one("#chat-log", RichLog) + if not self._agent: + chat.write(Text.from_markup("[red]Agent not initialized.[/red]")) + return True + + user_focus = raw_input[len("/generate_context"):].strip() or None + chat.write(Text.from_markup( + "[yellow]Generating context from this session...[/yellow]" + )) + self._processing = True + self._generate_context_async(user_focus) + return True + return False + + @work(thread=True) + def _generate_context_async(self, focus: str | None) -> None: + """Run generate_context in a background thread via structured output.""" + try: + if not isinstance(self._agent, BlueBoxAgent): + raise TypeError(f"Expected BlueBoxAgent, got {type(self._agent).__name__}") + result = self._agent.generate_context(focus=focus) + self.call_from_thread(self._show_context_success, result) + except Exception as e: + 
self.call_from_thread(self._show_context_error, str(e)) + + def _show_context_success(self, context: BlueBoxAgentContext) -> None: + """Display context generation success in the chat pane.""" + context_dir = Path(self._workspace_dir) / "context" + json_path = str(context_dir / "agent_context.json") + md_path = str(context_dir / "agent_context.md") + chat = self.query_one("#chat-log", RichLog) + chat.write(Text.from_markup( + f"[bold green]Context saved![/bold green]\n" + f"[dim]Goal:[/dim] {context.goal}\n" + f"[dim]Summary:[/dim] {context.summary}\n" + f"[dim]Routines:[/dim] {len(context.routines_used)}\n" + f"[dim]JSON:[/dim] {json_path}\n" + f"[dim]Markdown:[/dim] {md_path}" + )) + self._add_saved_file(json_path) + self._add_saved_file(md_path) + self._processing = False + self._update_status() + + def _show_context_error(self, error: str) -> None: + """Display context generation error in the chat pane.""" + chat = self.query_one("#chat-log", RichLog) + chat.write(Text.from_markup(f"[bold red]Context generation failed:[/bold red] {error}")) + self._processing = False + self._update_status() + # ─── Entry point ───────────────────────────────────────────────────────────── @@ -142,6 +212,12 @@ def main() -> None: default="./bluebox_workspace", help="Workspace directory. 
Raw results in raw/, output files in outputs/ (default: ./bluebox_workspace)", ) + parser.add_argument( + "--context-file", + type=str, + default=None, + help="Path to a context file (.json or .md) from a previous session to guide the agent", + ) parser.add_argument("-q", "--quiet", action="store_true", help="Suppress logs") parser.add_argument("--log-file", type=str, default=None, help="Log to file") args = parser.parse_args() @@ -186,6 +262,7 @@ def main() -> None: app = BlueBoxAgentTUI( llm_model=llm_model, workspace_dir=args.workspace_dir, + context_file=args.context_file, ) app.run() diff --git a/tests/unit/agents/test_bluebox_agent_context.py b/tests/unit/agents/test_bluebox_agent_context.py new file mode 100644 index 00000000..95db05b8 --- /dev/null +++ b/tests/unit/agents/test_bluebox_agent_context.py @@ -0,0 +1,511 @@ +""" +tests/unit/agents/test_bluebox_agent_context.py + +Unit tests for BlueBoxAgentContext data model and context generation/loading +in BlueBoxAgent. +""" + +import json +import os +import time +from datetime import datetime, timezone +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from bluebox.agents.bluebox_agent import BlueBoxAgent +from bluebox.agents.workspace import LocalWorkspace +from bluebox.data_models.agents.context import BlueBoxAgentContext, UsedRoutine, UsedRoutineParameter + + +# ============================================================================= +# Fixtures +# ============================================================================= + + +@pytest.fixture +def sample_context() -> BlueBoxAgentContext: + """A fully populated context for testing.""" + return BlueBoxAgentContext( + version=1, + goal="Find one-way train tickets from NYC to Boston on March 15, 2026", + routines_used=[ + UsedRoutine.from_dict_params( + routine_id="Routine_abc123", + routine_name="AmtrakOneWaySearch", + parameters={"origin": "New York", "destination": "Boston", "date": "2026-03-15"}, + ), + 
UsedRoutine.from_dict_params( + routine_id="Routine_def456", + routine_name="AmtrakPriceFilter", + parameters={"max_price": 100}, + ), + ], + python_code=( + 'import csv\n' + 'with open("outputs/trains.csv", "w") as f:\n' + ' writer = csv.DictWriter(f, fieldnames=["departure", "price"])\n' + ' writer.writeheader()\n' + ' for rr in routine_results:\n' + ' for train in rr["result"]["data"]["trains"]:\n' + ' writer.writerow(train)\n' + 'print("Done")' + ), + output_files=["outputs/trains.csv"], + output_description="CSV with columns: departure, price. 12 rows of Amtrak trains under $100.", + summary="Searched Amtrak for NYC-Boston trains on March 15, filtered by price, and exported to CSV.", + generated_at=datetime(2026, 2, 22, 10, 30, 0, tzinfo=timezone.utc), + ) + + +@pytest.fixture +def minimal_context() -> BlueBoxAgentContext: + """A context with only required fields.""" + return BlueBoxAgentContext( + goal="Search for flights", + output_description="JSON with flight data", + summary="Found flights.", + ) + + +# ============================================================================= +# BlueBoxAgentContext model tests +# ============================================================================= + + +class TestBlueBoxAgentContextModel: + """Tests for the Pydantic model itself.""" + + def test_json_roundtrip(self, sample_context: BlueBoxAgentContext) -> None: + """Serialize to JSON and back, verify equality.""" + json_str = sample_context.model_dump_json(indent=2) + restored = BlueBoxAgentContext.model_validate_json(json_str) + assert restored.version == sample_context.version + assert restored.goal == sample_context.goal + assert restored.summary == sample_context.summary + assert restored.output_description == sample_context.output_description + assert restored.python_code == sample_context.python_code + assert restored.output_files == sample_context.output_files + assert len(restored.routines_used) == 2 + assert restored.routines_used[0].routine_id == 
"Routine_abc123" + assert restored.routines_used[1].parameters_as_dict() == {"max_price": 100} + assert isinstance(restored.generated_at, datetime) + + def test_version_defaults_to_1(self, minimal_context: BlueBoxAgentContext) -> None: + assert minimal_context.version == 1 + + def test_generated_at_defaults_to_now(self, minimal_context: BlueBoxAgentContext) -> None: + assert isinstance(minimal_context.generated_at, datetime) + # Should be recent (within last 10 seconds) + delta = datetime.now(tz=timezone.utc) - minimal_context.generated_at + assert delta.total_seconds() < 10 + + def test_optional_fields_default(self, minimal_context: BlueBoxAgentContext) -> None: + assert minimal_context.routines_used == [] + assert minimal_context.python_code is None + assert minimal_context.output_files == [] + + +# ============================================================================= +# Markdown round-trip tests +# ============================================================================= + + +class TestMarkdownRoundTrip: + """Tests for to_markdown() and from_markdown().""" + + def test_to_markdown_has_expected_sections(self, sample_context: BlueBoxAgentContext) -> None: + md = sample_context.to_markdown() + assert "# BlueBox Agent Context" in md + assert "## Goal" in md + assert "## Summary" in md + assert "## Routines Used" in md + assert "## Python Code" in md + assert "## Output Files" in md + assert "## Output Description" in md + assert "**Version:** 1" in md + assert "**Generated:**" in md + + def test_to_markdown_contains_routine_details(self, sample_context: BlueBoxAgentContext) -> None: + md = sample_context.to_markdown() + assert "AmtrakOneWaySearch" in md + assert "Routine_abc123" in md + assert '"origin": "New York"' in md + + def test_to_markdown_contains_python_code(self, sample_context: BlueBoxAgentContext) -> None: + md = sample_context.to_markdown() + assert "```python" in md + assert "csv.DictWriter" in md + + def test_from_markdown_roundtrip(self, 
sample_context: BlueBoxAgentContext) -> None: + """from_markdown(to_markdown(ctx)) should produce an equivalent model.""" + md = sample_context.to_markdown() + restored = BlueBoxAgentContext.from_markdown(md) + assert restored.version == sample_context.version + assert restored.goal == sample_context.goal + assert restored.summary == sample_context.summary + assert restored.output_description == sample_context.output_description + assert restored.python_code == sample_context.python_code + assert restored.output_files == sample_context.output_files + assert len(restored.routines_used) == len(sample_context.routines_used) + for orig, rest in zip(sample_context.routines_used, restored.routines_used): + assert rest.routine_id == orig.routine_id + assert rest.routine_name == orig.routine_name + assert rest.parameters_as_dict() == orig.parameters_as_dict() + assert restored.generated_at == sample_context.generated_at + + def test_from_markdown_no_python_code(self, minimal_context: BlueBoxAgentContext) -> None: + """Markdown with no Python Code section should parse python_code as None.""" + md = minimal_context.to_markdown() + assert "## Python Code" not in md + restored = BlueBoxAgentContext.from_markdown(md) + assert restored.python_code is None + + def test_from_markdown_no_routines(self, minimal_context: BlueBoxAgentContext) -> None: + md = minimal_context.to_markdown() + assert "## Routines Used" not in md + restored = BlueBoxAgentContext.from_markdown(md) + assert restored.routines_used == [] + + def test_from_markdown_no_output_files(self, minimal_context: BlueBoxAgentContext) -> None: + md = minimal_context.to_markdown() + restored = BlueBoxAgentContext.from_markdown(md) + assert restored.output_files == [] + + +# ============================================================================= +# Context loading tests (BlueBoxAgent integration) +# ============================================================================= + + +class TestContextLoading: + """Tests 
for context file loading in BlueBoxAgent.""" + + def _make_agent( + self, + workspace_dir: Path, + context_file: str | None = None, + ) -> BlueBoxAgent: + """Create a BlueBoxAgent with mocked dependencies.""" + return BlueBoxAgent( + emit_message_callable=MagicMock(), + workspace=LocalWorkspace(str(workspace_dir)), + auth_headers_provider=lambda: {"X-Service-Token": "test"}, + context_file=context_file, + ) + + def test_loads_json_context_file(self, tmp_path: Path, sample_context: BlueBoxAgentContext) -> None: + ctx_file = tmp_path / "my_context.json" + ctx_file.write_text(sample_context.model_dump_json(indent=2)) + + agent = self._make_agent(tmp_path, context_file=str(ctx_file)) + assert agent._agent_context is not None + assert agent._agent_context.goal == sample_context.goal + + def test_loads_markdown_context_file(self, tmp_path: Path, sample_context: BlueBoxAgentContext) -> None: + ctx_file = tmp_path / "my_context.md" + ctx_file.write_text(sample_context.to_markdown()) + + agent = self._make_agent(tmp_path, context_file=str(ctx_file)) + assert agent._agent_context is not None + assert agent._agent_context.goal == sample_context.goal + + def test_workspace_relative_path(self, tmp_path: Path, sample_context: BlueBoxAgentContext) -> None: + context_dir = tmp_path / "context" + context_dir.mkdir() + ctx_file = context_dir / "my_context.json" + ctx_file.write_text(sample_context.model_dump_json(indent=2)) + + agent = self._make_agent(tmp_path, context_file="context/my_context.json") + assert agent._agent_context is not None + assert agent._agent_context.goal == sample_context.goal + + def test_auto_discovers_from_workspace(self, tmp_path: Path, sample_context: BlueBoxAgentContext) -> None: + context_dir = tmp_path / "context" + context_dir.mkdir() + ctx_file = context_dir / "agent_context.json" + ctx_file.write_text(sample_context.model_dump_json(indent=2)) + + agent = self._make_agent(tmp_path) + assert agent._agent_context is not None + assert 
agent._agent_context.goal == sample_context.goal + + def test_auto_discovers_most_recent(self, tmp_path: Path) -> None: + """When multiple context files exist, loads the most recently modified.""" + context_dir = tmp_path / "context" + context_dir.mkdir() + + old = BlueBoxAgentContext(goal="old goal", output_description="old", summary="old") + old_file = context_dir / "old.json" + old_file.write_text(old.model_dump_json()) + past = time.time() - 10 + os.utime(old_file, (past, past)) # force mtime 10s in the past + + new = BlueBoxAgentContext(goal="new goal", output_description="new", summary="new") + (context_dir / "new.json").write_text(new.model_dump_json()) + + agent = self._make_agent(tmp_path) + assert agent._agent_context is not None + assert agent._agent_context.goal == "new goal" + + def test_explicit_context_file_overrides_auto_discovery( + self, tmp_path: Path, sample_context: BlueBoxAgentContext, + ) -> None: + # Put one context in workspace + context_dir = tmp_path / "context" + context_dir.mkdir() + auto_ctx = BlueBoxAgentContext(goal="auto goal", output_description="auto", summary="auto") + (context_dir / "auto.json").write_text(auto_ctx.model_dump_json()) + + # Put explicit context elsewhere + explicit_file = tmp_path / "explicit.json" + explicit_file.write_text(sample_context.model_dump_json(indent=2)) + + agent = self._make_agent(tmp_path, context_file=str(explicit_file)) + assert agent._agent_context is not None + assert agent._agent_context.goal == sample_context.goal + + def test_invalid_context_file_ignored(self, tmp_path: Path) -> None: + agent = self._make_agent(tmp_path, context_file="/nonexistent/path.json") + assert agent._agent_context is None + + def test_malformed_json_ignored(self, tmp_path: Path) -> None: + bad_file = tmp_path / "bad.json" + bad_file.write_text("not valid json!!!") + agent = self._make_agent(tmp_path, context_file=str(bad_file)) + assert agent._agent_context is None + + def test_no_context_dir_no_error(self, tmp_path: 
Path) -> None: + agent = self._make_agent(tmp_path) + assert agent._agent_context is None + + +# ============================================================================= +# System prompt injection tests +# ============================================================================= + + +class TestContextPromptInjection: + """Tests for _get_context_prompt_section and system prompt integration.""" + + def _make_agent(self, tmp_path: Path, context: BlueBoxAgentContext) -> BlueBoxAgent: + ctx_file = tmp_path / "context.json" + ctx_file.write_text(context.model_dump_json(indent=2)) + + return BlueBoxAgent( + emit_message_callable=MagicMock(), + workspace=LocalWorkspace(str(tmp_path)), + auth_headers_provider=lambda: {"X-Service-Token": "test"}, + context_file=str(ctx_file), + ) + + def test_context_section_in_system_prompt(self, tmp_path: Path, sample_context: BlueBoxAgentContext) -> None: + agent = self._make_agent(tmp_path, sample_context) + prompt = agent._get_system_prompt() + assert "## Prior Context" in prompt + assert sample_context.goal in prompt + assert sample_context.summary in prompt + assert "Routine_abc123" in prompt + assert "AmtrakOneWaySearch" in prompt + + def test_context_section_includes_python_code(self, tmp_path: Path, sample_context: BlueBoxAgentContext) -> None: + agent = self._make_agent(tmp_path, sample_context) + prompt = agent._get_system_prompt() + assert "```python" in prompt + assert "csv.DictWriter" in prompt + + def test_context_section_truncation(self, tmp_path: Path) -> None: + """Context over 20K chars gets truncated with a hint.""" + big_context = BlueBoxAgentContext( + goal="x" * 25_000, + output_description="desc", + summary="summary", + ) + agent = self._make_agent(tmp_path, big_context) + section = agent._get_context_prompt_section() + assert len(section) < 25_000 + assert "context truncated" in section + assert "read_workspace_file" in section + + def test_no_context_no_section(self, tmp_path: Path) -> None: + agent = 
BlueBoxAgent( + emit_message_callable=MagicMock(), + workspace=LocalWorkspace(str(tmp_path)), + auth_headers_provider=lambda: {"X-Service-Token": "test"}, + ) + prompt = agent._get_system_prompt() + assert "## Prior Context" not in prompt + + +# ============================================================================= +# generate_context (structured output) tests +# ============================================================================= + + +class TestGenerateContext: + """Tests for the generate_context public method (structured output).""" + + def _make_agent(self, tmp_path: Path) -> BlueBoxAgent: + return BlueBoxAgent( + emit_message_callable=MagicMock(), + workspace=LocalWorkspace(str(tmp_path)), + auth_headers_provider=lambda: {"X-Service-Token": "test"}, + ) + + def _mock_llm_response(self, context: BlueBoxAgentContext) -> MagicMock: + """Create a mock LLMChatResponse with parsed context.""" + response = MagicMock() + response.parsed = context + return response + + def test_tool_is_not_registered(self) -> None: + """generate_context should NOT be an agent tool anymore.""" + tools = BlueBoxAgent._collect_tools() + tool_names = [meta.name for meta, _ in tools] + assert "generate_context" not in tool_names + + def test_saves_both_files(self, tmp_path: Path, sample_context: BlueBoxAgentContext) -> None: + agent = self._make_agent(tmp_path) + agent.llm_client.call_sync = MagicMock(return_value=self._mock_llm_response(sample_context)) + + result = agent.generate_context() + + assert result.goal == sample_context.goal + assert result.summary == sample_context.summary + + # Verify JSON file exists and is valid + json_path = tmp_path / "context" / "agent_context.json" + assert json_path.is_file() + loaded = BlueBoxAgentContext.model_validate_json(json_path.read_text()) + assert loaded.goal == sample_context.goal + + # Verify MD file exists with expected sections + md_path = tmp_path / "context" / "agent_context.md" + assert md_path.is_file() + assert "## 
Goal" in md_path.read_text() + + def test_saves_to_context_subdirectory(self, tmp_path: Path, minimal_context: BlueBoxAgentContext) -> None: + agent = self._make_agent(tmp_path) + agent.llm_client.call_sync = MagicMock(return_value=self._mock_llm_response(minimal_context)) + + agent.generate_context() + + context_dir = tmp_path / "context" + assert context_dir.is_dir() + assert len(list(context_dir.glob("*.json"))) == 1 + assert len(list(context_dir.glob("*.md"))) == 1 + + def test_raises_on_none_parsed(self, tmp_path: Path) -> None: + """Should raise ValueError when LLM returns None parsed result.""" + agent = self._make_agent(tmp_path) + response = MagicMock() + response.parsed = None + agent.llm_client.call_sync = MagicMock(return_value=response) + + with pytest.raises(ValueError, match="failed to produce"): + agent.generate_context() + + def test_auto_populates_routines_from_raw(self, tmp_path: Path) -> None: + """When LLM returns empty routines_used, auto-populate from raw/.""" + agent = self._make_agent(tmp_path) + + # Write a fake routine result to raw/ + raw_dir = tmp_path / "raw" + raw_dir.mkdir(exist_ok=True) + (raw_dir / "result_1.json").write_text(json.dumps({ + "routine_id": "Routine_abc", + "routine_name": "TestRoutine", + "status": "completed", + "parameters": {"city": "NYC"}, + "result": {"ok": True, "data": {}}, + })) + + # LLM returns context with empty routines_used + context_from_llm = BlueBoxAgentContext( + goal="test goal", + summary="test summary", + output_description="test output", + routines_used=[], + ) + agent.llm_client.call_sync = MagicMock(return_value=self._mock_llm_response(context_from_llm)) + + result = agent.generate_context() + + assert len(result.routines_used) == 1 + assert result.routines_used[0].routine_id == "Routine_abc" + assert result.routines_used[0].routine_name == "TestRoutine" + assert result.routines_used[0].parameters_as_dict() == {"city": "NYC"} + + def test_auto_populate_deduplicates_routines(self, tmp_path: 
Path) -> None: + """Same routine_id executed multiple times should appear once.""" + agent = self._make_agent(tmp_path) + + raw_dir = tmp_path / "raw" + raw_dir.mkdir(exist_ok=True) + for i in range(3): + (raw_dir / f"result_{i}.json").write_text(json.dumps({ + "routine_id": "Routine_same", + "routine_name": "SameRoutine", + "status": "completed", + "parameters": {"q": f"query_{i}"}, + "result": {"ok": True, "data": {}}, + })) + + context_from_llm = BlueBoxAgentContext( + goal="test", summary="test", output_description="test", + routines_used=[], + ) + agent.llm_client.call_sync = MagicMock(return_value=self._mock_llm_response(context_from_llm)) + + result = agent.generate_context() + assert len(result.routines_used) == 1 + + def test_llm_provided_routines_not_overridden(self, tmp_path: Path) -> None: + """When LLM provides routines_used, don't auto-populate from raw/.""" + agent = self._make_agent(tmp_path) + + raw_dir = tmp_path / "raw" + raw_dir.mkdir(exist_ok=True) + (raw_dir / "result_1.json").write_text(json.dumps({ + "routine_id": "Routine_from_raw", + "routine_name": "RawRoutine", + "status": "completed", + "parameters": {}, + "result": {"ok": True, "data": {}}, + })) + + context_from_llm = BlueBoxAgentContext( + goal="test", summary="test", output_description="test", + routines_used=[UsedRoutine.from_dict_params( + routine_id="Routine_llm_provided", + routine_name="LLMRoutine", + parameters={"x": 1}, + )], + ) + agent.llm_client.call_sync = MagicMock(return_value=self._mock_llm_response(context_from_llm)) + + result = agent.generate_context() + assert len(result.routines_used) == 1 + assert result.routines_used[0].routine_id == "Routine_llm_provided" + + def test_passes_focus_to_system_prompt(self, tmp_path: Path, minimal_context: BlueBoxAgentContext) -> None: + """Focus text should be included in the system prompt sent to LLM.""" + agent = self._make_agent(tmp_path) + agent.llm_client.call_sync = 
MagicMock(return_value=self._mock_llm_response(minimal_context)) + + agent.generate_context(focus="focus on the flight search part") + + call_kwargs = agent.llm_client.call_sync.call_args + system_prompt = call_kwargs.kwargs.get("system_prompt") or call_kwargs[1].get("system_prompt", "") + assert "focus on the flight search part" in system_prompt + + def test_passes_response_model(self, tmp_path: Path, minimal_context: BlueBoxAgentContext) -> None: + """Should call llm_client.call_sync with response_model=BlueBoxAgentContext.""" + agent = self._make_agent(tmp_path) + agent.llm_client.call_sync = MagicMock(return_value=self._mock_llm_response(minimal_context)) + + agent.generate_context() + + call_kwargs = agent.llm_client.call_sync.call_args + assert call_kwargs.kwargs.get("response_model") is BlueBoxAgentContext diff --git a/tests/unit/test_read_workspace_file.py b/tests/unit/test_read_workspace_file.py index a10bee9b..a38a237f 100644 --- a/tests/unit/test_read_workspace_file.py +++ b/tests/unit/test_read_workspace_file.py @@ -28,7 +28,6 @@ class TestPathTraversalPrevention: def test_parent_traversal_blocked(self, tmp_path: Path) -> None: """../ should be denied.""" ws = _make_workspace(tmp_path / "workspace") - ws.root_path.mkdir() result = _call(ws, "../../../etc/passwd") assert "error" in result assert "Access denied" in result["error"] @@ -36,7 +35,6 @@ def test_parent_traversal_blocked(self, tmp_path: Path) -> None: def test_absolute_path_outside_blocked(self, tmp_path: Path) -> None: """/etc/passwd should be denied.""" ws = _make_workspace(tmp_path / "workspace") - ws.root_path.mkdir() result = _call(ws, "/etc/passwd") assert "error" in result assert "Access denied" in result["error"] diff --git a/tests/unit/test_workspace.py b/tests/unit/test_workspace.py index 7aa0b38c..49df8612 100644 --- a/tests/unit/test_workspace.py +++ b/tests/unit/test_workspace.py @@ -20,31 +20,32 @@ class TestSaveFile: def test_saves_file_with_content(self, tmp_path: Path) -> None: 
ws = LocalWorkspace(str(tmp_path)) - result = ws.save_file("raw", "routine_result", '{"data": 1}') + result = ws.save_file("raw", "routine_result.json", '{"data": 1}') assert "output_file" in result saved = Path(result["output_file"]) assert saved.exists() assert saved.read_text() == '{"data": 1}' + assert saved.name == "routine_result.json" def test_creates_subdirectory(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) - result = ws.save_file("raw", "test", "content") - assert (tmp_path / "raw").is_dir() + ws.save_file("custom_subdir", "test.json", "content") + assert (tmp_path / "custom_subdir").is_dir() - def test_unique_filenames(self, tmp_path: Path) -> None: + def test_overwrites_existing_file(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) - r1 = ws.save_file("raw", "test", "a") - r2 = ws.save_file("raw", "test", "b") - assert r1["output_file"] != r2["output_file"] + ws.save_file("raw", "test.json", "old") + ws.save_file("raw", "test.json", "new") + assert (tmp_path / "raw" / "test.json").read_text() == "new" - def test_custom_extension(self, tmp_path: Path) -> None: + def test_different_extensions(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) - result = ws.save_file("outputs", "browser_agent", "# Result", extension=".md") + result = ws.save_file("outputs", "result.md", "# Result") assert result["output_file"].endswith(".md") def test_no_s3_key_in_result(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) - result = ws.save_file("raw", "test", "data") + result = ws.save_file("raw", "test.json", "data") assert "output_file_s3_key" not in result @@ -90,9 +91,7 @@ def test_empty_workspace(self, tmp_path: Path) -> None: def test_lists_files_in_subdirs(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) - (tmp_path / "raw").mkdir() (tmp_path / "raw" / "result.json").write_text("{}") - (tmp_path / "outputs").mkdir() (tmp_path / "outputs" / "out.csv").write_text("a,b") result = 
ws.list_files() assert result["total_files"] == 2 @@ -106,7 +105,6 @@ class TestLoadRawJson: def test_loads_json_files(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) raw = tmp_path / "raw" - raw.mkdir() (raw / "a.json").write_text('{"key": "a"}') (raw / "b.json").write_text('{"key": "b"}') results = ws.load_raw_json() @@ -117,7 +115,6 @@ def test_loads_json_files(self, tmp_path: Path) -> None: def test_skips_invalid_json(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) raw = tmp_path / "raw" - raw.mkdir() (raw / "good.json").write_text('{"ok": true}') (raw / "bad.json").write_text("not json") results = ws.load_raw_json() @@ -134,7 +131,6 @@ class TestSnapshotAndDiffOutputs: def test_detects_new_file(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) - (tmp_path / "outputs").mkdir() before = ws.snapshot_outputs() (tmp_path / "outputs" / "new.csv").write_text("data") changed = ws.diff_outputs(before) @@ -144,7 +140,6 @@ def test_detects_new_file(self, tmp_path: Path) -> None: def test_detects_modified_file(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) outputs = tmp_path / "outputs" - outputs.mkdir() f = outputs / "existing.csv" f.write_text("old") before = ws.snapshot_outputs() @@ -156,7 +151,6 @@ def test_detects_modified_file(self, tmp_path: Path) -> None: def test_no_changes(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path)) outputs = tmp_path / "outputs" - outputs.mkdir() (outputs / "stable.csv").write_text("data") before = ws.snapshot_outputs() changed = ws.diff_outputs(before) @@ -166,11 +160,11 @@ def test_no_changes(self, tmp_path: Path) -> None: class TestEnsureDirs: """Tests for LocalWorkspace.ensure_dirs.""" - def test_creates_raw_and_outputs(self, tmp_path: Path) -> None: + def test_creates_raw_outputs_and_context(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path / "new_workspace")) - ws.ensure_dirs() assert (tmp_path / "new_workspace" / "raw").is_dir() 
assert (tmp_path / "new_workspace" / "outputs").is_dir() + assert (tmp_path / "new_workspace" / "context").is_dir() def test_idempotent(self, tmp_path: Path) -> None: ws = LocalWorkspace(str(tmp_path))