Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
624 changes: 624 additions & 0 deletions .project/resume-server-death-resilience-plan.md

Large diffs are not rendered by default.

53 changes: 53 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,59 @@ CodeCome ships reusable phase prompts under `prompts/`:
CODECOME_MODEL=<id> # pin model per phase, e.g. anthropic/claude-opus-4-7
CODECOME_MODEL_VARIANT=<v> # pin model variant, e.g. high, max

### Resilience and recovery environment variables

CodeCome consumes the opencode SSE event stream and owns the `opencode serve`
lifecycle. These knobs tune how it detects a dead, unresponsive, or hung
server/session and how it recovers. Defaults are sensible; override only if you
hit edge cases.

# --- Stall detection (hung model turn while the session stays "busy") ---
CODECOME_BUSY_STALL_TIMEOUT=180 # seconds with no meaningful SSE event
# (heartbeats/connected don't count)
# before a busy turn is treated as
# stalled. 0 disables.
CODECOME_SSE_READ_TICK=10 # SSE socket read tick (seconds). Forces
# the reader to wake on a silent-but-
# open stream so the stall watchdog
# can run. Lower = faster detection.
CODECOME_HEARTBEAT_STALL_TIMEOUT=0 # optional diagnostic signal: if
# server.heartbeat was flowing and
# then stops for this long while busy,
# flag a stall early. Disabled by
# default because opencode can pause
# heartbeats during valid long turns.

# --- Server restart / retry budget (shared by death + stall recovery) ---
CODECOME_MAX_SERVER_RESTARTS=2 # how many times CodeCome restarts
# opencode serve and retries the
# failed phase/subphase before giving
# up (covers server death or
# unresponsiveness and session stalls).
CODECOME_MAX_FATAL_RETRIES=2 # retries for transient infrastructure
# errors (timeouts, connection blips).
CODECOME_MAX_ITERATION_RETRIES=<n> # auto-resume budget for genuine mid-turn
# model/provider cutoffs (default 1 for
# Phase 1 subphases, 3 for phases 2-6).

# --- Resume readiness (waiting for an existing session to go idle) ---
CODECOME_RESUME_IDLE_TIMEOUT=120 # max seconds to wait for a resumed
# session to report idle.
CODECOME_RESUME_IDLE_POLL=1 # poll interval (seconds) while waiting.
CODECOME_RESUME_PROBE_TIMEOUT=2 # per-probe HTTP timeout for
# /session/status and /global/health.
CODECOME_RESUME_SERVER_UNAVAILABLE_THRESHOLD=3
# consecutive failed status+health probes
# (no process-liveness signal) before
# declaring the server unreachable.

Recovery behavior: when a server death or a session stall is detected, CodeCome
restarts `opencode serve` and retries the affected phase (Phase 1 re-enters at the
failed subphase `1a`/`1b`/`1c`); both conditions draw from the single
`CODECOME_MAX_SERVER_RESTARTS` budget. A long but healthy model turn is never
abandoned: CodeCome keeps consuming the stream for as long as the session is `busy`,
the server process is alive, and meaningful SSE events keep arriving. The stall
timeout bounds the silence between such events, not the total length of a turn.

### Model resolution and thinking display

The wrapper resolves the effective model in this order:
Expand Down
2 changes: 2 additions & 0 deletions templates/sandboxes/erlang-otp/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ services:
context: .
dockerfile: Dockerfile
working_dir: /workspace
environment:
ERL_CRASH_DUMP: /workspace/tmp/erl_crash.dump
volumes:
- ../:/workspace
- ../.tools/codeql/current:/opt/codeql:ro
Expand Down
102 changes: 93 additions & 9 deletions tests/test_codecome_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,11 @@ def test_run_single_attempt_records_prompt_timeout(mock_args, mock_console, monk
monkeypatch.setattr(runner, "_consume_events", lambda *a, **kw: RunResult())

def fake_send(*_a, **_kw):
raise TimeoutError("timed out")
raise runner.OpenCodeRequestError(
"Failed to send prompt: timed out",
retriable=True,
operation="send_prompt",
)

monkeypatch.setattr(runner, "send_prompt_to_session", fake_send)

Expand All @@ -172,23 +176,103 @@ def fake_send(*_a, **_kw):
fake_transcript.write_event.side_effect = events.append
monkeypatch.setattr(Transcript, "for_phase", classmethod(lambda cls, p, f: fake_transcript))

fatal_errors = []
code, session_id, _res, _path = runner._run_single_attempt(
mock_args, mock_console, "do work", "model", "var",
"http://base", "auth", "dir", lambda *a: None,
emit_fatal_error_fn=lambda _console, _title, msg: fatal_errors.append(msg),
emit_fatal_error_fn=lambda *_a: pytest.fail("retriable prompt timeout should not be fatal"),
)

assert code == 1
assert session_id == ""
assert fatal_errors == ["timed out"]
assert code == 2
assert session_id == "new_session"
event_types = [event["type"] for event in events]
assert "codecome.prompt.send_started" in event_types
assert "codecome.prompt.send_failed" in event_types
assert "codecome.attempt.failed" in event_types
assert "codecome.attempt.incomplete" in event_types
assert "codecome.attempt.failed" not in event_types
failed = next(event for event in events if event["type"] == "codecome.prompt.send_failed")
assert failed["properties"]["errorType"] == "TimeoutError"
assert failed["properties"]["message"] == "timed out"
assert failed["properties"]["errorType"] == "OpenCodeRequestError"
assert failed["properties"]["message"] == "Failed to send prompt: timed out"


def test_run_single_attempt_prompt_timeout_requires_consumer_exit(mock_args, mock_console, monkeypatch):
    """Check a retriable prompt failure is a hard failure when the consumer thread stays alive.

    ``send_prompt_to_session`` raises a retriable ``OpenCodeRequestError`` and
    ``threading.Thread`` is replaced by a stand-in whose ``is_alive()`` is always
    true. The attempt must then return code 1 with an empty session id, and the
    transcript must contain ``codecome.event_loop.stop_timeout`` and
    ``codecome.attempt.failed`` but not ``codecome.attempt.incomplete``.
    """
    monkeypatch.setattr(runner, "create_session", lambda *a, **kw: "new_session")
    monkeypatch.setattr(runner, "_consume_events", lambda *a, **kw: RunResult())

    def fake_send(*_a, **_kw):
        # Simulate a retriable transport failure while sending the prompt.
        raise runner.OpenCodeRequestError(
            "Failed to send prompt: timed out",
            retriable=True,
            operation="send_prompt",
        )

    class AliveThread:
        """Thread stand-in that never runs its target and always reports alive."""

        def __init__(self, *args, **kwargs):
            pass

        def start(self):
            pass

        def join(self, timeout=None):
            # Return immediately; the thread still reports alive afterwards.
            pass

        def is_alive(self):
            return True

    monkeypatch.setattr(runner, "send_prompt_to_session", fake_send)
    monkeypatch.setattr(runner.threading, "Thread", AliveThread)
    # Deliberately non-numeric; presumably exercises the runner's handling of an
    # unparsable read tick -- confirm against the runner implementation.
    monkeypatch.setenv("CODECOME_SSE_READ_TICK", "invalid")

    # Capture every transcript event so the emitted event types can be asserted.
    events = []
    fake_transcript = MagicMock(spec=Transcript)
    fake_transcript.path = Path("fake.jsonl")
    fake_transcript.write_event.side_effect = events.append
    monkeypatch.setattr(Transcript, "for_phase", classmethod(lambda cls, p, f: fake_transcript))

    code, session_id, _res, _path = runner._run_single_attempt(
        mock_args, mock_console, "do work", "model", "var",
        "http://base", "auth", "dir", lambda *a: None,
        emit_fatal_error_fn=lambda *_a: None,
    )

    assert code == 1
    assert session_id == ""
    event_types = [event["type"] for event in events]
    assert "codecome.event_loop.stop_timeout" in event_types
    assert "codecome.attempt.failed" in event_types
    assert "codecome.attempt.incomplete" not in event_types


def test_run_single_attempt_create_session_timeout_is_recoverable(mock_args, mock_console, monkeypatch):
    """Check a retriable ``create_session`` failure yields a recoverable attempt result.

    ``create_session`` raises a retriable ``OpenCodeRequestError``. The attempt
    must return code 2 with an empty session id and a ``server_unreachable``
    finish reason, must not consume events, send a prompt or emit a fatal error,
    and must record ``codecome.session.create_failed`` and
    ``codecome.attempt.incomplete`` without ``codecome.session.ready``.
    """
    def fake_create(*_a, **_kw):
        # Simulate a retriable transport failure while creating the session.
        raise runner.OpenCodeRequestError(
            "Failed to create session: timed out",
            retriable=True,
            operation="create_session",
        )

    monkeypatch.setattr(runner, "create_session", fake_create)
    # Neither of these may be reached once session creation has failed.
    monkeypatch.setattr(runner, "_consume_events", lambda *a, **kw: pytest.fail("should not consume events"))
    monkeypatch.setattr(runner, "send_prompt_to_session", lambda *a, **kw: pytest.fail("should not send prompt"))

    # Capture every transcript event so the emitted event types can be asserted.
    events = []
    fake_transcript = MagicMock(spec=Transcript)
    fake_transcript.path = Path("fake.jsonl")
    fake_transcript.write_event.side_effect = events.append
    monkeypatch.setattr(Transcript, "for_phase", classmethod(lambda cls, p, f: fake_transcript))

    code, session_id, res, _path = runner._run_single_attempt(
        mock_args, mock_console, "do work", "model", "var",
        "http://base", "auth", "dir", lambda *a: None,
        emit_fatal_error_fn=lambda *_a: pytest.fail("retriable create timeout should not be fatal"),
    )

    assert code == 2
    assert session_id == ""
    assert res.last_finish_reason == "server_unreachable"
    event_types = [event["type"] for event in events]
    assert "codecome.session.create_failed" in event_types
    assert "codecome.attempt.incomplete" in event_types
    assert "codecome.session.ready" not in event_types


def test_existing_session_busy_guard_blocks_resume_prompt(mock_args, mock_console, monkeypatch):
Expand Down
157 changes: 157 additions & 0 deletions tests/test_harness_recovery_restart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
from __future__ import annotations

import argparse
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "tools"))

from events.phase_loop import RunResult


class _FakeRuntimeConfig:
    """Static stand-in for the object returned by ``resolve_runtime_config``."""

    # Fixed model/variant values; every ``*_source`` is tagged "stub".
    model = "op/test"
    variant = None
    model_source = "stub"
    variant_source = "stub"
    thinking_on = False
    thinking_source = "stub"


class _FakeServerInfo:
    """Static stand-in for the server info returned by the fake server runner."""

    base_url = "http://localhost"
    password = "fake"
    pid = 1


class _FakeServerRunner:
    """Server-runner stand-in that counts ``start`` and ``restart`` calls."""

    def __init__(self):
        self.start_calls = 0
        self.restart_calls = 0

    def start(self, **_kw):
        """Record one start and return a fresh ``_FakeServerInfo``."""
        self.start_calls += 1
        return _FakeServerInfo()

    def restart(self, **_kw):
        """Record one restart and return a fresh ``_FakeServerInfo``."""
        self.restart_calls += 1
        return _FakeServerInfo()

    def stop(self):
        """Do nothing; there is no real process to stop."""
        pass

    @property
    def info(self):
        """Return a fresh ``_FakeServerInfo`` on every access."""
        return _FakeServerInfo()


def _args(phase: str = "2") -> argparse.Namespace:
    """Build the argument namespace passed to ``run_phase_mode`` in these tests.

    Only ``phase`` is configurable; every other field is a fixed value.
    """
    return argparse.Namespace(
        phase=phase, label="test", agent="auditor",
        prompt_file="prompts/phase-2.md", finding=None, chat=False,
        show_model=False, debug=False, color="never", log_level="WARN",
        read_display_lines=None, write_content_lines=None,
        write_diff_limit=None, edit_diff_lines=None,
    )


def _server_unreachable_result() -> RunResult:
    """Return a ``RunResult`` whose finish reason is ``server_unreachable``."""
    return RunResult(last_finish_reason="server_unreachable", last_session_id="ses_x")


def _stalled_result() -> RunResult:
    """Return a ``RunResult`` flagged as stalled after two finished steps."""
    return RunResult(
        last_finish_reason="session_stalled",
        any_step_finish_seen=True,
        step_finish_count=2,
        session_stalled=True,
        last_session_id="ses_x",
    )


def _terminal_ok_result() -> RunResult:
    """Return a ``RunResult`` with finish reason ``stop`` and one finished step."""
    return RunResult(last_finish_reason="stop", any_step_finish_seen=True, step_finish_count=1)


def _wire(monkeypatch, fake_runner, attempts):
    """Patch the harness so ``run_phase_mode`` runs against scripted attempts.

    Args:
        monkeypatch: pytest ``monkeypatch`` fixture used for every patch.
        fake_runner: object returned whenever the harness calls ``ServerRunner()``.
        attempts: iterable of values returned, one per call, by the patched
            ``runner._run_single_attempt``.

    Returns:
        The patched ``codecome.harness`` module.
    """
    import subprocess
    from unittest.mock import MagicMock

    from codecome import harness as harness_mod
    from codecome import runner as runner_mod

    scripted = iter(attempts)
    monkeypatch.setattr(harness_mod, "ServerRunner", lambda: fake_runner)
    monkeypatch.setattr(harness_mod, "load_prompt", lambda *_a, **_kw: "prompt")
    monkeypatch.setattr(harness_mod, "resolve_runtime_config", lambda _agent: _FakeRuntimeConfig())
    monkeypatch.setattr(harness_mod, "configure_rendering", lambda *_a, **_kw: None)
    monkeypatch.setattr(harness_mod, "check_phase_graceful_completion", lambda *_a, **_kw: (True, []))
    # Each call consumes the next scripted attempt result.
    monkeypatch.setattr(runner_mod, "_run_single_attempt", lambda *_a, **_kw: next(scripted))
    # Any subprocess the harness launches reports success without running.
    monkeypatch.setattr(subprocess, "run", lambda *_a, **_kw: MagicMock(returncode=0))
    return harness_mod


def _t(harness_mod):
    """Return the placeholder transcript path ``<ROOT>/tmp/fake.jsonl``."""
    return harness_mod.ROOT / "tmp" / "fake.jsonl"


def test_server_unreachable_triggers_restart_then_succeeds(monkeypatch):
    """A ``server_unreachable`` attempt triggers one restart, then the phase succeeds."""
    fake = _FakeServerRunner()
    from codecome import harness as harness_mod
    transcript = _t(harness_mod)
    # Pin the restart budget so an ambient CODECOME_MAX_SERVER_RESTARTS
    # (e.g. 0) cannot make this test fail; one restart is required.
    monkeypatch.setenv("CODECOME_MAX_SERVER_RESTARTS", "2")
    harness_mod = _wire(monkeypatch, fake, [
        (2, "ses_x", _server_unreachable_result(), transcript),
        (0, "ses_x", _terminal_ok_result(), transcript),
    ])
    # raising=False: patch the attribute even if the harness does not define it.
    monkeypatch.setattr(harness_mod, "run_frontmatter_validation", lambda *_a, **_kw: (0, ""), raising=False)

    rc = harness_mod.run_phase_mode(_args())
    assert rc == 0
    assert fake.restart_calls == 1


def test_session_stalled_triggers_restart_then_succeeds(monkeypatch):
    """A stalled attempt (returned with code 0) triggers one restart, then the phase succeeds."""
    fake = _FakeServerRunner()
    from codecome import harness as harness_mod
    transcript = _t(harness_mod)
    # Pin the restart budget so an ambient CODECOME_MAX_SERVER_RESTARTS
    # (e.g. 0) cannot make this test fail; one restart is required.
    monkeypatch.setenv("CODECOME_MAX_SERVER_RESTARTS", "2")
    harness_mod = _wire(monkeypatch, fake, [
        (0, "ses_x", _stalled_result(), transcript),
        (0, "ses_x", _terminal_ok_result(), transcript),
    ])
    # raising=False: patch the attribute even if the harness does not define it.
    monkeypatch.setattr(harness_mod, "run_frontmatter_validation", lambda *_a, **_kw: (0, ""), raising=False)

    rc = harness_mod.run_phase_mode(_args())
    assert rc == 0
    assert fake.restart_calls == 1


def test_recovery_shares_budget_and_exhausts(monkeypatch):
    """Stall and unreachable recoveries draw from one restart budget, which can run out."""
    fake = _FakeServerRunner()
    from codecome import harness as harness_mod
    transcript = _t(harness_mod)
    monkeypatch.setenv("CODECOME_MAX_SERVER_RESTARTS", "2")
    # One stall + two unreachable = 3 recoverable conditions, budget is 2.
    harness_mod = _wire(monkeypatch, fake, [
        (0, "ses_x", _stalled_result(), transcript),
        (2, "ses_x", _server_unreachable_result(), transcript),
        (2, "ses_x", _server_unreachable_result(), transcript),
    ])

    rc = harness_mod.run_phase_mode(_args())
    # Budget exhausted after 2 restarts → non-zero terminal status.
    assert rc != 0
    assert fake.restart_calls == 2


def test_session_stalled_budget_exhaustion_is_non_success(monkeypatch):
    """With a zero restart budget, a stalled attempt ends non-zero and no restart happens."""
    fake = _FakeServerRunner()
    from codecome import harness as harness_mod
    transcript = _t(harness_mod)
    monkeypatch.setenv("CODECOME_MAX_SERVER_RESTARTS", "0")
    # The stalled attempt itself returns code 0; the phase must still not succeed.
    harness_mod = _wire(monkeypatch, fake, [
        (0, "ses_x", _stalled_result(), transcript),
    ])

    rc = harness_mod.run_phase_mode(_args())

    assert rc != 0
    assert fake.restart_calls == 0
Loading
Loading