diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 445286c..0078e4e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,6 +154,32 @@ jobs: with: name: playwright-test-results path: test-results + - uses: actions/upload-artifact@v4 + if: always() + with: + name: fbi-debug-log + path: /tmp/fbi-debug.log + if-no-files-found: ignore + - name: Collect quantico per-run debug logs + if: always() + run: | + set +e + mkdir -p /tmp/fbi-runs-state + if [ -d /tmp/fbi-e2e-runs ]; then + for run_dir in /tmp/fbi-e2e-runs/*/; do + run_id=$(basename "$run_dir") + if [ -d "$run_dir/state" ]; then + cp -r "$run_dir/state" "/tmp/fbi-runs-state/run-$run_id" 2>/dev/null || true + fi + done + fi + ls -la /tmp/fbi-runs-state/ || true + - uses: actions/upload-artifact@v4 + if: always() + with: + name: fbi-runs-state + path: /tmp/fbi-runs-state + if-no-files-found: ignore image-scan: name: Trivy scan agent image diff --git a/playwright.config.ts b/playwright.config.ts index d0c99e9..fad4c2b 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -4,11 +4,10 @@ export default defineConfig({ testDir: 'tests/e2e/quantico', timeout: 120_000, fullyParallel: false, // shared FBI server - // Retry up to 2× on CI to absorb genuinely-flaky timing failures - // (auto-scroll re-pin race, hang container kill propagation under - // parallel-test load — both observed flaking through retries=1 on PR - // CI). Keep 0 locally so flakes are visible during development. - retries: process.env.CI ? 2 : 0, + // TEMP: retries=0 on the debug/hang-flake branch so the suspect flakes + // surface and we can capture /tmp/fbi-debug.log. Bring back the 2 once + // we've identified root cause. + retries: 0, // Warm the orchestrator's docker image cache before any tests run. // Without this, the first 1-2 tests on a fresh CI runner race a 2-3 min // image build and fail their tight assertions. diff --git a/src/server/src/fbi/run/actor.gleam b/src/server/src/fbi/run/actor.gleam index c0b18d5..a54a46b 100644 --- a/src/server/src/fbi/run/actor.gleam +++ b/src/server/src/fbi/run/actor.gleam @@ -190,18 +190,37 @@ fn handle(state: State, msg: RunMsg) -> actor.Next(State, RunMsg) { Running(cid, _, bc, _, _), ContainerExited(outcome) -> transition_to_waiting(state, cid, bc, outcome) Running(cid, _, _, _, _), Cancel -> { + debug_log("CANCEL run=" <> int.to_string(state.run_id) <> " cid=" <> cid) case docker.connect(state.config.docker_socket) { Ok(sock) -> { - let _ = docker.kill_container(sock, cid) + let kill_result = docker.kill_container(sock, cid) docker.close(sock) + debug_log( + "CANCEL_KILLED run=" + <> int.to_string(state.run_id) + <> " cid=" + <> cid + <> " result=" + <> case kill_result { + Ok(_) -> "ok" + Error(e) -> "err:" <> docker.describe_error(e) + }, + ) } - Error(e) -> + Error(e) -> { wisp.log_warning( "run " <> int.to_string(state.run_id) <> " cancel connect failed: " <> docker.describe_error(e), ) + debug_log( + "CANCEL_CONNECT_FAILED run=" + <> int.to_string(state.run_id) + <> " err=" + <> docker.describe_error(e), + ) + } } actor.continue(state) } @@ -297,6 +316,19 @@ fn transition_to_waiting( <> " finished exit_code=" <> int.to_string(outcome.exit_code), ) + debug_log( + "TRANSITION_TO_WAITING run=" + <> int.to_string(state.run_id) + <> " cid=" + <> cid + <> " exit_code=" + <> int.to_string(outcome.exit_code) + <> " err=" + <> case outcome.error_message { + None -> "none" + Some(e) -> e + }, + ) let db_outcome = runs_db.RunOutcome( exit_code: outcome.exit_code, @@ -417,3 +449,13 @@ fn transcript_size(config: Config, run_id: Int) -> Int { @external(erlang, "fbi_time", "now_ms") fn now_ms() -> Int + +// DEBUG: temporary diagnostic logger for the hang-test "succeeded" flake. +// Writes timestamped events to /tmp/fbi-debug.log; CI uploads the file as +// an artifact on test failure. Remove this and its callers once the root +// cause is identified. +fn debug_log(msg: String) -> Nil { + let line = int.to_string(now_ms()) <> " " <> msg <> "\n" + let _ = simplifile.append(to: "/tmp/fbi-debug.log", contents: line) + Nil +} diff --git a/src/server/src/fbi/run/container_monitor.gleam b/src/server/src/fbi/run/container_monitor.gleam index 67eb073..52470d9 100644 --- a/src/server/src/fbi/run/container_monitor.gleam +++ b/src/server/src/fbi/run/container_monitor.gleam @@ -150,14 +150,21 @@ pub fn read_agent_status(state_dir: String) -> Option(String) { } fn wait_for_exit(config: Config, cid: String) -> Int { + debug_log("WAIT_BEGIN cid=" <> cid) case docker.connect(config.docker_socket) { Ok(sock) -> { let code = case docker.wait_container(sock, cid) { - Ok(c) -> c + Ok(c) -> { + debug_log("WAIT_RESULT cid=" <> cid <> " code=" <> int.to_string(c)) + c + } Error(e) -> { wisp.log_warning( "container_monitor: wait failed: " <> docker.describe_error(e), ) + debug_log( + "WAIT_FAILED cid=" <> cid <> " err=" <> docker.describe_error(e), + ) -1 } } @@ -168,11 +175,19 @@ fn wait_for_exit(config: Config, cid: String) -> Int { wisp.log_warning( "container_monitor: docker connect: " <> docker.describe_error(e), ) + debug_log( + "WAIT_CONNECT_FAILED cid=" <> cid <> " err=" <> docker.describe_error(e), + ) -1 } } } +fn debug_log(msg: String) -> Nil { + let _ = simplifile.append(to: "/tmp/fbi-debug.log", contents: msg <> "\n") + Nil +} + fn read_outcome(state_dir: String, exit_code: Int) -> RunOutcome { case simplifile.read(state_dir <> "/result.json") { Error(_) -> diff --git a/src/server/src/fbi/run/worker.gleam b/src/server/src/fbi/run/worker.gleam index 62b2f7d..5500206 100644 --- a/src/server/src/fbi/run/worker.gleam +++ b/src/server/src/fbi/run/worker.gleam @@ -82,17 +82,21 @@ fn do_real_launch(input: LaunchInput) -> Result(#(String, String), String) { input.config, on_log, )) - use _ <- result.try(setup_run_dir(input)) let container_name = "fbi-run-" <> run_id - // Remove any pre-existing container with this name. Cancel paths and - // crash recovery don't always reach transition_to_finishing (which is - // the only place that calls remove_container), so retrying the same - // run id otherwise hits a "container name in use" error from Docker. - // force=true also handles the case where it's still running. + // Remove any pre-existing container with this name BEFORE setup_run_dir. + // Cancel paths and crash recovery don't always reach + // transition_to_finishing (which is the only place that calls + // remove_container), so retrying the same run id otherwise hits a + // "container name in use" error from Docker. force=true also handles + // the case where it's still running. Order matters: while an old + // container holds the bind mount on run_dir/state, `del_dir_r` fails + // with EBUSY and stale state files (notably result.json) survive into + // the new run. let _ = with_docker(input.config.docker_socket, fn(sock) { docker.remove_container(sock, container_name, True) }) + use _ <- result.try(setup_run_dir(input)) let spec = container_spec(input, image_tag) wisp.log_debug("run " <> run_id <> ": creating container image=" <> image_tag) use cid <- result.try( @@ -127,12 +131,20 @@ fn do_mock_launch(input: LaunchInput) -> Result(#(String, String), String) { Some(p) -> Ok(p) None -> Error("FBI_QUANTICO_BINARY_PATH not set; mock runs require it") }) - use _ <- result.try(setup_run_dir(input)) + // Force-remove any prior container with this name BEFORE setup_run_dir. + // SQLite reuses rowids when prior runs are deleted (no AUTOINCREMENT), + // so a fresh run can land on an id whose state dir is still bind-mounted + // by a not-yet-removed container. While that bind mount is alive, the + // host-side `del_dir_r` of run_dir/state can't remove the directory + // (EBUSY) and stale files (notably result.json) survive — read_outcome + // then mistakes the *prior* run's exit code for the new run's, which is + // exactly the source of the hang-test "succeeded" flake. let container_name = "fbi-run-" <> run_id let _ = with_docker(input.config.docker_socket, fn(sock) { docker.remove_container(sock, container_name, True) }) + use _ <- result.try(setup_run_dir(input)) let spec = mock_container_spec(input, quantico_path, scenario) wisp.log_debug( "run " <> run_id <> ": creating mock container scenario=" <> scenario, @@ -417,6 +429,14 @@ fn setup_run_dir(input: LaunchInput) -> Result(Nil, String) { let _ = simplifile.delete(run_dir <> "/state") let _ = simplifile.delete(run_dir <> "/wip") let _ = simplifile.delete(scripts_dir) + // Belt-and-suspenders: even if `delete(run_dir/state)` fails (EBUSY + // when a prior container still has the bind mount), nuke the + // individual signal files by path so read_outcome / poll_status_loop + // can't see stale values from the previous run id. + let _ = simplifile.delete(run_dir <> "/state/result.json") + let _ = simplifile.delete(run_dir <> "/state/agent-status") + let _ = simplifile.delete(run_dir <> "/state/session-id") + let _ = simplifile.delete(run_dir <> "/state/ready") use _ <- result.try( simplifile.create_directory_all(scripts_dir) |> result.map_error(fn(e) { diff --git a/tests/quantico/src/main.rs b/tests/quantico/src/main.rs index ea9d458..53ec52e 100644 --- a/tests/quantico/src/main.rs +++ b/tests/quantico/src/main.rs @@ -228,14 +228,34 @@ fn run_scenario( state_dir: state_dir.clone(), }; let exit_code = match executor::run(&scenario, &mut ctx) { - Ok(executor::Outcome::Exited(c)) => c, + Ok(executor::Outcome::Exited(c)) => { + if let Some(ref dir) = state_dir { + let _ = std::fs::write( + dir.join("quantico-debug.log"), + format!("OUTCOME=Exited({})\n", c), + ); + } + c + } Ok(executor::Outcome::SleepingForever) => { + if let Some(ref dir) = state_dir { + let _ = std::fs::write( + dir.join("quantico-debug.log"), + "OUTCOME=SleepingForever\n", + ); + } // Block forever (until SIGKILL). SIGTERM honoured by default. loop { std::thread::park(); } } Err(e) => { + if let Some(ref dir) = state_dir { + let _ = std::fs::write( + dir.join("quantico-debug.log"), + format!("OUTCOME=Err: {} (kind={:?})\n", e, e.kind()), + ); + } let _ = writeln!(std::io::stderr(), "quantico: io error: {}", e); 1 }