Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,32 @@ jobs:
with:
name: playwright-test-results
path: test-results
- uses: actions/upload-artifact@v4
if: always()
with:
name: fbi-debug-log
path: /tmp/fbi-debug.log
if-no-files-found: ignore
- name: Collect quantico per-run debug logs
if: always()
run: |
set +e
mkdir -p /tmp/fbi-runs-state
if [ -d /tmp/fbi-e2e-runs ]; then
for run_dir in /tmp/fbi-e2e-runs/*/; do
run_id=$(basename "$run_dir")
if [ -d "$run_dir/state" ]; then
cp -r "$run_dir/state" "/tmp/fbi-runs-state/run-$run_id" 2>/dev/null || true
fi
done
fi
ls -la /tmp/fbi-runs-state/ || true
- uses: actions/upload-artifact@v4
if: always()
with:
name: fbi-runs-state
path: /tmp/fbi-runs-state
if-no-files-found: ignore

image-scan:
name: Trivy scan agent image
Expand Down
9 changes: 4 additions & 5 deletions playwright.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@ export default defineConfig({
testDir: 'tests/e2e/quantico',
timeout: 120_000,
fullyParallel: false, // shared FBI server
// Retry up to 2× on CI to absorb genuinely-flaky timing failures
// (auto-scroll re-pin race, hang container kill propagation under
// parallel-test load — both observed flaking through retries=1 on PR
// CI). Keep 0 locally so flakes are visible during development.
retries: process.env.CI ? 2 : 0,
// TEMP: retries=0 on the debug/hang-flake branch so the suspect flakes
// surface and we can capture /tmp/fbi-debug.log. Bring back the 2 once
// we've identified root cause.
retries: 0,
// Warm the orchestrator's docker image cache before any tests run.
// Without this, the first 1-2 tests on a fresh CI runner race a 2-3 min
// image build and fail their tight assertions.
Expand Down
46 changes: 44 additions & 2 deletions src/server/src/fbi/run/actor.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,37 @@ fn handle(state: State, msg: RunMsg) -> actor.Next(State, RunMsg) {
Running(cid, _, bc, _, _), ContainerExited(outcome) ->
transition_to_waiting(state, cid, bc, outcome)
Running(cid, _, _, _, _), Cancel -> {
debug_log("CANCEL run=" <> int.to_string(state.run_id) <> " cid=" <> cid)
case docker.connect(state.config.docker_socket) {
Ok(sock) -> {
let _ = docker.kill_container(sock, cid)
let kill_result = docker.kill_container(sock, cid)
docker.close(sock)
debug_log(
"CANCEL_KILLED run="
<> int.to_string(state.run_id)
<> " cid="
<> cid
<> " result="
<> case kill_result {
Ok(_) -> "ok"
Error(e) -> "err:" <> docker.describe_error(e)
},
)
}
Error(e) ->
Error(e) -> {
wisp.log_warning(
"run "
<> int.to_string(state.run_id)
<> " cancel connect failed: "
<> docker.describe_error(e),
)
debug_log(
"CANCEL_CONNECT_FAILED run="
<> int.to_string(state.run_id)
<> " err="
<> docker.describe_error(e),
)
}
}
actor.continue(state)
}
Expand Down Expand Up @@ -297,6 +316,19 @@ fn transition_to_waiting(
<> " finished exit_code="
<> int.to_string(outcome.exit_code),
)
debug_log(
"TRANSITION_TO_WAITING run="
<> int.to_string(state.run_id)
<> " cid="
<> cid
<> " exit_code="
<> int.to_string(outcome.exit_code)
<> " err="
<> case outcome.error_message {
None -> "none"
Some(e) -> e
},
)
let db_outcome =
runs_db.RunOutcome(
exit_code: outcome.exit_code,
Expand Down Expand Up @@ -417,3 +449,13 @@ fn transcript_size(config: Config, run_id: Int) -> Int {

// Current time in milliseconds, provided by the Erlang `fbi_time` module;
// used to timestamp debug_log lines for ordering events in the artifact.
// NOTE(review): whether this is monotonic or wall-clock time is not visible
// from here — confirm in fbi_time before comparing stamps across processes.
@external(erlang, "fbi_time", "now_ms")
fn now_ms() -> Int

// DEBUG: temporary diagnostic logger for the hang-test "succeeded" flake.
// Prefixes each event with a now_ms() timestamp and appends it to
// /tmp/fbi-debug.log; CI uploads that file as an artifact on test failure.
// Best-effort by design: an append failure is silently ignored so logging
// can never affect run behavior. Remove this and its callers once the
// root cause is identified.
fn debug_log(msg: String) -> Nil {
  let stamped = int.to_string(now_ms()) <> " " <> msg <> "\n"
  case simplifile.append(to: "/tmp/fbi-debug.log", contents: stamped) {
    _ -> Nil
  }
}
17 changes: 16 additions & 1 deletion src/server/src/fbi/run/container_monitor.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,21 @@ pub fn read_agent_status(state_dir: String) -> Option(String) {
}

fn wait_for_exit(config: Config, cid: String) -> Int {
debug_log("WAIT_BEGIN cid=" <> cid)
case docker.connect(config.docker_socket) {
Ok(sock) -> {
let code = case docker.wait_container(sock, cid) {
Ok(c) -> c
Ok(c) -> {
debug_log("WAIT_RESULT cid=" <> cid <> " code=" <> int.to_string(c))
c
}
Error(e) -> {
wisp.log_warning(
"container_monitor: wait failed: " <> docker.describe_error(e),
)
debug_log(
"WAIT_FAILED cid=" <> cid <> " err=" <> docker.describe_error(e),
)
-1
}
}
Expand All @@ -168,11 +175,19 @@ fn wait_for_exit(config: Config, cid: String) -> Int {
wisp.log_warning(
"container_monitor: docker connect: " <> docker.describe_error(e),
)
debug_log(
"WAIT_CONNECT_FAILED cid=" <> cid <> " err=" <> docker.describe_error(e),
)
-1
}
}
}

// DEBUG: best-effort append of one diagnostic event to /tmp/fbi-debug.log
// (the file CI uploads as an artifact); write failures are deliberately
// ignored so logging can never affect the monitor's behavior.
// NOTE(review): unlike actor.gleam's debug_log this writes no timestamp
// prefix — consider adding one if these events must be ordered against
// the actor's when reading the combined log.
fn debug_log(msg: String) -> Nil {
  let line = msg <> "\n"
  case simplifile.append(to: "/tmp/fbi-debug.log", contents: line) {
    _ -> Nil
  }
}

fn read_outcome(state_dir: String, exit_code: Int) -> RunOutcome {
case simplifile.read(state_dir <> "/result.json") {
Error(_) ->
Expand Down
34 changes: 27 additions & 7 deletions src/server/src/fbi/run/worker.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,21 @@ fn do_real_launch(input: LaunchInput) -> Result(#(String, String), String) {
input.config,
on_log,
))
use _ <- result.try(setup_run_dir(input))
let container_name = "fbi-run-" <> run_id
// Remove any pre-existing container with this name. Cancel paths and
// crash recovery don't always reach transition_to_finishing (which is
// the only place that calls remove_container), so retrying the same
// run id otherwise hits a "container name in use" error from Docker.
// force=true also handles the case where it's still running.
// Remove any pre-existing container with this name BEFORE setup_run_dir.
// Cancel paths and crash recovery don't always reach
// transition_to_finishing (which is the only place that calls
// remove_container), so retrying the same run id otherwise hits a
// "container name in use" error from Docker. force=true also handles
// the case where it's still running. Order matters: while an old
// container holds the bind mount on run_dir/state, `del_dir_r` fails
// with EBUSY and stale state files (notably result.json) survive into
// the new run.
let _ =
with_docker(input.config.docker_socket, fn(sock) {
docker.remove_container(sock, container_name, True)
})
use _ <- result.try(setup_run_dir(input))
let spec = container_spec(input, image_tag)
wisp.log_debug("run " <> run_id <> ": creating container image=" <> image_tag)
use cid <- result.try(
Expand Down Expand Up @@ -127,12 +131,20 @@ fn do_mock_launch(input: LaunchInput) -> Result(#(String, String), String) {
Some(p) -> Ok(p)
None -> Error("FBI_QUANTICO_BINARY_PATH not set; mock runs require it")
})
use _ <- result.try(setup_run_dir(input))
// Force-remove any prior container with this name BEFORE setup_run_dir.
// SQLite reuses rowids when prior runs are deleted (no AUTOINCREMENT),
// so a fresh run can land on an id whose state dir is still bind-mounted
// by a not-yet-removed container. While that bind mount is alive, the
// host-side `del_dir_r` of run_dir/state can't remove the directory
// (EBUSY) and stale files (notably result.json) survive — read_outcome
// then mistakes the *prior* run's exit code for the new run's, which is
// exactly the source of the hang-test "succeeded" flake.
let container_name = "fbi-run-" <> run_id
let _ =
with_docker(input.config.docker_socket, fn(sock) {
docker.remove_container(sock, container_name, True)
})
use _ <- result.try(setup_run_dir(input))
let spec = mock_container_spec(input, quantico_path, scenario)
wisp.log_debug(
"run " <> run_id <> ": creating mock container scenario=" <> scenario,
Expand Down Expand Up @@ -417,6 +429,14 @@ fn setup_run_dir(input: LaunchInput) -> Result(Nil, String) {
let _ = simplifile.delete(run_dir <> "/state")
let _ = simplifile.delete(run_dir <> "/wip")
let _ = simplifile.delete(scripts_dir)
// Belt-and-suspenders: even if `delete(run_dir/state)` fails (EBUSY
// when a prior container still has the bind mount), nuke the
// individual signal files by path so read_outcome / poll_status_loop
// can't see stale values from the previous run id.
let _ = simplifile.delete(run_dir <> "/state/result.json")
let _ = simplifile.delete(run_dir <> "/state/agent-status")
let _ = simplifile.delete(run_dir <> "/state/session-id")
let _ = simplifile.delete(run_dir <> "/state/ready")
use _ <- result.try(
simplifile.create_directory_all(scripts_dir)
|> result.map_error(fn(e) {
Expand Down
22 changes: 21 additions & 1 deletion tests/quantico/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,14 +228,34 @@ fn run_scenario(
state_dir: state_dir.clone(),
};
let exit_code = match executor::run(&scenario, &mut ctx) {
Ok(executor::Outcome::Exited(c)) => c,
Ok(executor::Outcome::Exited(c)) => {
if let Some(ref dir) = state_dir {
let _ = std::fs::write(
dir.join("quantico-debug.log"),
format!("OUTCOME=Exited({})\n", c),
);
}
c
}
Ok(executor::Outcome::SleepingForever) => {
if let Some(ref dir) = state_dir {
let _ = std::fs::write(
dir.join("quantico-debug.log"),
"OUTCOME=SleepingForever\n",
);
}
// Block forever (until SIGKILL). SIGTERM honoured by default.
loop {
std::thread::park();
}
}
Err(e) => {
if let Some(ref dir) = state_dir {
let _ = std::fs::write(
dir.join("quantico-debug.log"),
format!("OUTCOME=Err: {} (kind={:?})\n", e, e.kind()),
);
}
let _ = writeln!(std::io::stderr(), "quantico: io error: {}", e);
1
}
Expand Down
Loading