Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,32 @@ jobs:
with:
name: playwright-test-results
path: test-results
- uses: actions/upload-artifact@v4
if: always()
with:
name: fbi-debug-log
path: /tmp/fbi-debug.log
if-no-files-found: ignore
- name: Collect quantico per-run debug logs
if: always()
run: |
set +e
mkdir -p /tmp/fbi-runs-state
if [ -d /tmp/fbi-e2e-runs ]; then
for run_dir in /tmp/fbi-e2e-runs/*/; do
run_id=$(basename "$run_dir")
if [ -d "$run_dir/state" ]; then
cp -r "$run_dir/state" "/tmp/fbi-runs-state/run-$run_id" 2>/dev/null || true
fi
done
fi
ls -la /tmp/fbi-runs-state/ || true
- uses: actions/upload-artifact@v4
if: always()
with:
name: fbi-runs-state
path: /tmp/fbi-runs-state
if-no-files-found: ignore

image-scan:
name: Trivy scan agent image
Expand Down
9 changes: 4 additions & 5 deletions playwright.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@ export default defineConfig({
testDir: 'tests/e2e/quantico',
timeout: 120_000,
fullyParallel: false, // shared FBI server
// Retry up to 2× on CI to absorb genuinely-flaky timing failures
// (auto-scroll re-pin race, hang container kill propagation under
// parallel-test load — both observed flaking through retries=1 on PR
// CI). Keep 0 locally so flakes are visible during development.
retries: process.env.CI ? 2 : 0,
// TEMP: retries=0 on the debug/hang-flake branch so the suspect flakes
// surface and we can capture /tmp/fbi-debug.log. Bring back the 2 once
// we've identified root cause.
retries: 0,
// Warm the orchestrator's docker image cache before any tests run.
// Without this, the first 1-2 tests on a fresh CI runner race a 2-3 min
// image build and fail their tight assertions.
Expand Down
46 changes: 44 additions & 2 deletions src/server/src/fbi/run/actor.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,37 @@ fn handle(state: State, msg: RunMsg) -> actor.Next(State, RunMsg) {
Running(cid, _, bc, _, _), ContainerExited(outcome) ->
transition_to_waiting(state, cid, bc, outcome)
Running(cid, _, _, _, _), Cancel -> {
debug_log("CANCEL run=" <> int.to_string(state.run_id) <> " cid=" <> cid)
case docker.connect(state.config.docker_socket) {
Ok(sock) -> {
let _ = docker.kill_container(sock, cid)
let kill_result = docker.kill_container(sock, cid)
docker.close(sock)
debug_log(
"CANCEL_KILLED run="
<> int.to_string(state.run_id)
<> " cid="
<> cid
<> " result="
<> case kill_result {
Ok(_) -> "ok"
Error(e) -> "err:" <> docker.describe_error(e)
},
)
}
Error(e) ->
Error(e) -> {
wisp.log_warning(
"run "
<> int.to_string(state.run_id)
<> " cancel connect failed: "
<> docker.describe_error(e),
)
debug_log(
"CANCEL_CONNECT_FAILED run="
<> int.to_string(state.run_id)
<> " err="
<> docker.describe_error(e),
)
}
}
actor.continue(state)
}
Expand Down Expand Up @@ -297,6 +316,19 @@ fn transition_to_waiting(
<> " finished exit_code="
<> int.to_string(outcome.exit_code),
)
debug_log(
"TRANSITION_TO_WAITING run="
<> int.to_string(state.run_id)
<> " cid="
<> cid
<> " exit_code="
<> int.to_string(outcome.exit_code)
<> " err="
<> case outcome.error_message {
None -> "none"
Some(e) -> e
},
)
let db_outcome =
runs_db.RunOutcome(
exit_code: outcome.exit_code,
Expand Down Expand Up @@ -417,3 +449,13 @@ fn transcript_size(config: Config, run_id: Int) -> Int {

// Current time in milliseconds, provided by the Erlang `fbi_time` module;
// used to timestamp debug_log lines for ordering events in the artifact.
// NOTE(review): whether this is monotonic or wall-clock time is not visible
// from here — confirm in fbi_time before comparing stamps across processes.
@external(erlang, "fbi_time", "now_ms")
fn now_ms() -> Int

// DEBUG: temporary diagnostic logger for the hang-test "succeeded" flake.
// Prefixes each event with a now_ms() timestamp and appends it to
// /tmp/fbi-debug.log; CI uploads that file as an artifact on test failure.
// Best-effort by design: an append failure is silently ignored so logging
// can never affect run behavior. Remove this and its callers once the
// root cause is identified.
fn debug_log(msg: String) -> Nil {
  let stamped = int.to_string(now_ms()) <> " " <> msg <> "\n"
  case simplifile.append(to: "/tmp/fbi-debug.log", contents: stamped) {
    _ -> Nil
  }
}
17 changes: 16 additions & 1 deletion src/server/src/fbi/run/container_monitor.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,21 @@ pub fn read_agent_status(state_dir: String) -> Option(String) {
}

fn wait_for_exit(config: Config, cid: String) -> Int {
debug_log("WAIT_BEGIN cid=" <> cid)
case docker.connect(config.docker_socket) {
Ok(sock) -> {
let code = case docker.wait_container(sock, cid) {
Ok(c) -> c
Ok(c) -> {
debug_log("WAIT_RESULT cid=" <> cid <> " code=" <> int.to_string(c))
c
}
Error(e) -> {
wisp.log_warning(
"container_monitor: wait failed: " <> docker.describe_error(e),
)
debug_log(
"WAIT_FAILED cid=" <> cid <> " err=" <> docker.describe_error(e),
)
-1
}
}
Expand All @@ -168,11 +175,19 @@ fn wait_for_exit(config: Config, cid: String) -> Int {
wisp.log_warning(
"container_monitor: docker connect: " <> docker.describe_error(e),
)
debug_log(
"WAIT_CONNECT_FAILED cid=" <> cid <> " err=" <> docker.describe_error(e),
)
-1
}
}
}

// DEBUG: best-effort append of one diagnostic event to /tmp/fbi-debug.log
// (the file CI uploads as an artifact); write failures are deliberately
// ignored so logging can never affect the monitor's behavior.
// NOTE(review): unlike actor.gleam's debug_log this writes no timestamp
// prefix — consider adding one if these events must be ordered against
// the actor's when reading the combined log.
fn debug_log(msg: String) -> Nil {
  let line = msg <> "\n"
  case simplifile.append(to: "/tmp/fbi-debug.log", contents: line) {
    _ -> Nil
  }
}

fn read_outcome(state_dir: String, exit_code: Int) -> RunOutcome {
case simplifile.read(state_dir <> "/result.json") {
Error(_) ->
Expand Down
34 changes: 27 additions & 7 deletions src/server/src/fbi/run/worker.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,21 @@ fn do_real_launch(input: LaunchInput) -> Result(#(String, String), String) {
input.config,
on_log,
))
use _ <- result.try(setup_run_dir(input))
let container_name = "fbi-run-" <> run_id
// Remove any pre-existing container with this name. Cancel paths and
// crash recovery don't always reach transition_to_finishing (which is
// the only place that calls remove_container), so retrying the same
// run id otherwise hits a "container name in use" error from Docker.
// force=true also handles the case where it's still running.
// Remove any pre-existing container with this name BEFORE setup_run_dir.
// Cancel paths and crash recovery don't always reach
// transition_to_finishing (which is the only place that calls
// remove_container), so retrying the same run id otherwise hits a
// "container name in use" error from Docker. force=true also handles
// the case where it's still running. Order matters: while an old
// container holds the bind mount on run_dir/state, `del_dir_r` fails
// with EBUSY and stale state files (notably result.json) survive into
// the new run.
let _ =
with_docker(input.config.docker_socket, fn(sock) {
docker.remove_container(sock, container_name, True)
})
use _ <- result.try(setup_run_dir(input))
let spec = container_spec(input, image_tag)
wisp.log_debug("run " <> run_id <> ": creating container image=" <> image_tag)
use cid <- result.try(
Expand Down Expand Up @@ -127,12 +131,20 @@ fn do_mock_launch(input: LaunchInput) -> Result(#(String, String), String) {
Some(p) -> Ok(p)
None -> Error("FBI_QUANTICO_BINARY_PATH not set; mock runs require it")
})
use _ <- result.try(setup_run_dir(input))
// Force-remove any prior container with this name BEFORE setup_run_dir.
// SQLite reuses rowids when prior runs are deleted (no AUTOINCREMENT),
// so a fresh run can land on an id whose state dir is still bind-mounted
// by a not-yet-removed container. While that bind mount is alive, the
// host-side `del_dir_r` of run_dir/state can't remove the directory
// (EBUSY) and stale files (notably result.json) survive — read_outcome
// then mistakes the *prior* run's exit code for the new run's, which is
// exactly the source of the hang-test "succeeded" flake.
let container_name = "fbi-run-" <> run_id
let _ =
with_docker(input.config.docker_socket, fn(sock) {
docker.remove_container(sock, container_name, True)
})
use _ <- result.try(setup_run_dir(input))
let spec = mock_container_spec(input, quantico_path, scenario)
wisp.log_debug(
"run " <> run_id <> ": creating mock container scenario=" <> scenario,
Expand Down Expand Up @@ -417,6 +429,14 @@ fn setup_run_dir(input: LaunchInput) -> Result(Nil, String) {
let _ = simplifile.delete(run_dir <> "/state")
let _ = simplifile.delete(run_dir <> "/wip")
let _ = simplifile.delete(scripts_dir)
// Belt-and-suspenders: even if `delete(run_dir/state)` fails (EBUSY
// when a prior container still has the bind mount), nuke the
// individual signal files by path so read_outcome / poll_status_loop
// can't see stale values from the previous run id.
let _ = simplifile.delete(run_dir <> "/state/result.json")
let _ = simplifile.delete(run_dir <> "/state/agent-status")
let _ = simplifile.delete(run_dir <> "/state/session-id")
let _ = simplifile.delete(run_dir <> "/state/ready")
use _ <- result.try(
simplifile.create_directory_all(scripts_dir)
|> result.map_error(fn(e) {
Expand Down
22 changes: 21 additions & 1 deletion tests/quantico/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,14 +228,34 @@ fn run_scenario(
state_dir: state_dir.clone(),
};
let exit_code = match executor::run(&scenario, &mut ctx) {
Ok(executor::Outcome::Exited(c)) => c,
Ok(executor::Outcome::Exited(c)) => {
if let Some(ref dir) = state_dir {
let _ = std::fs::write(
dir.join("quantico-debug.log"),
format!("OUTCOME=Exited({})\n", c),
);
}
c
}
Ok(executor::Outcome::SleepingForever) => {
if let Some(ref dir) = state_dir {
let _ = std::fs::write(
dir.join("quantico-debug.log"),
"OUTCOME=SleepingForever\n",
);
}
// Block forever (until SIGKILL). SIGTERM honoured by default.
loop {
std::thread::park();
}
}
Err(e) => {
if let Some(ref dir) = state_dir {
let _ = std::fs::write(
dir.join("quantico-debug.log"),
format!("OUTCOME=Err: {} (kind={:?})\n", e, e.kind()),
);
}
let _ = writeln!(std::io::stderr(), "quantico: io error: {}", e);
1
}
Expand Down
Loading