diff --git a/gpu_test/conftest.py b/gpu_test/conftest.py
index 9b20e14..fabfa0f 100644
--- a/gpu_test/conftest.py
+++ b/gpu_test/conftest.py
@@ -181,7 +181,8 @@ def _wait_for_ssh(self) -> None:
                     self.ssh_host,
                     self.ssh_port,
                 )
-                # Upload and compile the C++ runner
+                # Wait for sshd to actually accept connections, then compile runner
+                self._wait_for_sshd()
                 self._compile_runner()
                 return
 
@@ -190,6 +191,41 @@ def _wait_for_ssh(self) -> None:
         msg = f"Instance {self.instance_id} did not become SSH-ready within {POLL_TIMEOUT_S}s"
         raise TimeoutError(msg)
 
+    def _wait_for_sshd(self) -> None:
+        """Poll until sshd is actually accepting connections.
+
+        Runs a no-op ``true`` command over SSH, sleeping POLL_INTERVAL_S
+        between probes, until one exits with status 0. A probe that outlives
+        its own 15 s subprocess timeout counts as "not ready yet" rather than
+        aborting the wait: with ConnectTimeout=10 and ConnectionAttempts=3 a
+        single ssh invocation can legitimately run longer than that.
+
+        Raises:
+            TimeoutError: if no probe succeeds within POLL_TIMEOUT_S seconds.
+        """
+        deadline = time.monotonic() + POLL_TIMEOUT_S
+        while time.monotonic() < deadline:
+            try:
+                result = subprocess.run(
+                    [*self._ssh_cmd(), "true"],
+                    capture_output=True,
+                    timeout=15,
+                    check=False,
+                )
+            except subprocess.TimeoutExpired:
+                # subprocess.run has already killed the hung ssh client;
+                # keep polling instead of letting the exception escape.
+                logger.debug("sshd probe timed out on %s:%s", self.ssh_host, self.ssh_port)
+            else:
+                if result.returncode == 0:
+                    logger.info("sshd ready on %s:%s", self.ssh_host, self.ssh_port)
+                    return
+            time.sleep(POLL_INTERVAL_S)
+        msg = (
+            f"sshd on {self.ssh_host}:{self.ssh_port} did not become ready within {POLL_TIMEOUT_S}s"
+        )
+        raise TimeoutError(msg)
+
     def _compile_runner(self) -> None:
         """Upload warpforth-runner.cpp and compile it on the remote host."""
         self.scp_upload(RUNNER_SRC, f"{REMOTE_TMP}/warpforth-runner.cpp")
@@ -241,6 +277,8 @@ def _ssh_cmd(self) -> list[str]:
             "-o",
             "ConnectTimeout=10",
             "-o",
+            "ConnectionAttempts=3",
+            "-o",
             "LogLevel=ERROR",
             "-p",
             str(self.ssh_port),
@@ -271,6 +309,10 @@ def scp_upload(self, local_path: str | Path, remote_path: str) -> None:
             "-o",
             "UserKnownHostsFile=/dev/null",
             "-o",
+            "ConnectTimeout=10",
+            "-o",
+            "ConnectionAttempts=3",
+            "-o",
             "LogLevel=ERROR",
             "-P",
             str(self.ssh_port),