From 46dedcb8c07c49af9a5eeef598264de82f526b45 Mon Sep 17 00:00:00 2001
From: Andrew Sy Kim
Date: Tue, 12 May 2026 14:40:09 +0000
Subject: [PATCH] [tx] add retries for getting random port used for JAX coordinator address

Signed-off-by: Andrew Sy Kim
---
 skyrl/backends/ray_jax.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/skyrl/backends/ray_jax.py b/skyrl/backends/ray_jax.py
index 52cf97937e..fa268c685d 100644
--- a/skyrl/backends/ray_jax.py
+++ b/skyrl/backends/ray_jax.py
@@ -10,9 +10,27 @@
 
 
 def _get_random_port() -> int:
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(("", 0))
-        return s.getsockname()[1]
+    """Return a TCP port number that the OS reports as currently free.
+
+    Binding to port 0 lets the kernel pick an unused port, so the retries
+    only guard against transient ``OSError``s from creating or binding the
+    socket. The socket is closed before returning, so another process may
+    still claim the port before the caller binds it.
+
+    Raises:
+        RuntimeError: if no port could be allocated after 10 attempts; the
+            last ``OSError`` is chained as the cause.
+    """
+    last_error = None
+    for _ in range(10):
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.bind(("", 0))
+                return s.getsockname()[1]
+        except OSError as e:
+            # remember the failure so the final error explains why we gave up
+            last_error = e
+    raise RuntimeError("Could not allocate a free port") from last_error
 
 
 @ray.remote