From 9e9ba67c22577b1b989b63f94a50f8c19a75b4cf Mon Sep 17 00:00:00 2001
From: xyuzh <xinyzng@gmail.com>
Date: Mon, 18 May 2026 12:08:47 -0700
Subject: [PATCH 1/3] [workers] Clean teardown on SIGTERM: drain CUDA streams +
 destroy process group

When a k8s pod is evicted (preemption, scale-down, node drain) the container
gets SIGTERM with a 25s grace period before SIGKILL. Without a handler, in-flight
NCCL collectives leak communicators and the next run may hit stale process group
state.

Add a SIGTERM handler inside DistributedTorchRayActor.init_worker_process_group()
that:
  - calls torch.cuda.synchronize() to drain any in-flight CUDA work
  - calls torch.distributed.destroy_process_group() to release NCCL
  - exits cleanly with sys.exit(0)

Both calls are wrapped in try/except so a partial-state worker still tears down
the half that's healthy. The whole sequence is well under the 25s grace window.

The destroy_process_group() call is guarded (`torch.distributed.is_available()`
and `is_initialized()`) so it does nothing when distributed isn't set up yet.
---
 skyrl/backends/skyrl_train/workers/worker.py | 24 ++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/skyrl/backends/skyrl_train/workers/worker.py b/skyrl/backends/skyrl_train/workers/worker.py
index 43a73fc28c..4aa02f3842 100644
--- a/skyrl/backends/skyrl_train/workers/worker.py
+++ b/skyrl/backends/skyrl_train/workers/worker.py
@@ -1,7 +1,9 @@
 import asyncio
 import logging
 import os
+import signal
 import socket
+import sys
 from collections import defaultdict
 from ctypes import CDLL, POINTER, Structure, c_char_p, c_int, c_ulong, c_void_p
 from datetime import timedelta
@@ -121,6 +123,28 @@ def init_worker_process_group(self):
                 backend="cpu:gloo,cuda:nccl", timeout=timedelta(seconds=SKYRL_WORKER_NCCL_TIMEOUT_IN_S)
             )
 
+        # Clean teardown on k8s SIGTERM: drain CUDA streams + release NCCL
+        # communicators before the 25s grace period elapses.
+        rank = self._rank
+
+        def _sigterm_cleanup(signum, frame):
+            logger.warning(f"SIGTERM received in worker rank={rank}, cleaning up...")
+
+            try:
+                torch.cuda.synchronize()
+            except Exception as e:
+                logger.warning(f"cuda.synchronize() failed: {e}")
+
+            try:
+                if torch.distributed.is_available() and torch.distributed.is_initialized():
+                    torch.distributed.destroy_process_group()
+            except Exception as e:
+                logger.warning(f"destroy_process_group() failed: {e}")
+
+            sys.exit(0)
+
+        signal.signal(signal.SIGTERM, _sigterm_cleanup)
+
         # setup device mesh
         # TODO: Support TP / PP for additional backends
         # NOTE (sumanthrh): Device mesh and mesh rank are rank specific attributes. For the current way the strategy is defined,

From eb8d6126d034180fc280c1c64d3b672ef76513f1 Mon Sep 17 00:00:00 2001
From: xyuzh <xinyzng@gmail.com>
Date: Mon, 18 May 2026 12:14:24 -0700
Subject: [PATCH 2/3] Guard cuda.synchronize() with torch.cuda.is_available()

Skip the call entirely on CPU-only environments so we don't generate a noisy
warning every time a non-CUDA worker is terminated. Only emit a warning if
synchronize() actually fails on a CUDA-capable system.
---
 skyrl/backends/skyrl_train/workers/worker.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/skyrl/backends/skyrl_train/workers/worker.py b/skyrl/backends/skyrl_train/workers/worker.py
index 4aa02f3842..d0ae5af0cc 100644
--- a/skyrl/backends/skyrl_train/workers/worker.py
+++ b/skyrl/backends/skyrl_train/workers/worker.py
@@ -130,10 +130,11 @@ def init_worker_process_group(self):
         def _sigterm_cleanup(signum, frame):
             logger.warning(f"SIGTERM received in worker rank={rank}, cleaning up...")
 
-            try:
-                torch.cuda.synchronize()
-            except Exception as e:
-                logger.warning(f"cuda.synchronize() failed: {e}")
+            if torch.cuda.is_available():
+                try:
+                    torch.cuda.synchronize()
+                except Exception as e:
+                    logger.warning(f"cuda.synchronize() failed: {e}")
 
             try:
                 if torch.distributed.is_available() and torch.distributed.is_initialized():

From 983b6a27ba0eba821ce7e9b153ca2d5363a53e35 Mon Sep 17 00:00:00 2001
From: xyuzh <xinyzng@gmail.com>
Date: Mon, 18 May 2026 12:27:06 -0700
Subject: [PATCH 3/3] [examples][infra] Anyscale 2-node Qwen3 launch flow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds an Anyscale-friendly launch path for Qwen3-30B-A3B and Qwen3-235B-A22B
on 2 H100 nodes. The key piece is a one-shot venv build on shared NFS that
all Ray actors share via py_executable — without it every actor independently
ran uv install of skyrl[megatron], causing NCCL rendezvous to time out before
all 16 actors registered.

Built on top of the SIGTERM teardown PR (#1688) so preemption / scale-down
doesn't leak NCCL communicators.
---
 .../megatron/anyscale_qwen3_235b_2nodes.yaml  | 135 ++++++++++++++
 .../megatron/anyscale_qwen3_30b_2nodes.yaml   |  72 ++++++++
 .../train/megatron/build_shared_venv.yaml     |  55 ++++++
 examples/train/megatron/clear_uv_cache.yaml   |  20 ++
 examples/train/megatron/clear_venv.yaml       |  19 ++
 examples/train/megatron/download_model.yaml   |  36 ++++
 examples/train/megatron/dump_diag.yaml        |  31 ++++
 examples/train/megatron/inspect_cudnn.yaml    |  31 ++++
 .../run_megatron_qwen3_235b_2nodes.sh         | 173 ++++++++++++++++++
 .../megatron/run_megatron_qwen3_30b_2nodes.sh | 137 ++++++++++++++
 10 files changed, 709 insertions(+)
 create mode 100644 examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml
 create mode 100644 examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml
 create mode 100644 examples/train/megatron/build_shared_venv.yaml
 create mode 100644 examples/train/megatron/clear_uv_cache.yaml
 create mode 100644 examples/train/megatron/clear_venv.yaml
 create mode 100644 examples/train/megatron/download_model.yaml
 create mode 100644 examples/train/megatron/dump_diag.yaml
 create mode 100644 examples/train/megatron/inspect_cudnn.yaml
 create mode 100644 examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh
 create mode 100644 examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh

diff --git a/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml b/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml
new file mode 100644
index 0000000000..bf39a62a0d
--- /dev/null
+++ b/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml
@@ -0,0 +1,135 @@
+name: qwen3-235b-a22b-rl-training
+entrypoint: |
+  set -e
+
+  # === DIAGNOSTICS: persistent log file survives head pod restarts ===
+  DIAG_LOG=/mnt/cluster_storage/head_diag.log
+  RUN_TAG="run-$(date -u +%Y%m%dT%H%M%SZ)-pid$$"
+  echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG"
+  uname -a 2>&1 | tee -a "$DIAG_LOG"
+  cat /etc/os-release 2>/dev/null | tee -a "$DIAG_LOG" || true
+  echo "--- env (filtered) ---" | tee -a "$DIAG_LOG"
+  env | grep -E "^(POD|NODE|HOSTNAME|KUEUE|RAY|ANYSCALE|HEAD)" 2>&1 | tee -a "$DIAG_LOG" || true
+  echo "--- container resource limits (cgroup v2) ---" | tee -a "$DIAG_LOG"
+  cat /sys/fs/cgroup/memory.max 2>/dev/null | tee -a "$DIAG_LOG" || echo "(memory.max not readable)" | tee -a "$DIAG_LOG"
+  cat /sys/fs/cgroup/memory.current 2>/dev/null | tee -a "$DIAG_LOG" || true
+  cat /sys/fs/cgroup/cpu.max 2>/dev/null | tee -a "$DIAG_LOG" || true
+  echo "--- previous boot dmesg (oom_killer if present) ---" | tee -a "$DIAG_LOG"
+  dmesg 2>/dev/null | tail -200 | tee -a "$DIAG_LOG" || echo "(dmesg not accessible)" | tee -a "$DIAG_LOG"
+  echo "--- initial free / df ---" | tee -a "$DIAG_LOG"
+  free -h 2>&1 | tee -a "$DIAG_LOG"
+  df -h 2>&1 | tee -a "$DIAG_LOG"
+
+  # Trap signals so we know WHY we're exiting (k8s preemption sends SIGTERM)
+  trap 'rc=$?; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG"; free -h 2>&1 | tee -a "$DIAG_LOG"; ps -eo pid,ppid,rss,vsz,pcpu,pmem,etime,comm --sort=-rss 2>&1 | head -20 | tee -a "$DIAG_LOG"; exit $rc' EXIT
+  trap 'echo "[$RUN_TAG] SIGTERM received at $(date -u)" | tee -a "$DIAG_LOG"; free -h 2>&1 | tee -a "$DIAG_LOG"; ps -eo pid,ppid,rss,vsz,pcpu,pmem,etime,comm --sort=-rss 2>&1 | head -20 | tee -a "$DIAG_LOG"' TERM
+  trap 'echo "[$RUN_TAG] SIGINT received at $(date -u)" | tee -a "$DIAG_LOG"' INT
+  trap 'echo "[$RUN_TAG] SIGHUP received at $(date -u)" | tee -a "$DIAG_LOG"' HUP
+
+  # Background monitor: every 30s dump mem/cpu/disk to DIAG_LOG
+  (
+    while true; do
+      ts=$(date -u +%H:%M:%S)
+      mem=$(free -m 2>/dev/null | awk '/^Mem:/{printf "used=%dMi total=%dMi avail=%dMi", $3, $2, $7}')
+      cgmem=$(cat /sys/fs/cgroup/memory.current 2>/dev/null)
+      cgmax=$(cat /sys/fs/cgroup/memory.max 2>/dev/null)
+      load=$(cat /proc/loadavg 2>/dev/null | awk '{print $1,$2,$3}')
+      diskcs=$(df -h /mnt/cluster_storage 2>/dev/null | tail -1 | awk '{print "cs="$5"("$3"/"$2")"}')
+      diskroot=$(df -h / 2>/dev/null | tail -1 | awk '{print "/="$5"("$3"/"$2")"}')
+      echo "[$RUN_TAG] $ts $mem cg_used=$cgmem cg_max=$cgmax load=$load $diskcs $diskroot" >> "$DIAG_LOG"
+      sleep 30
+    done
+  ) &
+  MONITOR_PID=$!
+  echo "[$RUN_TAG] background monitor pid=$MONITOR_PID" | tee -a "$DIAG_LOG"
+
+  # === ACTUAL JOB ===
+  # Install nvidia headers to shared cluster storage so all worker nodes can find them
+  # via CPATH/LD_LIBRARY_PATH below. Only the head runs this; workers mount the same FS.
+  NV=/mnt/cluster_storage/nv_pkg
+  if [ ! -f "$NV/nvidia/cudnn/include/cudnn.h" ]; then
+    echo "[bootstrap] installing nvidia headers to $NV" | tee -a "$DIAG_LOG"
+    /home/ray/anaconda3/bin/python3 -m pip install -q --target "$NV" \
+      nvidia-cudnn-cu12 nvidia-nccl-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 \
+      nvidia-cusolver-cu12 nvidia-curand-cu12 nvidia-cufft-cu12 nvidia-cuda-runtime-cu12 \
+      nvidia-cuda-nvrtc-cu12 nvidia-cuda-cupti-cu12 nvidia-nvjitlink-cu12 nvidia-nvtx-cu12 \
+      nvidia-cudnn-frontend
+  fi
+  ls -la "$NV/nvidia/cudnn/include/cudnn.h"
+  bash examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+# Use the shared pre-built venv on NFS for all Ray worker actors.
+py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python"
+
+# cuDNN/NCCL/cublas paths point to our pre-installed nvidia packages on shared NFS
+# (set up by the head entrypoint preamble). /opt/cudnn doesn't exist in this image.
+env_vars:
+  # Disable Ray's auto "uv run" propagation. Don't set RAY_RUNTIME_ENV_HOOK
+  # to empty here — unset it in the entrypoint shell instead (Ray's load_class
+  # crashes on empty string).
+  RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0"
+  CUDA_HOME: "/usr/local/cuda"
+  CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
+  C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
+  LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"
+  CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
+  # Workers compile transformer-engine-torch from source on first start; that takes
+  # 5-10 minutes which exceeds Ray's default PG timeout. Give it 30 minutes.
+  SKYRL_RAY_PG_TIMEOUT_IN_S: "1800"
+  # 235B model on 16 GPUs (2 engines x TP=8) needs much longer than the default 600s to become healthy.
+  SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600"
+  # Share uv's build/wheel cache across all nodes and runs via NFS. After TE
+  # builds once globally, every future cluster start pulls the cached wheel
+  # instead of rebuilding (cuts launch time from ~25 min cold to ~5 min warm).
+  UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
+  UV_LINK_MODE: "copy"
+  # ninja parallelism — 4 jobs ≈ 12 GiB peak (fits 32 GiB head; trivial for workers).
+  MAX_JOBS: "4"
+  NINJA_JOBS: "4"
+  CMAKE_BUILD_PARALLEL_LEVEL: "4"
+  # Ray's 60s worker-register timeout otherwise fires repeatedly during the
+  # first-time TE build, producing log noise + spurious worker process spawns.
+  RAY_worker_register_timeout_seconds: "1800"
+  # Critical: RAY_OVERRIDE_JOB_RUNTIME_ENV=1 replaces SkyRL's prepare_runtime_environment
+  # env vars with ours. Re-add all the ones SkyRL would have set (utils.py:615-647)
+  # so vLLM/megatron behave correctly.
+  RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
+  NVTE_FUSED_ATTN: "0"
+  VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true"
+  VLLM_ALLOW_INSECURE_SERIALIZATION: "1"
+  VLLM_DISABLE_COMPILE_CACHE: "1"
+  VLLM_USE_V1: "1"
+  VLLM_ENABLE_V1_MULTIPROCESSING: "0"
+  _SKYRL_USE_NEW_INFERENCE: "1"
+  # Cache HF downloads in shared FS so model doesn't redownload on restarts / per-worker.
+  HF_HOME: "/mnt/cluster_storage/hf_cache"
+  HF_HUB_ENABLE_HF_TRANSFER: "1"
+  # Allow YAML env_vars to be merged with SkyRL's runtime env instead of conflicting.
+  RAY_OVERRIDE_JOB_RUNTIME_ENV: "1"
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 8
+      memory: 32Gi
+  worker_nodes:
+    - required_resources:
+        CPU: 184
+        memory: 1920Gi
+        GPU: 8
+      required_labels:
+        ray.io/accelerator-type: H100
+      min_nodes: 2
+      max_nodes: 2
diff --git a/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml b/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml
new file mode 100644
index 0000000000..9a1df4222f
--- /dev/null
+++ b/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml
@@ -0,0 +1,72 @@
+name: qwen3-30b-validate
+entrypoint: |
+  set -e
+  DIAG_LOG=/mnt/cluster_storage/head_diag.log
+  RUN_TAG="run-30b-$(date -u +%Y%m%dT%H%M%SZ)-pid$$"
+  echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG"
+  trap 'rc=$?; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG"; exit $rc' EXIT
+  bash examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+# Use the shared pre-built venv on NFS for all Ray worker actors.
+# This eliminates per-actor `uv run` which was causing 16 redundant installs.
+py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python"
+
+env_vars:
+  # Disable Ray's auto "uv run" propagation — we are using a pre-built venv directly.
+  # NOTE: do NOT set RAY_RUNTIME_ENV_HOOK to empty string; Ray's load_class()
+  # treats that as "load class named ''" and crashes. We unset it inside the
+  # entrypoint script instead.
+  RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0"
+  UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
+  UV_LINK_MODE: "copy"
+  MAX_JOBS: "4"
+  NINJA_JOBS: "4"
+  CMAKE_BUILD_PARALLEL_LEVEL: "4"
+  RAY_worker_register_timeout_seconds: "1800"
+  SKYRL_RAY_PG_TIMEOUT_IN_S: "1800"
+  SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600"
+  HF_HOME: "/mnt/cluster_storage/hf_cache"
+  HF_HUB_ENABLE_HF_TRANSFER: "1"
+  RAY_OVERRIDE_JOB_RUNTIME_ENV: "1"
+  RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
+  NVTE_FUSED_ATTN: "0"
+  VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true"
+  VLLM_ALLOW_INSECURE_SERIALIZATION: "1"
+  VLLM_DISABLE_COMPILE_CACHE: "1"
+  VLLM_USE_V1: "1"
+  VLLM_ENABLE_V1_MULTIPROCESSING: "0"
+  _SKYRL_USE_NEW_INFERENCE: "1"
+  CUDA_HOME: "/usr/local/cuda"
+  # Point to our pre-installed cuDNN/NCCL/cublas etc. on shared NFS.
+  # /opt/cudnn doesn't exist in this image (Dockerfile.megatron paths don't apply).
+  CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
+  C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
+  LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"
+  CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 8
+      memory: 32Gi
+  worker_nodes:
+    - required_resources:
+        CPU: 184
+        memory: 1920Gi
+        GPU: 8
+      required_labels:
+        ray.io/accelerator-type: H100
+      min_nodes: 2
+      max_nodes: 2
diff --git a/examples/train/megatron/build_shared_venv.yaml b/examples/train/megatron/build_shared_venv.yaml
new file mode 100644
index 0000000000..82ba61eb56
--- /dev/null
+++ b/examples/train/megatron/build_shared_venv.yaml
@@ -0,0 +1,55 @@
+name: build-skyrl-shared-venv
+entrypoint: |
+  set -e
+  VENV=/mnt/cluster_storage/.skyrl-venv
+  STAMP=$VENV/.built-ok
+  if [ -f "$STAMP" ]; then
+    echo "[setup] venv already built at $VENV — verifying imports"
+    "$VENV/bin/python" -c "import torch, megatron.core, transformer_engine.pytorch, vllm, skyrl_gym, skyrl.train; print('OK')"
+    echo "[setup] no rebuild needed."
+    exit 0
+  fi
+  echo "[setup] building venv at $VENV  ($(date -u))"
+  rm -rf "$VENV"
+  # We need pyproject.toml + skyrl + skyrl-gym sources to run uv sync. The Anyscale
+  # working_dir is uploaded into the runtime_resources dir; uv runs from the current dir (no copy is made).
+  WD="$(pwd)"
+  echo "[setup] working_dir = $WD"
+  cd "$WD"
+  uv venv --python 3.12 "$VENV"
+  # --no-editable so the workspace packages (skyrl, skyrl-gym) are installed as
+  # regular packages inside the venv; otherwise their source path lives in this
+  # ephemeral pod and the venv breaks when training runs elsewhere.
+  UV_PROJECT_ENVIRONMENT="$VENV" uv sync --extra megatron --no-editable
+  "$VENV/bin/python" -c "import torch, megatron.core, transformer_engine.pytorch, vllm, skyrl_gym, skyrl.train; print('IMPORTS OK')"
+  touch "$STAMP"
+  echo "[setup] DONE at $(date -u). venv at $VENV"
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+env_vars:
+  UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
+  UV_LINK_MODE: "copy"
+  MAX_JOBS: "8"
+  NINJA_JOBS: "8"
+  CMAKE_BUILD_PARALLEL_LEVEL: "8"
+  CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
+  C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
+  LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"
+  CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
+  CUDA_HOME: "/usr/local/cuda"
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 8
+      memory: 32Gi
diff --git a/examples/train/megatron/clear_uv_cache.yaml b/examples/train/megatron/clear_uv_cache.yaml
new file mode 100644
index 0000000000..a36af8245d
--- /dev/null
+++ b/examples/train/megatron/clear_uv_cache.yaml
@@ -0,0 +1,20 @@
+name: clear-uv-cache
+entrypoint: |
+  echo "Before:"; du -sh /mnt/cluster_storage/.uv_cache 2>/dev/null
+  rm -rf /mnt/cluster_storage/.uv_cache 2>&1
+  echo "Done."
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 2
+      memory: 4Gi
diff --git a/examples/train/megatron/clear_venv.yaml b/examples/train/megatron/clear_venv.yaml
new file mode 100644
index 0000000000..948bff34b3
--- /dev/null
+++ b/examples/train/megatron/clear_venv.yaml
@@ -0,0 +1,19 @@
+name: clear-skyrl-venv
+entrypoint: |
+  echo "Before:"; du -sh /mnt/cluster_storage/.skyrl-venv 2>/dev/null
+  rm -rf /mnt/cluster_storage/.skyrl-venv 2>&1
+  echo "Done."
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 2
+      memory: 4Gi
diff --git a/examples/train/megatron/download_model.yaml b/examples/train/megatron/download_model.yaml
new file mode 100644
index 0000000000..4f993adcd5
--- /dev/null
+++ b/examples/train/megatron/download_model.yaml
@@ -0,0 +1,36 @@
+name: qwen-model-prefetch
+entrypoint: |
+  set -e
+  export HF_HOME=/mnt/cluster_storage/hf_cache
+  export HF_HUB_ENABLE_HF_TRANSFER=1
+  /home/ray/anaconda3/bin/python3 -m pip install -q --upgrade huggingface_hub hf_transfer
+  /home/ray/anaconda3/bin/python3 - <<'PY'
+  from huggingface_hub import snapshot_download
+  import time
+  t0 = time.time()
+  path = snapshot_download(
+      repo_id="Qwen/Qwen3-235B-A22B-Instruct-2507",
+      cache_dir="/mnt/cluster_storage/hf_cache/hub",
+      max_workers=8,
+      allow_patterns=["*.json", "*.txt", "*.safetensors", "tokenizer*"],
+  )
+  print(f"\n[prefetch] Downloaded to {path} in {time.time()-t0:.1f}s")
+  PY
+  echo "[prefetch] DONE"
+  ls -lh /mnt/cluster_storage/hf_cache/hub/models--Qwen--Qwen3-235B-A22B-Instruct-2507/snapshots/*/ | head -5
+  du -sh /mnt/cluster_storage/hf_cache/hub/models--Qwen--Qwen3-235B-A22B-Instruct-2507/
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 4
+      memory: 16Gi
diff --git a/examples/train/megatron/dump_diag.yaml b/examples/train/megatron/dump_diag.yaml
new file mode 100644
index 0000000000..fcca7e5a45
--- /dev/null
+++ b/examples/train/megatron/dump_diag.yaml
@@ -0,0 +1,31 @@
+name: dump-head-diag
+entrypoint: |
+  set -x
+  echo "===== cluster_storage layout ====="
+  ls -la /mnt/cluster_storage/ 2>&1
+  echo "===== hf_cache layout ====="
+  ls -la /mnt/cluster_storage/hf_cache/ 2>&1 || echo "(no hf_cache)"
+  echo "===== model directories ====="
+  find /mnt/cluster_storage/hf_cache/ -maxdepth 4 -type d 2>/dev/null | head -50
+  echo "===== safetensors files (sizes) ====="
+  find /mnt/cluster_storage/hf_cache/ -name "*.safetensors" -o -name "*.safetensors.tmp*" 2>/dev/null | xargs -I{} ls -lh {} 2>/dev/null | head -100
+  echo "===== uv cache size ====="
+  du -sh /mnt/cluster_storage/.uv_cache 2>/dev/null
+  ls /mnt/cluster_storage/.uv_cache/ 2>&1 | head -20
+  echo "===== tail of head_diag.log ====="
+  tail -30 /mnt/cluster_storage/head_diag.log 2>&1 || echo "(no diag log)"
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 2
+      memory: 4Gi
diff --git a/examples/train/megatron/inspect_cudnn.yaml b/examples/train/megatron/inspect_cudnn.yaml
new file mode 100644
index 0000000000..3b09c5bb8f
--- /dev/null
+++ b/examples/train/megatron/inspect_cudnn.yaml
@@ -0,0 +1,31 @@
+name: inspect-cudnn
+entrypoint: |
+  echo "===== /opt/cudnn (symlink) ====="
+  ls -la /opt/cudnn 2>&1
+  echo "===== /opt/cudnn target contents ====="
+  ls -la /opt/cudnn/ 2>&1
+  echo "===== /opt/cudnn/include ====="
+  ls -la /opt/cudnn/include/ 2>&1 | head -20
+  echo "===== find cudnn.h ====="
+  find / -name "cudnn.h" 2>/dev/null | head -20
+  echo "===== find libcudnn ====="
+  find / -name "libcudnn*" 2>/dev/null | head -20
+  echo "===== env CPATH / CUDNN_PATH ====="
+  echo "CPATH=$CPATH"
+  echo "CUDNN_PATH=$CUDNN_PATH"
+  echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 2
+      memory: 4Gi
diff --git a/examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh b/examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh
new file mode 100644
index 0000000000..01f2038fe5
--- /dev/null
+++ b/examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh
@@ -0,0 +1,173 @@
+set -x
+
+# Colocated DAPO+LoRA training+generation for Qwen3-235B-A22B on 2 nodes of 8xH100s.
+# Adapted from run_megatron_dapo_qwen3_235b_a22b_lora.sh (4-node) for 16-GPU setups.
+# Uses FP8 training (TransformerEngine) and FP8 vLLM inference to reduce VRAM pressure.
+
+# bash examples/train/algorithms/dapo/prepare_dapo_data.sh
+# bash examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh
+
+LOGGER="console"  # set to "wandb" with WANDB_API_KEY env var to enable wandb
+
+# Use shared cluster storage so workers can read the data files too.
+DATA_DIR="/mnt/cluster_storage/data/dapo"
+TRAIN_FILE="$DATA_DIR/dapo-math-17k.parquet"
+TEST_FILE="$DATA_DIR/aime-2024.parquet"
+
+mkdir -p "$DATA_DIR"
+[ -f "$TRAIN_FILE" ] || wget -q -O "$TRAIN_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k/resolve/main/data/dapo-math-17k.parquet?download=true"
+[ -f "$TEST_FILE" ] || wget -q -O "$TEST_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024/resolve/main/data/aime-2024.parquet?download=true"
+# download Qwen/Qwen3-235B-A22B-Instruct-2507 from huggingface
+# `pip install huggingface_hub hf_transfer`
+# `HF_HUB_ENABLE_HF_TRANSFER=1 hf download Qwen/Qwen3-235B-A22B-Instruct-2507 --local-dir ~/qwen235b`
+MODEL_NAME="Qwen/Qwen3-235B-A22B-Instruct-2507"
+
+NUM_NODES=2
+NUM_GPUS_PER_NODE=8
+
+### Megatron configuration
+# Qwen3-235B-A22B uses GQA with 4 groups, so max TP=4
+MEGATRON_TP=4
+# PP=2 (halved from 4-node config) keeps DP=2: TP x PP = 8, 16/8 = 2 DP
+MEGATRON_PP=2
+MEGATRON_CP=1
+# EP=2 (EP must divide DP; DP=2 so EP<=2)
+MEGATRON_EP=2
+MEGATRON_ETP=1
+# Qwen3-235B-A22B has 94 blocks; with PP=2, each stage holds 47 blocks
+MEGATRON_LAST_PIPELINE_STAGE_LAYER=47
+FLASH_ATTN=true
+# Optimizer offloading is essential for fitting 235B on 16 GPUs
+OPTIMIZER_OFFLOAD=true
+OPTIMIZER_OFFLOAD_FRACTION=1.0
+
+### Inference engine configuration
+INFERENCE_BACKEND="vllm"
+# 2 engines × TP=8 (each engine on a single node) avoids cross-node NCCL
+# in vLLM init, which was hanging silently.
+NUM_INFERENCE_ENGINES=2
+INFERENCE_ENGINE_TP=8
+# Lowered from 12000 to reduce KV cache VRAM on 16 GPUs
+INFERENCE_ENGINE_MAX_MODEL_LEN=8192
+
+### LoRA configuration
+LORA_RANK=128
+LORA_ALPHA=128
+
+### DAPO parameters
+CLIP_RATIO_LOW=0.2
+CLIP_RATIO_HIGH=0.28
+LOSS_REDUCTION="token_mean"
+APPLY_OVERLONG_FILTERING=true
+OVERLONG_BUFFER_LEN=$((1024 * 4))
+OVERLONG_BUFFER_PENALTY_FACTOR=1.0
+USE_KL_LOSS=false
+TEMPERATURE=1.0
+TOP_P=1.0
+EVAL_TOP_P=0.7
+CLIP_RATIO_C=10.0
+MAX_PROMPT_LENGTH=$((1024 * 2))
+MAX_RESPONSE_LENGTH=$((1024 * 8))
+
+### Batch sizes (conservative for 16-GPU memory budget)
+TRAIN_BATCH_SIZE=64      # halved from 128
+MINI_BATCH_SIZE=16       # halved from 32
+N_SAMPLES_PER_PROMPT=16
+EVAL_N_SAMPLES_PER_PROMPT=32
+ENFORCE_EAGER=true  # skip CUDA-graph capture — vLLM 235B+FP8 was hanging silently during capture
+LR=1e-5
+
+### Rollout correction
+TIS_RATIO_TYPE="token"
+TIS_IMP_RATIO_CAP=2.0
+
+# Disable Ray's uv-run propagation hook so worker actors use the shared venv
+# directly via py_executable instead of re-running uv install per actor.
+unset RAY_RUNTIME_ENV_HOOK
+
+# Forward SIGTERM/SIGINT to the python child so its in-process signal handler can
+# call dist.destroy_process_group() + cuda.synchronize() before k8s sends SIGKILL.
+trap 'echo "[script] forwarding SIGTERM to pid=$PID"; kill -TERM "$PID" 2>/dev/null; wait "$PID"' TERM INT
+
+# Launch with the shared pre-built venv interpreter (no `uv run`); Ray workers use the
+# same venv via py_executable. MAX_JOBS=4 / NINJA_JOBS=4 in env_vars caps build memory.
+/mnt/cluster_storage/.skyrl-venv/bin/python -m examples.train.algorithms.dapo.main_dapo \
+  data.train_data="['$TRAIN_FILE']" \
+  data.val_data="['$TEST_FILE']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.algorithm.policy_loss_type="dual_clip" \
+  trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \
+  trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \
+  trainer.algorithm.loss_reduction=$LOSS_REDUCTION \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
+  generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \
+  generator.sampling_params.temperature=$TEMPERATURE \
+  generator.sampling_params.top_p=$TOP_P \
+  generator.eval_sampling_params.top_p=$EVAL_TOP_P \
+  generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.policy.model.path=$MODEL_NAME \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_nodes=$NUM_NODES \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \
+  generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \
+  generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TP \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
+  trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
+  trainer.policy.model.lora.rank=$LORA_RANK \
+  trainer.policy.model.lora.alpha=$LORA_ALPHA \
+  trainer.algorithm.off_policy_correction.tis_ratio_type=$TIS_RATIO_TYPE \
+  trainer.algorithm.off_policy_correction.token_tis_ratio_clip_high=$TIS_IMP_RATIO_CAP \
+  trainer.policy.megatron_config.optimizer_config_kwargs.overlap_cpu_optimizer_d2h_h2d=$OPTIMIZER_OFFLOAD \
+  trainer.policy.megatron_config.optimizer_config_kwargs.use_precision_aware_optimizer=$OPTIMIZER_OFFLOAD \
+  trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_OFFLOAD \
+  trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \
+  trainer.policy.megatron_config.transformer_config_kwargs.num_layers_in_last_pipeline_stage=$MEGATRON_LAST_PIPELINE_STAGE_LAYER \
+  trainer.policy.megatron_config.transformer_config_kwargs.fp8=e4m3 \
+  trainer.policy.megatron_config.transformer_config_kwargs.fp8_margin=0 \
+  trainer.policy.megatron_config.transformer_config_kwargs.fp8_amax_history_len=1024 \
+  trainer.policy.megatron_config.transformer_config_kwargs.fp8_amax_compute_algo=max \
+  generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \
+  generator.inference_engine.engine_init_kwargs.quantization=fp8 \
+  trainer.algorithm.use_kl_loss=$USE_KL_LOSS \
+  trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \
+  trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \
+  trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \
+  trainer.use_sample_packing=true \
+  trainer.flash_attn=$FLASH_ATTN \
+  generator.inference_engine.backend=$INFERENCE_BACKEND \
+  generator.inference_engine.run_engines_locally=true \
+  generator.inference_engine.weight_sync_backend=nccl \
+  generator.inference_engine.async_engine=true \
+  generator.batched=true \
+  environment.env_class=aime \
+  generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
+  generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \
+  generator.inference_engine.gpu_memory_utilization=0.7 \
+  trainer.epochs=20 \
+  trainer.eval_batch_size=512 \
+  trainer.eval_before_train=true \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=$TRAIN_BATCH_SIZE \
+  trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \
+  trainer.micro_forward_batch_size_per_gpu=1 \
+  trainer.micro_train_batch_size_per_gpu=1 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=$MAX_PROMPT_LENGTH \
+  generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.policy.optimizer_config.lr=$LR \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="dapo_aime" \
+  trainer.run_name="dapo_qwen3_235b_a22b_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_ep${MEGATRON_EP}_lora_rank${LORA_RANK}_fp8_2nodes" \
+  trainer.export_path="$HOME/exports/dapo_qwen3_235b_a22b_fp8_2nodes" \
+  trainer.hf_save_interval=300 \
+  trainer.resume_mode=latest \
+  trainer.max_ckpts_to_keep=3 \
+  trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_235b_a22b_fp8_2nodes" \
+  "$@" &
+PID=$!
+wait "$PID"
diff --git a/examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh b/examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh
new file mode 100644
index 0000000000..1ced1f410a
--- /dev/null
+++ b/examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh
@@ -0,0 +1,172 @@
+set -x
+
+# Validation run: Qwen3-30B-A3B on 2 nodes × 8 H100s — known-good config.
+# Cloned from run_megatron_dapo_qwen3_30b_a3b.sh with:
+#   - data + ckpts on /mnt/cluster_storage
+#   - LOGGER=console (no wandb needed)
+#   - download data if missing
+#   - uv extra=megatron, no --isolated, NFS uv cache shared with the 235B run
+#
+# Extra command-line arguments are appended to the trainer invocation ("$@").
+
+LOGGER="console"
+
+# Download URL $2 to path $1 unless a non-empty file is already there.
+# wget -O creates its output file even when the transfer fails, so the download
+# goes to "$1.part" and is renamed only on success; otherwise a failed attempt
+# would leave an empty/truncated parquet that later runs mistake for the dataset.
+fetch_if_missing() {
+  [ -s "$1" ] && return 0
+  if ! wget -q -O "$1.part" "$2"; then
+    echo "[script] failed to download $2" >&2
+    rm -f "$1.part"
+    exit 1
+  fi
+  mv "$1.part" "$1" || exit 1
+}
+
+DATA_DIR="/mnt/cluster_storage/data/dapo"
+TRAIN_FILE="$DATA_DIR/dapo-math-17k.parquet"
+TEST_FILE="$DATA_DIR/aime-2024.parquet"
+mkdir -p "$DATA_DIR"
+fetch_if_missing "$TRAIN_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k/resolve/main/data/dapo-math-17k.parquet?download=true"
+fetch_if_missing "$TEST_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024/resolve/main/data/aime-2024.parquet?download=true"
+
+# Cache model locally for fast load. Worker nodes have local NVMe; copy once from NFS.
+# The copy is staged under a hidden ".partial" name and renamed once complete, so an
+# interrupted or failed copy is redone next run instead of being taken for a full snapshot.
+LOCAL_HF=/home/ray/hf_cache
+NFS_HF=/mnt/cluster_storage/hf_cache
+MODEL_CACHE_NAME="models--Qwen--Qwen3-30B-A3B-Base"
+STAGED_MODEL="$LOCAL_HF/hub/.$MODEL_CACHE_NAME.partial"
+mkdir -p "$LOCAL_HF/hub"
+if [ -d "$NFS_HF/hub/$MODEL_CACHE_NAME" ] && [ ! -d "$LOCAL_HF/hub/$MODEL_CACHE_NAME" ]; then
+  echo "[script] copying Qwen3-30B-A3B-Base from NFS to local"
+  rm -rf "$STAGED_MODEL"
+  if cp -r "$NFS_HF/hub/$MODEL_CACHE_NAME" "$STAGED_MODEL"; then
+    mv "$STAGED_MODEL" "$LOCAL_HF/hub/$MODEL_CACHE_NAME"
+  else
+    # Best effort, as before: warn and keep going without the local copy.
+    echo "[script] WARNING: copying model from NFS failed" >&2
+    rm -rf "$STAGED_MODEL"
+  fi
+fi
+export HF_HOME="$LOCAL_HF"
+
+MODEL_NAME="Qwen/Qwen3-30B-A3B-Base"
+NUM_NODES=2
+NUM_GPUS_PER_NODE=8
+# 2 engines x TP 8 = 16 GPUs, the same count as the 2 x 8 GPUs given to the policy
+# (trainer.placement.colocate_all=true below).
+NUM_INFERENCE_ENGINES=2
+INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=8
+
+# DAPO knobs (mirror the existing 30B script)
+CLIP_RATIO_LOW=0.2
+CLIP_RATIO_HIGH=0.28
+LOSS_REDUCTION="token_mean"
+APPLY_OVERLONG_FILTERING=true
+OVERLONG_BUFFER_LEN=$((1024 * 4))
+OVERLONG_BUFFER_PENALTY_FACTOR=1.0
+USE_KL_LOSS=false
+TEMPERATURE=1.0
+TOP_P=1.0
+EVAL_TOP_P=0.7
+CLIP_RATIO_C=10.0
+MAX_PROMPT_LENGTH=$((1024 * 2))
+MAX_RESPONSE_LENGTH=$((1024 * 8))
+TRAIN_BATCH_SIZE=512
+MINI_BATCH_SIZE=32
+N_SAMPLES_PER_PROMPT=16
+EVAL_N_SAMPLES_PER_PROMPT=32
+ENFORCE_EAGER=true
+LR=1e-6
+# Passed to trainer.policy.megatron_config.* (tensor / pipeline / context / expert / expert-tensor).
+MEGATRON_TP=4
+MEGATRON_PP=1
+MEGATRON_CP=1
+MEGATRON_EP=8
+MEGATRON_ETP=1
+# Passed to trainer.algorithm.off_policy_correction.* (ratio type and its upper clip).
+TIS_IMP_RATIO_CAP=2.0
+TIS_TYPE=token
+
+# Disable Ray's uv-run propagation hook so worker actors use the shared venv
+# directly via py_executable instead of re-running uv install per actor.
+unset RAY_RUNTIME_ENV_HOOK
+
+# Forward SIGTERM/SIGINT to the trainer and wait for it to exit. The trainer is
+# started in the background and waited on below because bash defers a trap until
+# the current foreground command finishes, whereas the `wait` builtin returns as
+# soon as a trapped signal arrives.
+trap 'echo "[script] forwarding SIGTERM to pid=$PID"; kill -TERM "$PID" 2>/dev/null; wait "$PID"' TERM INT
+
+/mnt/cluster_storage/.skyrl-venv/bin/python -m examples.train.algorithms.dapo.main_dapo \
+  data.train_data="['$TRAIN_FILE']" \
+  data.val_data="['$TEST_FILE']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.algorithm.policy_loss_type="dual_clip" \
+  trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \
+  trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \
+  trainer.algorithm.loss_reduction=$LOSS_REDUCTION \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
+  generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \
+  generator.sampling_params.temperature=$TEMPERATURE \
+  generator.sampling_params.top_p=$TOP_P \
+  generator.eval_sampling_params.top_p=$EVAL_TOP_P \
+  generator.eval_sampling_params.temperature=$TEMPERATURE \
+  generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.algorithm.use_kl_loss=$USE_KL_LOSS \
+  trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \
+  trainer.policy.model.path="$MODEL_NAME" \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_nodes=$NUM_NODES \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \
+  generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \
+  generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
+  trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
+  trainer.algorithm.off_policy_correction.tis_ratio_type=$TIS_TYPE \
+  trainer.algorithm.off_policy_correction.token_tis_ratio_clip_high=$TIS_IMP_RATIO_CAP \
+  trainer.epochs=20 \
+  trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \
+  trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \
+  trainer.eval_batch_size=1024 \
+  trainer.eval_before_train=true \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=$TRAIN_BATCH_SIZE \
+  trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \
+  trainer.micro_forward_batch_size_per_gpu=4 \
+  trainer.micro_train_batch_size_per_gpu=2 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=$MAX_PROMPT_LENGTH \
+  generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.policy.optimizer_config.lr=$LR \
+  trainer.policy.optimizer_config.num_warmup_steps=160 \
+  trainer.policy.optimizer_config.weight_decay=0.1 \
+  trainer.policy.optimizer_config.max_grad_norm=1.0 \
+  generator.inference_engine.backend=vllm \
+  generator.inference_engine.run_engines_locally=true \
+  generator.inference_engine.weight_sync_backend=nccl \
+  generator.inference_engine.async_engine=false \
+  generator.batched=true \
+  environment.env_class=aime \
+  generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
+  generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \
+  generator.inference_engine.gpu_memory_utilization=0.7 \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="dapo_aime_30b" \
+  trainer.run_name="dapo_qwen3_30b_a3b_2nodes_validate" \
+  trainer.export_path="/mnt/cluster_storage/exports/qwen3_30b_validate" \
+  trainer.hf_save_interval=300 \
+  trainer.resume_mode=latest \
+  trainer.max_ckpts_to_keep=3 \
+  trainer.ckpt_path="/mnt/cluster_storage/ckpts/qwen3_30b_validate" \
+  "$@" &
+PID=$!
+wait "$PID"