From 9e9ba67c22577b1b989b63f94a50f8c19a75b4cf Mon Sep 17 00:00:00 2001 From: xyuzh Date: Mon, 18 May 2026 12:08:47 -0700 Subject: [PATCH 1/3] [workers] Clean teardown on SIGTERM: drain CUDA streams + destroy process group When a k8s pod is evicted (preemption, scale-down, node drain) the container gets SIGTERM with a 25s grace period before SIGKILL. Without a handler, in-flight NCCL collectives leak communicators and the next run may hit stale process group state. Add a SIGTERM handler inside DistributedTorchRayActor.init_worker_process_group() that: - calls torch.cuda.synchronize() to drain any in-flight CUDA work - calls torch.distributed.destroy_process_group() to release NCCL - exits cleanly with sys.exit(0) Both calls are wrapped in try/except so a partial-state worker still tears down the half that's healthy. The whole sequence is well under the 25s grace window. Each call is guarded (`torch.distributed.is_available() and is_initialized()`) so it does nothing when distributed isn't set up yet. --- skyrl/backends/skyrl_train/workers/worker.py | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/skyrl/backends/skyrl_train/workers/worker.py b/skyrl/backends/skyrl_train/workers/worker.py index 43a73fc28c..4aa02f3842 100644 --- a/skyrl/backends/skyrl_train/workers/worker.py +++ b/skyrl/backends/skyrl_train/workers/worker.py @@ -1,7 +1,9 @@ import asyncio import logging import os +import signal import socket +import sys from collections import defaultdict from ctypes import CDLL, POINTER, Structure, c_char_p, c_int, c_ulong, c_void_p from datetime import timedelta @@ -121,6 +123,28 @@ def init_worker_process_group(self): backend="cpu:gloo,cuda:nccl", timeout=timedelta(seconds=SKYRL_WORKER_NCCL_TIMEOUT_IN_S) ) + # Clean teardown on k8s SIGTERM: drain CUDA streams + release NCCL + # communicators before the 25s grace period elapses. 
+ rank = self._rank + + def _sigterm_cleanup(signum, frame): + logger.warning(f"SIGTERM received in worker rank={rank}, cleaning up...") + + try: + torch.cuda.synchronize() + except Exception as e: + logger.warning(f"cuda.synchronize() failed: {e}") + + try: + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + except Exception as e: + logger.warning(f"destroy_process_group() failed: {e}") + + sys.exit(0) + + signal.signal(signal.SIGTERM, _sigterm_cleanup) + # setup device mesh # TODO: Support TP / PP for additional backends # NOTE (sumanthrh): Device mesh and mesh rank are rank specific attributes. For the current way the strategy is defined, From eb8d6126d034180fc280c1c64d3b672ef76513f1 Mon Sep 17 00:00:00 2001 From: xyuzh Date: Mon, 18 May 2026 12:14:24 -0700 Subject: [PATCH 2/3] Guard cuda.synchronize() with torch.cuda.is_available() Skip the call entirely on CPU-only environments so we don't generate a noisy warning every time a non-CUDA worker is terminated. Only emit a warning if synchronize() actually fails on a CUDA-capable system. 
--- skyrl/backends/skyrl_train/workers/worker.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skyrl/backends/skyrl_train/workers/worker.py b/skyrl/backends/skyrl_train/workers/worker.py index 4aa02f3842..d0ae5af0cc 100644 --- a/skyrl/backends/skyrl_train/workers/worker.py +++ b/skyrl/backends/skyrl_train/workers/worker.py @@ -130,10 +130,11 @@ def init_worker_process_group(self): def _sigterm_cleanup(signum, frame): logger.warning(f"SIGTERM received in worker rank={rank}, cleaning up...") - try: - torch.cuda.synchronize() - except Exception as e: - logger.warning(f"cuda.synchronize() failed: {e}") + if torch.cuda.is_available(): + try: + torch.cuda.synchronize() + except Exception as e: + logger.warning(f"cuda.synchronize() failed: {e}") try: if torch.distributed.is_available() and torch.distributed.is_initialized(): From 983b6a27ba0eba821ce7e9b153ca2d5363a53e35 Mon Sep 17 00:00:00 2001 From: xyuzh Date: Mon, 18 May 2026 12:27:06 -0700 Subject: [PATCH 3/3] [examples][infra] Anyscale 2-node Qwen3 launch flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an Anyscale-friendly launch path for Qwen3-30B-A3B and Qwen3-235B-A22B on 2 H100 nodes. The key piece is a one-shot venv build on shared NFS that all Ray actors share via py_executable — without it every actor independently ran uv install of skyrl[megatron], causing NCCL rendezvous to time out before all 16 actors registered. Built on top of the SIGTERM teardown PR (#1688) so preemption / scale-down doesn't leak NCCL communicators. 
--- .../megatron/anyscale_qwen3_235b_2nodes.yaml | 135 ++++++++++++++ .../megatron/anyscale_qwen3_30b_2nodes.yaml | 72 ++++++++ .../train/megatron/build_shared_venv.yaml | 55 ++++++ examples/train/megatron/clear_uv_cache.yaml | 20 ++ examples/train/megatron/clear_venv.yaml | 19 ++ examples/train/megatron/download_model.yaml | 36 ++++ examples/train/megatron/dump_diag.yaml | 31 ++++ examples/train/megatron/inspect_cudnn.yaml | 31 ++++ .../run_megatron_qwen3_235b_2nodes.sh | 173 ++++++++++++++++++ .../megatron/run_megatron_qwen3_30b_2nodes.sh | 137 ++++++++++++++ 10 files changed, 709 insertions(+) create mode 100644 examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml create mode 100644 examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml create mode 100644 examples/train/megatron/build_shared_venv.yaml create mode 100644 examples/train/megatron/clear_uv_cache.yaml create mode 100644 examples/train/megatron/clear_venv.yaml create mode 100644 examples/train/megatron/download_model.yaml create mode 100644 examples/train/megatron/dump_diag.yaml create mode 100644 examples/train/megatron/inspect_cudnn.yaml create mode 100644 examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh create mode 100644 examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh diff --git a/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml b/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml new file mode 100644 index 0000000000..bf39a62a0d --- /dev/null +++ b/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml @@ -0,0 +1,135 @@ +name: qwen3-235b-a22b-rl-training +entrypoint: | + set -e + + # === DIAGNOSTICS: persistent log file survives head pod restarts === + DIAG_LOG=/mnt/cluster_storage/head_diag.log + RUN_TAG="run-$(date -u +%Y%m%dT%H%M%SZ)-pid$$" + echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG" + uname -a 2>&1 | tee -a "$DIAG_LOG" + cat /etc/os-release 2>/dev/null | tee -a "$DIAG_LOG" || true + echo "--- env (filtered) ---" | tee 
-a "$DIAG_LOG" + env | grep -E "^(POD|NODE|HOSTNAME|KUEUE|RAY|ANYSCALE|HEAD)" 2>&1 | tee -a "$DIAG_LOG" || true + echo "--- container resource limits (cgroup v2) ---" | tee -a "$DIAG_LOG" + cat /sys/fs/cgroup/memory.max 2>/dev/null | tee -a "$DIAG_LOG" || echo "(memory.max not readable)" | tee -a "$DIAG_LOG" + cat /sys/fs/cgroup/memory.current 2>/dev/null | tee -a "$DIAG_LOG" || true + cat /sys/fs/cgroup/cpu.max 2>/dev/null | tee -a "$DIAG_LOG" || true + echo "--- previous boot dmesg (oom_killer if present) ---" | tee -a "$DIAG_LOG" + dmesg 2>/dev/null | tail -200 | tee -a "$DIAG_LOG" || echo "(dmesg not accessible)" | tee -a "$DIAG_LOG" + echo "--- initial free / df ---" | tee -a "$DIAG_LOG" + free -h 2>&1 | tee -a "$DIAG_LOG" + df -h 2>&1 | tee -a "$DIAG_LOG" + + # Trap signals so we know WHY we're exiting (k8s preemption sends SIGTERM) + trap 'rc=$?; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG"; free -h 2>&1 | tee -a "$DIAG_LOG"; ps -eo pid,ppid,rss,vsz,pcpu,pmem,etime,comm --sort=-rss 2>&1 | head -20 | tee -a "$DIAG_LOG"; exit $rc' EXIT + trap 'echo "[$RUN_TAG] SIGTERM received at $(date -u)" | tee -a "$DIAG_LOG"; free -h 2>&1 | tee -a "$DIAG_LOG"; ps -eo pid,ppid,rss,vsz,pcpu,pmem,etime,comm --sort=-rss 2>&1 | head -20 | tee -a "$DIAG_LOG"' TERM + trap 'echo "[$RUN_TAG] SIGINT received at $(date -u)" | tee -a "$DIAG_LOG"' INT + trap 'echo "[$RUN_TAG] SIGHUP received at $(date -u)" | tee -a "$DIAG_LOG"' HUP + + # Background monitor: every 30s dump mem/cpu/disk to DIAG_LOG + ( + while true; do + ts=$(date -u +%H:%M:%S) + mem=$(free -m 2>/dev/null | awk '/^Mem:/{printf "used=%dMi total=%dMi avail=%dMi", $3, $2, $7}') + cgmem=$(cat /sys/fs/cgroup/memory.current 2>/dev/null) + cgmax=$(cat /sys/fs/cgroup/memory.max 2>/dev/null) + load=$(cat /proc/loadavg 2>/dev/null | awk '{print $1,$2,$3}') + diskcs=$(df -h /mnt/cluster_storage 2>/dev/null | tail -1 | awk '{print "cs="$5"("$3"/"$2")"}') + diskroot=$(df -h / 2>/dev/null | tail -1 | awk 
'{print "/="$5"("$3"/"$2")"}') + echo "[$RUN_TAG] $ts $mem cg_used=$cgmem cg_max=$cgmax load=$load $diskcs $diskroot" >> "$DIAG_LOG" + sleep 30 + done + ) & + MONITOR_PID=$! + echo "[$RUN_TAG] background monitor pid=$MONITOR_PID" | tee -a "$DIAG_LOG" + + # === ACTUAL JOB === + # Install nvidia headers to shared cluster storage so all worker nodes can find them + # via CPATH/LD_LIBRARY_PATH below. Only the head runs this; workers mount the same FS. + NV=/mnt/cluster_storage/nv_pkg + if [ ! -f "$NV/nvidia/cudnn/include/cudnn.h" ]; then + echo "[bootstrap] installing nvidia headers to $NV" | tee -a "$DIAG_LOG" + /home/ray/anaconda3/bin/python3 -m pip install -q --target "$NV" \ + nvidia-cudnn-cu12 nvidia-nccl-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 \ + nvidia-cusolver-cu12 nvidia-curand-cu12 nvidia-cufft-cu12 nvidia-cuda-runtime-cu12 \ + nvidia-cuda-nvrtc-cu12 nvidia-cuda-cupti-cu12 nvidia-nvjitlink-cu12 nvidia-nvtx-cu12 \ + nvidia-cudnn-frontend + fi + ls -la "$NV/nvidia/cudnn/include/cudnn.h" + bash examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . +max_retries: 0 + +# Use the shared pre-built venv on NFS for all Ray worker actors. +py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python" + +# cuDNN/NCCL/cublas paths point to our pre-installed nvidia packages on shared NFS +# (set up by the head entrypoint preamble). /opt/cudnn doesn't exist in this image. +env_vars: + # Disable Ray's auto "uv run" propagation. Don't set RAY_RUNTIME_ENV_HOOK + # to empty here — unset it in the entrypoint shell instead (Ray's load_class + # crashes on empty string). 
+ RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0" + CUDA_HOME: "/usr/local/cuda" + CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include" + C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include" + CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include" + LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib" + LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib" + CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn" + # Workers compile transformer-engine-torch from source on first start; that takes + # 5-10 minutes which exceeds Ray's default PG timeout. Give it 30 minutes. + SKYRL_RAY_PG_TIMEOUT_IN_S: "1800" + # 235B model on TP=16 needs much longer than the default 600s to become healthy. + SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600" + # Share uv's build/wheel cache across all nodes and runs via NFS. After TE + # builds once globally, every future cluster start pulls the cached wheel + # instead of rebuilding (cuts launch time from ~25 min cold to ~5 min warm). + UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache" + UV_LINK_MODE: "copy" + # ninja parallelism — 4 jobs ≈ 12 GiB peak (fits 32 GiB head; trivial for workers). + MAX_JOBS: "4" + NINJA_JOBS: "4" + CMAKE_BUILD_PARALLEL_LEVEL: "4" + # Ray's 60s worker-register timeout otherwise fires repeatedly during the + # first-time TE build, producing log noise + spurious worker process spawns. + RAY_worker_register_timeout_seconds: "1800" + # Critical: RAY_OVERRIDE_JOB_RUNTIME_ENV=1 replaces SkyRL's prepare_runtime_environment + # env vars with ours. Re-add all the ones SkyRL would have set (utils.py:615-647) + # so vLLM/megatron behave correctly. 
+ RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0" + CUDA_DEVICE_MAX_CONNECTIONS: "1" + NVTE_FUSED_ATTN: "0" + VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true" + VLLM_ALLOW_INSECURE_SERIALIZATION: "1" + VLLM_DISABLE_COMPILE_CACHE: "1" + VLLM_USE_V1: "1" + VLLM_ENABLE_V1_MULTIPROCESSING: "0" + _SKYRL_USE_NEW_INFERENCE: "1" + # Cache HF downloads in shared FS so model doesn't redownload on restarts / per-worker. + HF_HOME: "/mnt/cluster_storage/hf_cache" + HF_HUB_ENABLE_HF_TRANSFER: "1" + # Allow YAML env_vars to be merged with SkyRL's runtime env instead of conflicting. + RAY_OVERRIDE_JOB_RUNTIME_ENV: "1" + +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 8 + memory: 32Gi + worker_nodes: + - required_resources: + CPU: 184 + memory: 1920Gi + GPU: 8 + required_labels: + ray.io/accelerator-type: H100 + min_nodes: 2 + max_nodes: 2 diff --git a/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml b/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml new file mode 100644 index 0000000000..9a1df4222f --- /dev/null +++ b/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml @@ -0,0 +1,72 @@ +name: qwen3-30b-validate +entrypoint: | + set -e + DIAG_LOG=/mnt/cluster_storage/head_diag.log + RUN_TAG="run-30b-$(date -u +%Y%m%dT%H%M%SZ)-pid$$" + echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG" + trap 'rc=$?; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG"; exit $rc' EXIT + bash examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . +max_retries: 0 + +# Use the shared pre-built venv on NFS for all Ray worker actors. +# This eliminates per-actor `uv run` which was causing 16 redundant installs. 
+py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python" + +env_vars: + # Disable Ray's auto "uv run" propagation — we are using a pre-built venv directly. + # NOTE: do NOT set RAY_RUNTIME_ENV_HOOK to empty string; Ray's load_class() + # treats that as "load class named ''" and crashes. We unset it inside the + # entrypoint script instead. + RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0" + UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache" + UV_LINK_MODE: "copy" + MAX_JOBS: "4" + NINJA_JOBS: "4" + CMAKE_BUILD_PARALLEL_LEVEL: "4" + RAY_worker_register_timeout_seconds: "1800" + SKYRL_RAY_PG_TIMEOUT_IN_S: "1800" + SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600" + HF_HOME: "/mnt/cluster_storage/hf_cache" + HF_HUB_ENABLE_HF_TRANSFER: "1" + RAY_OVERRIDE_JOB_RUNTIME_ENV: "1" + RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0" + CUDA_DEVICE_MAX_CONNECTIONS: "1" + NVTE_FUSED_ATTN: "0" + VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true" + VLLM_ALLOW_INSECURE_SERIALIZATION: "1" + VLLM_DISABLE_COMPILE_CACHE: "1" + VLLM_USE_V1: "1" + VLLM_ENABLE_V1_MULTIPROCESSING: "0" + _SKYRL_USE_NEW_INFERENCE: "1" + CUDA_HOME: "/usr/local/cuda" + # Point to our pre-installed cuDNN/NCCL/cublas etc. on shared NFS. + # /opt/cudnn doesn't exist in this image (Dockerfile.megatron paths don't apply). 
+ CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include" + C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include" + CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include" + LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib" + LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib" + CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn" + +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 8 + memory: 32Gi + worker_nodes: + - required_resources: + CPU: 184 + memory: 1920Gi + GPU: 8 + required_labels: + ray.io/accelerator-type: H100 + min_nodes: 2 + max_nodes: 2 diff --git a/examples/train/megatron/build_shared_venv.yaml b/examples/train/megatron/build_shared_venv.yaml new file mode 100644 index 0000000000..82ba61eb56 --- /dev/null +++ b/examples/train/megatron/build_shared_venv.yaml @@ -0,0 +1,55 @@ +name: build-skyrl-shared-venv +entrypoint: | + set -e + VENV=/mnt/cluster_storage/.skyrl-venv + STAMP=$VENV/.built-ok + if [ -f "$STAMP" ]; then + echo "[setup] venv already built at $VENV — verifying imports" + "$VENV/bin/python" -c "import torch, megatron.core, transformer_engine.pytorch, vllm, skyrl_gym, skyrl.train; print('OK')" + echo "[setup] no rebuild needed." + exit 0 + fi + echo "[setup] building venv at $VENV ($(date -u))" + rm -rf "$VENV" + # We need pyproject.toml + skyrl + skyrl-gym sources to run uv sync. The Anyscale + # working_dir is uploaded into the runtime_resources dir; run uv from there (the cwd).
+ WD="$(pwd)" + echo "[setup] working_dir = $WD" + cd "$WD" + uv venv --python 3.12 "$VENV" + # --no-editable so the workspace packages (skyrl, skyrl-gym) are installed as + # regular packages inside the venv; otherwise their source path lives in this + # ephemeral pod and the venv breaks when training runs elsewhere. + UV_PROJECT_ENVIRONMENT="$VENV" uv sync --extra megatron --no-editable + "$VENV/bin/python" -c "import torch, megatron.core, transformer_engine.pytorch, vllm, skyrl_gym, skyrl.train; print('IMPORTS OK')" + touch "$STAMP" + echo "[setup] DONE at $(date -u). venv at $VENV" +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . +max_retries: 0 + +env_vars: + UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache" + UV_LINK_MODE: "copy" + MAX_JOBS: "8" + NINJA_JOBS: "8" + CMAKE_BUILD_PARALLEL_LEVEL: "8" + CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include" + C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include" + CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include" + LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib" + LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib" + CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn" + CUDA_HOME: "/usr/local/cuda" + +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 8 + memory: 32Gi diff --git a/examples/train/megatron/clear_uv_cache.yaml b/examples/train/megatron/clear_uv_cache.yaml new file mode 100644 index 0000000000..a36af8245d --- /dev/null +++ 
b/examples/train/megatron/clear_uv_cache.yaml @@ -0,0 +1,20 @@ +name: clear-uv-cache +entrypoint: | + echo "Before:"; du -sh /mnt/cluster_storage/.uv_cache 2>/dev/null + rm -rf /mnt/cluster_storage/.uv_cache 2>&1 + echo "Done." +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . +max_retries: 0 + +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 2 + memory: 4Gi diff --git a/examples/train/megatron/clear_venv.yaml b/examples/train/megatron/clear_venv.yaml new file mode 100644 index 0000000000..948bff34b3 --- /dev/null +++ b/examples/train/megatron/clear_venv.yaml @@ -0,0 +1,19 @@ +name: clear-skyrl-venv +entrypoint: | + echo "Before:"; du -sh /mnt/cluster_storage/.skyrl-venv 2>/dev/null + rm -rf /mnt/cluster_storage/.skyrl-venv 2>&1 + echo "Done." +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . 
+max_retries: 0 +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 2 + memory: 4Gi diff --git a/examples/train/megatron/download_model.yaml b/examples/train/megatron/download_model.yaml new file mode 100644 index 0000000000..4f993adcd5 --- /dev/null +++ b/examples/train/megatron/download_model.yaml @@ -0,0 +1,36 @@ +name: qwen-model-prefetch +entrypoint: | + set -e + export HF_HOME=/mnt/cluster_storage/hf_cache + export HF_HUB_ENABLE_HF_TRANSFER=1 + /home/ray/anaconda3/bin/python3 -m pip install -q --upgrade huggingface_hub hf_transfer + /home/ray/anaconda3/bin/python3 - <<'PY' + from huggingface_hub import snapshot_download + import time + t0 = time.time() + path = snapshot_download( + repo_id="Qwen/Qwen3-235B-A22B-Instruct-2507", + cache_dir="/mnt/cluster_storage/hf_cache/hub", + max_workers=8, + allow_patterns=["*.json", "*.txt", "*.safetensors", "tokenizer*"], + ) + print(f"\n[prefetch] Downloaded to {path} in {time.time()-t0:.1f}s") + PY + echo "[prefetch] DONE" + ls -lh /mnt/cluster_storage/hf_cache/hub/models--Qwen--Qwen3-235B-A22B-Instruct-2507/snapshots/*/ | head -5 + du -sh /mnt/cluster_storage/hf_cache/hub/models--Qwen--Qwen3-235B-A22B-Instruct-2507/ +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . 
+max_retries: 0 + +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 4 + memory: 16Gi diff --git a/examples/train/megatron/dump_diag.yaml b/examples/train/megatron/dump_diag.yaml new file mode 100644 index 0000000000..fcca7e5a45 --- /dev/null +++ b/examples/train/megatron/dump_diag.yaml @@ -0,0 +1,31 @@ +name: dump-head-diag +entrypoint: | + set -x + echo "===== cluster_storage layout =====" + ls -la /mnt/cluster_storage/ 2>&1 + echo "===== hf_cache layout =====" + ls -la /mnt/cluster_storage/hf_cache/ 2>&1 || echo "(no hf_cache)" + echo "===== model directories =====" + find /mnt/cluster_storage/hf_cache/ -maxdepth 4 -type d 2>/dev/null | head -50 + echo "===== safetensors files (sizes) =====" + find /mnt/cluster_storage/hf_cache/ -name "*.safetensors" -o -name "*.safetensors.tmp*" 2>/dev/null | xargs -I{} ls -lh {} 2>/dev/null | head -100 + echo "===== uv cache size =====" + du -sh /mnt/cluster_storage/.uv_cache 2>/dev/null + ls /mnt/cluster_storage/.uv_cache/ 2>&1 | head -20 + echo "===== tail of head_diag.log =====" + tail -30 /mnt/cluster_storage/head_diag.log 2>&1 || echo "(no diag log)" +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . 
+max_retries: 0 + +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 2 + memory: 4Gi diff --git a/examples/train/megatron/inspect_cudnn.yaml b/examples/train/megatron/inspect_cudnn.yaml new file mode 100644 index 0000000000..3b09c5bb8f --- /dev/null +++ b/examples/train/megatron/inspect_cudnn.yaml @@ -0,0 +1,31 @@ +name: inspect-cudnn +entrypoint: | + echo "===== /opt/cudnn (symlink) =====" + ls -la /opt/cudnn 2>&1 + echo "===== /opt/cudnn target contents =====" + ls -la /opt/cudnn/ 2>&1 + echo "===== /opt/cudnn/include =====" + ls -la /opt/cudnn/include/ 2>&1 | head -20 + echo "===== find cudnn.h =====" + find / -name "cudnn.h" 2>/dev/null | head -20 + echo "===== find libcudnn =====" + find / -name "libcudnn*" 2>/dev/null | head -20 + echo "===== env CPATH / CUDNN_PATH =====" + echo "CPATH=$CPATH" + echo "CUDNN_PATH=$CUDNN_PATH" + echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8 +cloud: rkn-gpu-cloud +ray_version: "2.51.1" +working_dir: . +max_retries: 0 + +compute_config: + advanced_instance_config: + metadata: + labels: + kueue.x-k8s.io/queue-name: default-queue + head_node: + required_resources: + CPU: 2 + memory: 4Gi diff --git a/examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh b/examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh new file mode 100644 index 0000000000..01f2038fe5 --- /dev/null +++ b/examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh @@ -0,0 +1,173 @@ +set -x + +# Colocated DAPO+LoRA training+generation for Qwen3-235B-A22B on 2 nodes of 8xH100s. +# Adapted from run_megatron_dapo_qwen3_235b_a22b_lora.sh (4-node) for 16-GPU setups. +# Uses FP8 training (TransformerEngine) and FP8 vLLM inference to reduce VRAM pressure. 
+ +# bash examples/train/algorithms/dapo/prepare_dapo_data.sh +# bash examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh + +LOGGER="console" # set to "wandb" with WANDB_API_KEY env var to enable wandb + +# Use shared cluster storage so workers can read the data files too. +DATA_DIR="/mnt/cluster_storage/data/dapo" +TRAIN_FILE="$DATA_DIR/dapo-math-17k.parquet" +TEST_FILE="$DATA_DIR/aime-2024.parquet" + +mkdir -p "$DATA_DIR" +[ -f "$TRAIN_FILE" ] || wget -q -O "$TRAIN_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k/resolve/main/data/dapo-math-17k.parquet?download=true" +[ -f "$TEST_FILE" ] || wget -q -O "$TEST_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024/resolve/main/data/aime-2024.parquet?download=true" +# download Qwen/Qwen3-235B-A22B-Instruct-2507 from huggingface +# `pip install huggingface_hub hf_transfer` +# `HF_HUB_ENABLE_HF_TRANSFER=1 hf download Qwen/Qwen3-235B-A22B-Instruct-2507 --local-dir ~/qwen235b` +MODEL_NAME="Qwen/Qwen3-235B-A22B-Instruct-2507" + +NUM_NODES=2 +NUM_GPUS_PER_NODE=8 + +### Megatron configuration +# Qwen3-235B-A22B uses GQA with 4 groups, so max TP=4 +MEGATRON_TP=4 +# PP=2 (halved from 4-node config) keeps DP=2: TP x PP = 8, 16/8 = 2 DP +MEGATRON_PP=2 +MEGATRON_CP=1 +# EP=2 (EP must divide DP; DP=2 so EP<=2) +MEGATRON_EP=2 +MEGATRON_ETP=1 +# Qwen3-235B-A22B has 94 blocks; with PP=2, each stage holds 47 blocks +MEGATRON_LAST_PIPELINE_STAGE_LAYER=47 +FLASH_ATTN=true +# Optimizer offloading is essential for fitting 235B on 16 GPUs +OPTIMIZER_OFFLOAD=true +OPTIMIZER_OFFLOAD_FRACTION=1.0 + +### Inference engine configuration +INFERENCE_BACKEND="vllm" +# 2 engines × TP=8 (each engine on a single node) avoids cross-node NCCL +# in vLLM init, which was hanging silently. 
+NUM_INFERENCE_ENGINES=2 +INFERENCE_ENGINE_TP=8 +# Lowered from 12000 to reduce KV cache VRAM on 16 GPUs +INFERENCE_ENGINE_MAX_MODEL_LEN=8192 + +### LoRA configuration +LORA_RANK=128 +LORA_ALPHA=128 + +### DAPO parameters +CLIP_RATIO_LOW=0.2 +CLIP_RATIO_HIGH=0.28 +LOSS_REDUCTION="token_mean" +APPLY_OVERLONG_FILTERING=true +OVERLONG_BUFFER_LEN=$((1024 * 4)) +OVERLONG_BUFFER_PENALTY_FACTOR=1.0 +USE_KL_LOSS=false +TEMPERATURE=1.0 +TOP_P=1.0 +EVAL_TOP_P=0.7 +CLIP_RATIO_C=10.0 +MAX_PROMPT_LENGTH=$((1024 * 2)) +MAX_RESPONSE_LENGTH=$((1024 * 8)) + +### Batch sizes (conservative for 16-GPU memory budget) +TRAIN_BATCH_SIZE=64 # halved from 128 +MINI_BATCH_SIZE=16 # halved from 32 +N_SAMPLES_PER_PROMPT=16 +EVAL_N_SAMPLES_PER_PROMPT=32 +ENFORCE_EAGER=true # skip CUDA-graph capture — vLLM 235B+FP8 was hanging silently during capture +LR=1e-5 + +### Rollout correction +TIS_RATIO_TYPE="token" +TIS_IMP_RATIO_CAP=2.0 + +# Disable Ray's uv-run propagation hook so worker actors use the shared venv +# directly via py_executable instead of re-running uv install per actor. +unset RAY_RUNTIME_ENV_HOOK + +# Forward SIGTERM/SIGINT to the python child so its in-process signal handler can +# call dist.destroy_process_group() + cuda.synchronize() before k8s sends SIGKILL. +trap 'echo "[script] forwarding SIGTERM to pid=$PID"; kill -TERM "$PID" 2>/dev/null; wait "$PID"' TERM INT + +# Launch the driver with the shared venv's interpreter (same path as py_executable +# in the Anyscale job YAML). MAX_JOBS / NINJA_JOBS are set to 4 in that YAML's env_vars.
+/mnt/cluster_storage/.skyrl-venv/bin/python -m examples.train.algorithms.dapo.main_dapo \ + data.train_data="['$TRAIN_FILE']" \ + data.val_data="['$TEST_FILE']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type="dual_clip" \ + trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \ + trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + generator.sampling_params.temperature=$TEMPERATURE \ + generator.sampling_params.top_p=$TOP_P \ + generator.eval_sampling_params.top_p=$EVAL_TOP_P \ + generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.policy.model.path=$MODEL_NAME \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TP \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.policy.model.lora.rank=$LORA_RANK \ + trainer.policy.model.lora.alpha=$LORA_ALPHA \ + trainer.algorithm.off_policy_correction.tis_ratio_type=$TIS_RATIO_TYPE \ + trainer.algorithm.off_policy_correction.token_tis_ratio_clip_high=$TIS_IMP_RATIO_CAP \ + trainer.policy.megatron_config.optimizer_config_kwargs.overlap_cpu_optimizer_d2h_h2d=$OPTIMIZER_OFFLOAD \ + 
trainer.policy.megatron_config.optimizer_config_kwargs.use_precision_aware_optimizer=$OPTIMIZER_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \ + trainer.policy.megatron_config.transformer_config_kwargs.num_layers_in_last_pipeline_stage=$MEGATRON_LAST_PIPELINE_STAGE_LAYER \ + trainer.policy.megatron_config.transformer_config_kwargs.fp8=e4m3 \ + trainer.policy.megatron_config.transformer_config_kwargs.fp8_margin=0 \ + trainer.policy.megatron_config.transformer_config_kwargs.fp8_amax_history_len=1024 \ + trainer.policy.megatron_config.transformer_config_kwargs.fp8_amax_compute_algo=max \ + generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \ + generator.inference_engine.engine_init_kwargs.quantization=fp8 \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \ + trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \ + trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \ + trainer.use_sample_packing=true \ + trainer.flash_attn=$FLASH_ATTN \ + generator.inference_engine.backend=$INFERENCE_BACKEND \ + generator.inference_engine.run_engines_locally=true \ + generator.inference_engine.weight_sync_backend=nccl \ + generator.inference_engine.async_engine=true \ + generator.batched=true \ + environment.env_class=aime \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \ + generator.inference_engine.gpu_memory_utilization=0.7 \ + trainer.epochs=20 \ + trainer.eval_batch_size=512 \ + trainer.eval_before_train=true \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=$TRAIN_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=1 \ + 
trainer.micro_train_batch_size_per_gpu=1 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=$MAX_PROMPT_LENGTH \
+  generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.policy.optimizer_config.lr=$LR \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="dapo_aime" \
+  trainer.run_name="dapo_qwen3_235b_a22b_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_ep${MEGATRON_EP}_lora_rank${LORA_RANK}_fp8_2nodes" \
+  trainer.export_path="$HOME/exports/dapo_qwen3_235b_a22b_fp8_2nodes" \
+  trainer.hf_save_interval=300 \
+  trainer.resume_mode=latest \
+  trainer.max_ckpts_to_keep=3 \
+  trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_235b_a22b_fp8_2nodes" \
+  "$@" &
+PID=$!
+wait "$PID"
diff --git a/examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh b/examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh
new file mode 100644
index 0000000000..1ced1f410a
--- /dev/null
+++ b/examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh
@@ -0,0 +1,169 @@
+set -x
+
+# Validation run: Qwen3-30B-A3B on 2 nodes × 8 H100s — known-good config.
+# Cloned from run_megatron_dapo_qwen3_30b_a3b.sh with:
+#   - data + ckpts on /mnt/cluster_storage
+#   - LOGGER=console (no wandb needed)
+#   - download data if missing
+#   - uv extra=megatron, no --isolated, NFS uv cache shared with the 235B run
+
+LOGGER="console"
+
+DATA_DIR="/mnt/cluster_storage/data/dapo"
+TRAIN_FILE="$DATA_DIR/dapo-math-17k.parquet"
+TEST_FILE="$DATA_DIR/aime-2024.parquet"
+mkdir -p "$DATA_DIR"
+
+# fetch_if_missing DEST URL
+# Download URL to DEST unless DEST already exists. `wget -O` creates its output
+# file before the transfer starts, so a failed download would leave a stub that
+# the existence check accepts on every later run. Fetch into a temp file and
+# rename it into place only after a successful, non-empty download.
+fetch_if_missing() {
+  local dest="$1"
+  local url="$2"
+  local tmp="$dest.tmp.$$"
+  [ -f "$dest" ] && return 0
+  if wget -q -O "$tmp" "$url" && [ -s "$tmp" ]; then
+    mv "$tmp" "$dest"
+  else
+    rm -f "$tmp"
+    echo "[script] failed to download $url" >&2
+    return 1
+  fi
+}
+fetch_if_missing "$TRAIN_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k/resolve/main/data/dapo-math-17k.parquet?download=true" || exit 1
+fetch_if_missing "$TEST_FILE" "https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024/resolve/main/data/aime-2024.parquet?download=true" || exit 1
+
+# Cache model locally for fast load. Worker nodes have local NVMe; copy once from NFS.
+# NOTE: the copy below runs only on the node that executes this script.
+LOCAL_HF=/home/ray/hf_cache
+NFS_HF=/mnt/cluster_storage/hf_cache
+MODEL_CACHE_DIR="models--Qwen--Qwen3-30B-A3B-Base"
+mkdir -p "$LOCAL_HF/hub"
+if [ -d "$NFS_HF/hub/$MODEL_CACHE_DIR" ] && [ ! -d "$LOCAL_HF/hub/$MODEL_CACHE_DIR" ]; then
+  echo "[script] copying Qwen3-30B-A3B-Base from NFS to local"
+  # Stage the copy under a temp name and rename it into place, so an interrupted
+  # copy cannot leave a partial directory that the -d check treats as complete.
+  STAGE_DIR="$LOCAL_HF/hub/.$MODEL_CACHE_DIR.partial.$$"
+  rm -rf "$STAGE_DIR"
+  if ! { cp -r "$NFS_HF/hub/$MODEL_CACHE_DIR" "$STAGE_DIR" && mv "$STAGE_DIR" "$LOCAL_HF/hub/$MODEL_CACHE_DIR"; }; then
+    rm -rf "$STAGE_DIR"
+    echo "[script] failed to copy model to local cache; continuing without it" >&2
+  fi
+fi
+export HF_HOME="$LOCAL_HF"
+
+MODEL_NAME="Qwen/Qwen3-30B-A3B-Base"
+NUM_NODES=2
+NUM_GPUS_PER_NODE=8
+NUM_INFERENCE_ENGINES=2
+INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=8
+
+# DAPO knobs (mirror the existing 30B script)
+CLIP_RATIO_LOW=0.2
+CLIP_RATIO_HIGH=0.28
+LOSS_REDUCTION="token_mean"
+APPLY_OVERLONG_FILTERING=true
+OVERLONG_BUFFER_LEN=$((1024 * 4))
+OVERLONG_BUFFER_PENALTY_FACTOR=1.0
+USE_KL_LOSS=false
+TEMPERATURE=1.0
+TOP_P=1.0
+EVAL_TOP_P=0.7
+CLIP_RATIO_C=10.0
+MAX_PROMPT_LENGTH=$((1024 * 2))
+MAX_RESPONSE_LENGTH=$((1024 * 8))
+TRAIN_BATCH_SIZE=512
+MINI_BATCH_SIZE=32
+N_SAMPLES_PER_PROMPT=16
+EVAL_N_SAMPLES_PER_PROMPT=32
+ENFORCE_EAGER=true
+LR=1e-6
+MEGATRON_TP=4
+MEGATRON_PP=1
+MEGATRON_CP=1
+MEGATRON_EP=8
+MEGATRON_ETP=1
+TIS_IMP_RATIO_CAP=2.0
+TIS_TYPE=token
+
+# Disable Ray's uv-run propagation hook so worker actors use the shared venv
+# directly via py_executable instead of re-running uv install per actor.
+unset RAY_RUNTIME_ENV_HOOK
+
+# Launch the trainer in the background and block in `wait`: bash runs a trap
+# handler only after the current foreground command finishes, but `wait`
+# returns as soon as a trapped signal arrives, so SIGTERM/SIGINT is forwarded
+# to the trainer promptly.
+trap 'echo "[script] forwarding SIGTERM to pid=$PID"; kill -TERM "$PID" 2>/dev/null; wait "$PID"' TERM INT
+
+/mnt/cluster_storage/.skyrl-venv/bin/python -m examples.train.algorithms.dapo.main_dapo \
+  data.train_data="['$TRAIN_FILE']" \
+  data.val_data="['$TEST_FILE']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.algorithm.policy_loss_type="dual_clip" \
+  trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \
+  trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \
+  trainer.algorithm.loss_reduction=$LOSS_REDUCTION \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
+  generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \
+  generator.sampling_params.temperature=$TEMPERATURE \
+  generator.sampling_params.top_p=$TOP_P \
+  generator.eval_sampling_params.top_p=$EVAL_TOP_P \
+  generator.eval_sampling_params.temperature=$TEMPERATURE \
+  generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.algorithm.use_kl_loss=$USE_KL_LOSS \
+  trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \
+  trainer.policy.model.path="$MODEL_NAME" \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_nodes=$NUM_NODES \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \
+  generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \
+  generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
+  trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
+  trainer.algorithm.off_policy_correction.tis_ratio_type=$TIS_TYPE \
+  trainer.algorithm.off_policy_correction.token_tis_ratio_clip_high=$TIS_IMP_RATIO_CAP \
+  trainer.epochs=20 \
+  trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \
+  trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \
+  trainer.eval_batch_size=1024 \
+  trainer.eval_before_train=true \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=$TRAIN_BATCH_SIZE \
+  trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \
+  trainer.micro_forward_batch_size_per_gpu=4 \
+  trainer.micro_train_batch_size_per_gpu=2 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=$MAX_PROMPT_LENGTH \
+  generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.policy.optimizer_config.lr=$LR \
+  trainer.policy.optimizer_config.num_warmup_steps=160 \
+  trainer.policy.optimizer_config.weight_decay=0.1 \
+  trainer.policy.optimizer_config.max_grad_norm=1.0 \
+  generator.inference_engine.backend=vllm \
+  generator.inference_engine.run_engines_locally=true \
+  generator.inference_engine.weight_sync_backend=nccl \
+  generator.inference_engine.async_engine=false \
+  generator.batched=true \
+  environment.env_class=aime \
+  generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
+  generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \
+  generator.inference_engine.gpu_memory_utilization=0.7 \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="dapo_aime_30b" \
+  trainer.run_name="dapo_qwen3_30b_a3b_2nodes_validate" \
+  trainer.export_path="/mnt/cluster_storage/exports/qwen3_30b_validate" \
+  trainer.hf_save_interval=300 \
+  trainer.resume_mode=latest \
+  trainer.max_ckpts_to_keep=3 \
+  trainer.ckpt_path="/mnt/cluster_storage/ckpts/qwen3_30b_validate" \
+  "$@" &
+PID=$!
+wait "$PID"