Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Anyscale job spec for the 2-node Qwen3-235B-A22B Megatron RL training run.
name: qwen3-235b-a22b-rl-training
entrypoint: |
# Abort the entrypoint on the first command that fails.
set -e

# === DIAGNOSTICS: persistent log file survives head pod restarts ===
# Everything below is echoed to the job output AND appended to $DIAG_LOG.
# DIAG_LOG and RUN_TAG are also used by the traps, monitor and bootstrap steps
# further down the entrypoint.
DIAG_LOG=/mnt/cluster_storage/head_diag.log
RUN_TAG="run-$(date -u +%Y%m%dT%H%M%SZ)-pid$$"

# diag_file PATH [FALLBACK]
#   Log the contents of PATH. If PATH cannot be read, log FALLBACK instead
#   (when given). Always returns 0 so an absent file never aborts the job.
#   The read is checked explicitly: in `cat f | tee … || echo fallback` the
#   pipeline status is tee's, so the fallback would never fire.
diag_file() {
  if _diag_content=$(cat "$1" 2>/dev/null); then
    if [ -n "$_diag_content" ]; then
      printf '%s\n' "$_diag_content" | tee -a "$DIAG_LOG" || true
    fi
  elif [ -n "${2:-}" ]; then
    printf '%s\n' "$2" | tee -a "$DIAG_LOG" || true
  fi
}

echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG"
uname -a 2>&1 | tee -a "$DIAG_LOG"
diag_file /etc/os-release
echo "--- env (filtered) ---" | tee -a "$DIAG_LOG"
# NOTE(review): this writes matching environment variables verbatim to a log on
# shared storage — confirm none of the RAY*/ANYSCALE* variables carry credentials.
env | grep -E "^(POD|NODE|HOSTNAME|KUEUE|RAY|ANYSCALE|HEAD)" 2>&1 | tee -a "$DIAG_LOG" || true
echo "--- container resource limits (cgroup v2) ---" | tee -a "$DIAG_LOG"
diag_file /sys/fs/cgroup/memory.max "(memory.max not readable)"
diag_file /sys/fs/cgroup/memory.current
diag_file /sys/fs/cgroup/cpu.max
echo "--- previous boot dmesg (oom_killer if present) ---" | tee -a "$DIAG_LOG"
# Test dmesg itself (not the tail/tee stages) so the fallback is reachable.
if _diag_dmesg=$(dmesg 2>/dev/null); then
  printf '%s\n' "$_diag_dmesg" | tail -200 | tee -a "$DIAG_LOG" || true
else
  echo "(dmesg not accessible)" | tee -a "$DIAG_LOG"
fi
echo "--- initial free / df ---" | tee -a "$DIAG_LOG"
free -h 2>&1 | tee -a "$DIAG_LOG"
df -h 2>&1 | tee -a "$DIAG_LOG"

# Trap signals so we know WHY we're exiting (k8s preemption sends SIGTERM).
#
# _diag_snapshot: log current memory usage and the 20 largest processes by RSS.
_diag_snapshot() {
  free -h 2>&1 | tee -a "$DIAG_LOG"
  ps -eo pid,ppid,rss,vsz,pcpu,pmem,etime,comm --sort=-rss 2>&1 | head -20 | tee -a "$DIAG_LOG"
}
# EXIT: record the exit code plus a snapshot, then exit with that same code.
# Logging is best-effort (`|| true`): under `set -e` a failed log write inside
# the handler must not replace the real exit status.
trap 'rc=$?; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG" || true; _diag_snapshot || true; exit $rc' EXIT
# TERM/INT/HUP: log the signal, then exit with the conventional 128+signo code.
# A handler that merely returns would let the script carry on with its next
# command after a termination request. The EXIT handler above runs next and
# takes the memory/process snapshot.
trap 'echo "[$RUN_TAG] SIGTERM received at $(date -u)" | tee -a "$DIAG_LOG" || true; exit 143' TERM
trap 'echo "[$RUN_TAG] SIGINT received at $(date -u)" | tee -a "$DIAG_LOG" || true; exit 130' INT
trap 'echo "[$RUN_TAG] SIGHUP received at $(date -u)" | tee -a "$DIAG_LOG" || true; exit 129' HUP

# Background monitor: every 30s append one line of mem/cgroup/load/disk stats to DIAG_LOG.
# Runs in a backgrounded subshell; its PID is recorded in MONITOR_PID.
(
  # `$$` still expands to the entrypoint shell's PID inside a subshell, so the
  # loop ends on its own once the entrypoint is gone instead of lingering as an
  # orphan that keeps writing to the shared log.
  while kill -0 "$$" 2>/dev/null; do
    ts=$(date -u +%H:%M:%S)
    mem=$(free -m 2>/dev/null | awk '/^Mem:/{printf "used=%dMi total=%dMi avail=%dMi", $3, $2, $7}')
    # The subshell inherits `set -e`: an assignment whose command substitution
    # fails (e.g. these files are missing) would silently end the monitor on its
    # first iteration. `|| true` leaves the field empty instead.
    cgmem=$(cat /sys/fs/cgroup/memory.current 2>/dev/null) || true
    cgmax=$(cat /sys/fs/cgroup/memory.max 2>/dev/null) || true
    load=$(awk '{print $1,$2,$3}' /proc/loadavg 2>/dev/null) || true
    diskcs=$(df -h /mnt/cluster_storage 2>/dev/null | tail -1 | awk '{print "cs="$5"("$3"/"$2")"}')
    diskroot=$(df -h / 2>/dev/null | tail -1 | awk '{print "/="$5"("$3"/"$2")"}')
    # Best-effort write: one failed append should not stop later samples.
    echo "[$RUN_TAG] $ts $mem cg_used=$cgmem cg_max=$cgmax load=$load $diskcs $diskroot" >> "$DIAG_LOG" || true
    sleep 30
  done
) &
MONITOR_PID=$!
echo "[$RUN_TAG] background monitor pid=$MONITOR_PID" | tee -a "$DIAG_LOG"

# === ACTUAL JOB ===
# Install nvidia headers to shared cluster storage so all worker nodes can find them
# via CPATH/LD_LIBRARY_PATH below. Only the head runs this; workers mount the same FS.
#
# The packages are installed into a private staging directory next to $NV and
# renamed into place only after pip succeeds, so an interrupted or concurrent
# job never leaves a half-populated $NV that passes the cudnn.h check below.
NV=/mnt/cluster_storage/nv_pkg
NV_SENTINEL="$NV/nvidia/cudnn/include/cudnn.h"
if [ ! -f "$NV_SENTINEL" ]; then
  echo "[bootstrap] installing nvidia headers to $NV" | tee -a "$DIAG_LOG"
  # Same parent directory as $NV so the final mv is a rename, not a copy.
  NV_STAGE="$(mktemp -d "$NV.stage.XXXXXX")"
  /home/ray/anaconda3/bin/python3 -m pip install -q --target "$NV_STAGE" \
    nvidia-cudnn-cu12 nvidia-nccl-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 \
    nvidia-cusolver-cu12 nvidia-curand-cu12 nvidia-cufft-cu12 nvidia-cuda-runtime-cu12 \
    nvidia-cuda-nvrtc-cu12 nvidia-cuda-cupti-cu12 nvidia-nvjitlink-cu12 nvidia-nvtx-cu12 \
    nvidia-cudnn-frontend \
    || { rm -rf -- "$NV_STAGE"; exit 1; }
  # mktemp -d creates the directory with mode 0700; open it up for other readers.
  chmod 755 "$NV_STAGE"
  if [ -f "$NV_SENTINEL" ]; then
    # Another job published a complete tree while we were installing; keep theirs.
    rm -rf -- "$NV_STAGE"
  else
    # Move any incomplete leftover aside, then publish the staged tree.
    if [ -e "$NV" ]; then
      mv -- "$NV" "$NV.stale.$$"
    fi
    # -T: treat $NV as the destination name itself, never as a directory to move into.
    mv -T -- "$NV_STAGE" "$NV"
    rm -rf -- "$NV.stale.$$"
  fi
fi
Comment on lines +50 to +57
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Installing nvidia packages directly to a shared NFS path without locking is prone to race conditions if multiple jobs are launched. A partial installation from one job could cause others to fail or use corrupted headers/libraries.

Consider using a more robust synchronization method or ensuring that this bootstrap step is performed by a single, idempotent setup job (similar to the venv build).

# Show the cuDNN header the bootstrap step should have installed. With `set -e`
# active, a missing header stops the job here, before training starts.
ls -la "$NV/nvidia/cudnn/include/cudnn.h"
# Hand off to the training launcher script; its exit status becomes the job's.
bash examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh
# Container image, target cloud and Ray version for the job; no automatic retries.
image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
cloud: rkn-gpu-cloud
ray_version: "2.51.1"
working_dir: .
max_retries: 0

# Use the shared pre-built venv on NFS for all Ray worker actors.
py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python"

# cuDNN/NCCL/cublas paths point to our pre-installed nvidia packages on shared NFS
# (set up by the head entrypoint preamble). /opt/cudnn doesn't exist in this image.
env_vars:
# Disable Ray's auto "uv run" propagation. Don't set RAY_RUNTIME_ENV_HOOK
# to empty here — unset it in the entrypoint shell instead (Ray's load_class
# crashes on empty string).
# NOTE(review): the entrypoint in this spec does not unset RAY_RUNTIME_ENV_HOOK
# itself — presumably the launched run script does; confirm.
RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0"
CUDA_HOME: "/usr/local/cuda"
# NOTE(review): CPATH and LIBRARY_PATH list cudnn + nccl + cublas, while
# C_INCLUDE_PATH, CPLUS_INCLUDE_PATH and LD_LIBRARY_PATH list only cudnn + nccl —
# confirm the cublas omission is intentional.
CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"
CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
# Workers compile transformer-engine-torch from source on first start; that takes
# 5-10 minutes which exceeds Ray's default PG timeout. Give it 30 minutes.
SKYRL_RAY_PG_TIMEOUT_IN_S: "1800"
# 235B model on TP=16 needs much longer than the default 600s to become healthy.
SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600"
# Share uv's build/wheel cache across all nodes and runs via NFS. After TE
# builds once globally, every future cluster start pulls the cached wheel
# instead of rebuilding (cuts launch time from ~25 min cold to ~5 min warm).
UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
UV_LINK_MODE: "copy"
# ninja parallelism — 4 jobs ≈ 12 GiB peak (fits 32 GiB head; trivial for workers).
MAX_JOBS: "4"
NINJA_JOBS: "4"
CMAKE_BUILD_PARALLEL_LEVEL: "4"
# Ray's 60s worker-register timeout otherwise fires repeatedly during the
# first-time TE build, producing log noise + spurious worker process spawns.
RAY_worker_register_timeout_seconds: "1800"
# Critical: RAY_OVERRIDE_JOB_RUNTIME_ENV=1 replaces SkyRL's prepare_runtime_environment
# env vars with ours. Re-add all the ones SkyRL would have set (utils.py:615-647)
# so vLLM/megatron behave correctly.
RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0"
CUDA_DEVICE_MAX_CONNECTIONS: "1"
NVTE_FUSED_ATTN: "0"
VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true"
VLLM_ALLOW_INSECURE_SERIALIZATION: "1"
VLLM_DISABLE_COMPILE_CACHE: "1"
VLLM_USE_V1: "1"
VLLM_ENABLE_V1_MULTIPROCESSING: "0"
_SKYRL_USE_NEW_INFERENCE: "1"
# Cache HF downloads in shared FS so model doesn't redownload on restarts / per-worker.
HF_HOME: "/mnt/cluster_storage/hf_cache"
HF_HUB_ENABLE_HF_TRANSFER: "1"
# Allow YAML env_vars to be merged with SkyRL's runtime env instead of conflicting.
# NOTE(review): the comment above the RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO group says
# this flag *replaces* SkyRL's env vars, this one says *merged* — confirm which
# applies and align the two comments.
RAY_OVERRIDE_JOB_RUNTIME_ENV: "1"

# Cluster shape: one CPU-only head node plus exactly two 8-GPU H100 workers
# (min_nodes == max_nodes, so no autoscaling). Pods carry the Kueue queue label.
compute_config:
advanced_instance_config:
metadata:
labels:
kueue.x-k8s.io/queue-name: default-queue
head_node:
required_resources:
CPU: 8
memory: 32Gi
worker_nodes:
- required_resources:
CPU: 184
memory: 1920Gi
GPU: 8
required_labels:
ray.io/accelerator-type: H100
min_nodes: 2
max_nodes: 2
72 changes: 72 additions & 0 deletions examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Anyscale job spec: 2-node Megatron validation run for Qwen3-30B.
# Relies on the shared venv, uv cache, HF cache and nv_pkg tree that already
# exist under /mnt/cluster_storage; this entrypoint does not create them.
name: qwen3-30b-validate
entrypoint: |
  set -e
  DIAG_LOG=/mnt/cluster_storage/head_diag.log
  RUN_TAG="run-30b-$(date -u +%Y%m%dT%H%M%SZ)-pid$$"
  echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG"
  # Record the exit code on the way out. The log write is best-effort (`|| true`)
  # so that, under `set -e`, a failed write cannot replace the real exit status.
  trap 'rc=$?; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG" || true; exit $rc' EXIT
  bash examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh
image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
cloud: rkn-gpu-cloud
ray_version: "2.51.1"
working_dir: .
max_retries: 0

# Use the shared pre-built venv on NFS for all Ray worker actors.
# This eliminates per-actor `uv run` which was causing 16 redundant installs.
py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python"

env_vars:
  # Disable Ray's auto "uv run" propagation — we are using a pre-built venv directly.
  # NOTE: do NOT set RAY_RUNTIME_ENV_HOOK to empty string; Ray's load_class()
  # treats that as "load class named ''" and crashes. We unset it inside the
  # entrypoint script instead.
  RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0"
  UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
  UV_LINK_MODE: "copy"
  MAX_JOBS: "4"
  NINJA_JOBS: "4"
  CMAKE_BUILD_PARALLEL_LEVEL: "4"
  RAY_worker_register_timeout_seconds: "1800"
  SKYRL_RAY_PG_TIMEOUT_IN_S: "1800"
  SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600"
  HF_HOME: "/mnt/cluster_storage/hf_cache"
  HF_HUB_ENABLE_HF_TRANSFER: "1"
  RAY_OVERRIDE_JOB_RUNTIME_ENV: "1"
  RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0"
  CUDA_DEVICE_MAX_CONNECTIONS: "1"
  NVTE_FUSED_ATTN: "0"
  VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true"
  VLLM_ALLOW_INSECURE_SERIALIZATION: "1"
  VLLM_DISABLE_COMPILE_CACHE: "1"
  VLLM_USE_V1: "1"
  VLLM_ENABLE_V1_MULTIPROCESSING: "0"
  _SKYRL_USE_NEW_INFERENCE: "1"
  CUDA_HOME: "/usr/local/cuda"
  # Point to our pre-installed cuDNN/NCCL/cublas etc. on shared NFS.
  # /opt/cudnn doesn't exist in this image (Dockerfile.megatron paths don't apply).
  CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
  C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
  CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
  LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
  LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"
  CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"

# One CPU-only head node plus exactly two 8-GPU H100 workers.
compute_config:
  advanced_instance_config:
    metadata:
      labels:
        kueue.x-k8s.io/queue-name: default-queue
  head_node:
    required_resources:
      CPU: 8
      memory: 32Gi
  worker_nodes:
    - required_resources:
        CPU: 184
        memory: 1920Gi
        GPU: 8
      required_labels:
        ray.io/accelerator-type: H100
      min_nodes: 2
      max_nodes: 2
55 changes: 55 additions & 0 deletions examples/train/megatron/build_shared_venv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Anyscale job spec: build the shared SkyRL virtualenv on cluster storage.
name: build-skyrl-shared-venv
entrypoint: |
# Abort on the first failing command.
set -e
# VENV is the shared venv location; STAMP is created only after a successful
# build and marks the venv as complete.
VENV=/mnt/cluster_storage/.skyrl-venv
STAMP=$VENV/.built-ok
# Already built: just confirm the key packages import, then exit successfully.
# If the import check fails, `set -e` ends the job with an error (no rebuild).
if [ -f "$STAMP" ]; then
echo "[setup] venv already built at $VENV — verifying imports"
"$VENV/bin/python" -c "import torch, megatron.core, transformer_engine.pytorch, vllm, skyrl_gym, skyrl.train; print('OK')"
echo "[setup] no rebuild needed."
exit 0
fi
echo "[setup] building venv at $VENV ($(date -u))"
# Start from scratch: remove any previous (unstamped) venv before rebuilding.
# NOTE(review): the venv is deleted and rebuilt in place on shared storage, so
# anything using $VENV during the rebuild sees a missing or partial environment.
rm -rf "$VENV"
Comment on lines +12 to +13
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

There is a potential race condition if multiple build jobs are triggered simultaneously or if a job is retried. The rm -rf "$VENV" followed by a non-atomic uv sync can leave the shared storage in a corrupted state for other processes.

It is safer to build the environment in a temporary directory and then use an atomic mv to place it at the final destination once the build is successful (and the stamp is created).

# We need pyproject.toml + skyrl + skyrl-gym sources to run uv sync. They are
# taken from the job's current working directory, which is echoed for reference.
# (No copy is made here: the `cd` below targets the directory we are already in.)
WD="$(pwd)"
echo "[setup] working_dir = $WD"
cd "$WD"
# Create an empty Python 3.12 venv at the shared location.
uv venv --python 3.12 "$VENV"
# --no-editable so the workspace packages (skyrl, skyrl-gym) are installed as
# regular packages inside the venv; otherwise their source path lives in this
# ephemeral pod and the venv breaks when training runs elsewhere.
# UV_PROJECT_ENVIRONMENT makes uv sync install into $VENV.
UV_PROJECT_ENVIRONMENT="$VENV" uv sync --extra megatron --no-editable
# Smoke-test the imports; under `set -e` a failure stops before the stamp is written.
"$VENV/bin/python" -c "import torch, megatron.core, transformer_engine.pytorch, vllm, skyrl_gym, skyrl.train; print('IMPORTS OK')"
# Mark the venv as complete so later runs skip the rebuild.
touch "$STAMP"
echo "[setup] DONE at $(date -u). venv at $VENV"
# Container image, target cloud and Ray version for the build job; no automatic retries.
image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
cloud: rkn-gpu-cloud
ray_version: "2.51.1"
working_dir: .
max_retries: 0

env_vars:
# Shared uv cache on cluster storage; packages are copied (not linked) into the venv.
UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
UV_LINK_MODE: "copy"
# Build parallelism for native extensions.
# NOTE(review): 8 parallel jobs on a 32Gi head node — confirm peak memory fits.
MAX_JOBS: "8"
NINJA_JOBS: "8"
CMAKE_BUILD_PARALLEL_LEVEL: "8"
# Compiler/linker search paths into the nvidia packages under nv_pkg on shared storage.
# NOTE(review): this job does not install nv_pkg itself — presumably it must
# already exist (created by another job) before this build runs; confirm.
CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"
CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
CUDA_HOME: "/usr/local/cuda"

# Head node only (no worker groups are declared); pods carry the Kueue queue label.
compute_config:
advanced_instance_config:
metadata:
labels:
kueue.x-k8s.io/queue-name: default-queue
head_node:
required_resources:
CPU: 8
memory: 32Gi
20 changes: 20 additions & 0 deletions examples/train/megatron/clear_uv_cache.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Anyscale job spec: delete the shared uv cache on cluster storage.
name: clear-uv-cache
entrypoint: |
  # Fail the job if the delete fails, instead of always printing "Done." and
  # exiting 0 regardless of what rm did.
  set -eu
  TARGET=/mnt/cluster_storage/.uv_cache
  # Size report is informational only; a missing directory is not an error.
  echo "Before:"; du -sh "$TARGET" 2>/dev/null || echo "($TARGET not present)"
  rm -rf -- "$TARGET"
  echo "Done."
image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
cloud: rkn-gpu-cloud
ray_version: "2.51.1"
working_dir: .
max_retries: 0

# Small head-only cluster; pods carry the Kueue queue label.
compute_config:
  advanced_instance_config:
    metadata:
      labels:
        kueue.x-k8s.io/queue-name: default-queue
  head_node:
    required_resources:
      CPU: 2
      memory: 4Gi
19 changes: 19 additions & 0 deletions examples/train/megatron/clear_venv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Anyscale job spec: delete the shared SkyRL virtualenv on cluster storage.
name: clear-skyrl-venv
entrypoint: |
  # Fail the job if the delete fails, instead of always printing "Done." and
  # exiting 0 regardless of what rm did.
  set -eu
  TARGET=/mnt/cluster_storage/.skyrl-venv
  # Size report is informational only; a missing directory is not an error.
  echo "Before:"; du -sh "$TARGET" 2>/dev/null || echo "($TARGET not present)"
  rm -rf -- "$TARGET"
  echo "Done."
image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
cloud: rkn-gpu-cloud
ray_version: "2.51.1"
working_dir: .
max_retries: 0
# Small head-only cluster; pods carry the Kueue queue label.
compute_config:
  advanced_instance_config:
    metadata:
      labels:
        kueue.x-k8s.io/queue-name: default-queue
  head_node:
    required_resources:
      CPU: 2
      memory: 4Gi
36 changes: 36 additions & 0 deletions examples/train/megatron/download_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Anyscale job spec: pre-download a Hugging Face model snapshot into the shared
# HF cache on cluster storage.
name: qwen-model-prefetch
entrypoint: |
  set -e
  export HF_HOME=/mnt/cluster_storage/hf_cache
  export HF_HUB_ENABLE_HF_TRANSFER=1
  # Repo to prefetch. Defaults to the 235B model; set MODEL_ID in the job's
  # environment to warm the cache for a different repo.
  export MODEL_ID="${MODEL_ID:-Qwen/Qwen3-235B-A22B-Instruct-2507}"
  /home/ray/anaconda3/bin/python3 -m pip install -q --upgrade huggingface_hub hf_transfer
  # Quoted heredoc delimiter: the Python below is passed through unexpanded and
  # reads its settings from the exported environment variables.
  /home/ray/anaconda3/bin/python3 - <<'PY'
  import os
  import time

  from huggingface_hub import snapshot_download

  t0 = time.time()
  path = snapshot_download(
      repo_id=os.environ["MODEL_ID"],
      cache_dir=os.path.join(os.environ["HF_HOME"], "hub"),
      max_workers=8,
      # Configs, tokenizer files and safetensors weights only.
      allow_patterns=["*.json", "*.txt", "*.safetensors", "tokenizer*"],
  )
  print(f"\n[prefetch] Downloaded to {path} in {time.time()-t0:.1f}s")
  PY
  echo "[prefetch] DONE"
  # The hub cache stores a repo under models--<org>--<name>.
  MODEL_DIR="$HF_HOME/hub/models--$(printf '%s' "$MODEL_ID" | sed 's|/|--|g')"
  ls -lh "$MODEL_DIR"/snapshots/*/ | head -5
  du -sh "$MODEL_DIR/"
image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
cloud: rkn-gpu-cloud
ray_version: "2.51.1"
working_dir: .
max_retries: 0

# Head-only cluster; pods carry the Kueue queue label.
compute_config:
  advanced_instance_config:
    metadata:
      labels:
        kueue.x-k8s.io/queue-name: default-queue
  head_node:
    required_resources:
      CPU: 4
      memory: 16Gi
31 changes: 31 additions & 0 deletions examples/train/megatron/dump_diag.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Anyscale job spec: read-only dump of what is on shared cluster storage
# (layout, HF cache, uv cache) plus the tail of the head diagnostics log.
name: dump-head-diag
entrypoint: |
  # Trace each command. `set -e` is deliberately not used: every section is
  # attempted even if an earlier one fails.
  set -x
  echo "===== cluster_storage layout ====="
  ls -la /mnt/cluster_storage/ 2>&1
  echo "===== hf_cache layout ====="
  ls -la /mnt/cluster_storage/hf_cache/ 2>&1 || echo "(no hf_cache)"
  echo "===== model directories ====="
  find /mnt/cluster_storage/hf_cache/ -maxdepth 4 -type d 2>/dev/null | head -50
  echo "===== safetensors files (sizes) ====="
  # -exec … + batches the files into as few `ls` runs as possible and passes
  # names as arguments, so unusual file names are handled correctly.
  find /mnt/cluster_storage/hf_cache/ \( -name "*.safetensors" -o -name "*.safetensors.tmp*" \) -exec ls -lh {} + 2>/dev/null | head -100
  echo "===== uv cache size ====="
  du -sh /mnt/cluster_storage/.uv_cache 2>/dev/null
  ls /mnt/cluster_storage/.uv_cache/ 2>&1 | head -20
  echo "===== tail of head_diag.log ====="
  tail -30 /mnt/cluster_storage/head_diag.log 2>&1 || echo "(no diag log)"
image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
cloud: rkn-gpu-cloud
ray_version: "2.51.1"
working_dir: .
max_retries: 0

# Small head-only cluster; pods carry the Kueue queue label.
compute_config:
  advanced_instance_config:
    metadata:
      labels:
        kueue.x-k8s.io/queue-name: default-queue
  head_node:
    required_resources:
      CPU: 2
      memory: 4Gi
Loading
Loading