NovaSky-AI · xyuzh · May 18, 2026 · May 18, 2026 · May 18, 2026 · gemini-code-assist
diff --git a/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml b/examples/train/megatron/anyscale_qwen3_235b_2nodes.yaml
@@ -0,0 +1,135 @@
+name: qwen3-235b-a22b-rl-training
+entrypoint: |
+  set -e
+
+  # === DIAGNOSTICS: persistent log file survives head pod restarts ===
+  DIAG_LOG=/mnt/cluster_storage/head_diag.log
+  RUN_TAG="run-$(date -u +%Y%m%dT%H%M%SZ)-pid$$"
+  echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG"
+  uname -a 2>&1 | tee -a "$DIAG_LOG"
+  cat /etc/os-release 2>/dev/null | tee -a "$DIAG_LOG" || true
+  echo "--- env (filtered) ---" | tee -a "$DIAG_LOG"
+  env | grep -E "^(POD|NODE|HOSTNAME|KUEUE|RAY|ANYSCALE|HEAD)" 2>&1 | tee -a "$DIAG_LOG" || true
+  echo "--- container resource limits (cgroup v2) ---" | tee -a "$DIAG_LOG"
+  # A pipeline's exit status is tee's, so "cmd | tee || fallback" never runs the fallback; group it BEFORE the pipe.
+  { cat /sys/fs/cgroup/memory.max 2>/dev/null || echo "(memory.max not readable)"; } | tee -a "$DIAG_LOG"
+  cat /sys/fs/cgroup/memory.current /sys/fs/cgroup/cpu.max 2>/dev/null | tee -a "$DIAG_LOG" || true
+  echo "--- previous boot dmesg (oom_killer if present) ---" | tee -a "$DIAG_LOG"
+  { dmesg 2>/dev/null || echo "(dmesg not accessible)"; } | tail -200 | tee -a "$DIAG_LOG"
+  echo "--- initial free / df ---" | tee -a "$DIAG_LOG"
+  free -h 2>&1 | tee -a "$DIAG_LOG"
+  df -h 2>&1 | tee -a "$DIAG_LOG"
+
+  # Trap signals so we know WHY we're exiting (k8s preemption sends SIGTERM). EXIT also stops the background monitor (and its in-flight child) so it cannot outlive the job.
+  trap 'rc=$?; if [ -n "${MONITOR_PID:-}" ]; then { pkill -P "$MONITOR_PID"; kill "$MONITOR_PID"; } 2>/dev/null || true; fi; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG"; free -h 2>&1 | tee -a "$DIAG_LOG"; ps -eo pid,ppid,rss,vsz,pcpu,pmem,etime,comm --sort=-rss 2>&1 | head -20 | tee -a "$DIAG_LOG"; exit $rc' EXIT
+  trap 'echo "[$RUN_TAG] SIGTERM received at $(date -u)" | tee -a "$DIAG_LOG"; free -h 2>&1 | tee -a "$DIAG_LOG"; ps -eo pid,ppid,rss,vsz,pcpu,pmem,etime,comm --sort=-rss 2>&1 | head -20 | tee -a "$DIAG_LOG"' TERM
+  trap 'echo "[$RUN_TAG] SIGINT received at $(date -u)" | tee -a "$DIAG_LOG"' INT
+  trap 'echo "[$RUN_TAG] SIGHUP received at $(date -u)" | tee -a "$DIAG_LOG"' HUP
+
+  # Background monitor: every 30s append one mem/load/disk sample to DIAG_LOG. The subshell inherits `set -e`, so every probe that can fail gets a fallback instead of silently ending the loop.
+  (
+    while true; do
+      ts=$(date -u +%H:%M:%S)
+      mem=$(free -m 2>/dev/null | awk '/^Mem:/{printf "used=%dMi total=%dMi avail=%dMi", $3, $2, $7}')
+      cgmem=$(cat /sys/fs/cgroup/memory.current 2>/dev/null || echo "n/a")
+      cgmax=$(cat /sys/fs/cgroup/memory.max 2>/dev/null || echo "n/a")
+      load=$(awk '{print $1,$2,$3}' /proc/loadavg 2>/dev/null || echo "n/a")
+      diskcs=$(df -h /mnt/cluster_storage 2>/dev/null | tail -1 | awk '{print "cs="$5"("$3"/"$2")"}')
+      diskroot=$(df -h / 2>/dev/null | tail -1 | awk '{print "/="$5"("$3"/"$2")"}')
+      echo "[$RUN_TAG] $ts $mem cg_used=$cgmem cg_max=$cgmax load=$load $diskcs $diskroot" >> "$DIAG_LOG" || true
+      sleep 30
+    done
+  ) &
+  MONITOR_PID=$!
+  echo "[$RUN_TAG] background monitor pid=$MONITOR_PID" | tee -a "$DIAG_LOG"
+
+  # === ACTUAL JOB ===
+  # Unpack the NVIDIA pip wheels into shared cluster storage; the CPATH/LD_LIBRARY_PATH entries in
+  # env_vars point into this tree. Only the head runs this; workers mount the same FS.
+  NV=/mnt/cluster_storage/nv_pkg
+  CUDNN_H="$NV/nvidia/cudnn/include/cudnn.h"
+  if ! test -f "$CUDNN_H"; then
+    echo "[bootstrap] installing nvidia headers to $NV" | tee -a "$DIAG_LOG"
+    /home/ray/anaconda3/bin/python3 -m pip install -q --target "$NV" \
+      nvidia-cudnn-cu12 nvidia-nccl-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-cusolver-cu12 \
+      nvidia-curand-cu12 nvidia-cufft-cu12 nvidia-cuda-runtime-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-cupti-cu12 \
+      nvidia-nvjitlink-cu12 nvidia-nvtx-cu12 nvidia-cudnn-frontend
+  fi
+  ls -la "$CUDNN_H"
+  bash examples/train/megatron/run_megatron_qwen3_235b_2nodes.sh
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8  # tag names Ray 2.51.1 — keep in step with ray_version below
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .  # the entrypoint runs the relative path examples/train/megatron/...; presumably submit from the repo root — confirm
+max_retries: 0
+
+# Use the shared pre-built venv on NFS for all Ray worker actors.
+py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python"  # same location build_shared_venv.yaml uses as VENV
+
+# cuDNN/NCCL/cublas paths point to our pre-installed nvidia packages on shared NFS
+# (set up by the head entrypoint preamble). /opt/cudnn doesn't exist in this image.
+env_vars:
+  # Disable Ray's auto "uv run" propagation. Don't set RAY_RUNTIME_ENV_HOOK
+  # to empty here (Ray's load_class crashes on empty string); unset it in shell instead.
+  # NOTE(review): the entrypoint above contains no `unset`; presumably the run script does it — confirm.
+  RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0"
+  CUDA_HOME: "/usr/local/cuda"
+  CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
+  C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
+  LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"  # NOTE(review): no cublas/lib here although LIBRARY_PATH has it — confirm intentional
+  CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
+  # Workers compile transformer-engine-torch from source on first start; that takes
+  # 5-10 minutes which exceeds Ray's default PG timeout. Give it 30 minutes.
+  SKYRL_RAY_PG_TIMEOUT_IN_S: "1800"
+  # 235B model on TP=16 needs much longer than the default 600s to become healthy.
+  SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600"
+  # Share uv's build/wheel cache across all nodes and runs via NFS. After TE
+  # builds once globally, every future cluster start pulls the cached wheel
+  # instead of rebuilding (cuts launch time from ~25 min cold to ~5 min warm).
+  UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
+  UV_LINK_MODE: "copy"
+  # ninja parallelism — 4 jobs ≈ 12 GiB peak (fits 32 GiB head; trivial for workers).
+  MAX_JOBS: "4"
+  NINJA_JOBS: "4"
+  CMAKE_BUILD_PARALLEL_LEVEL: "4"
+  # Ray's 60s worker-register timeout otherwise fires repeatedly during the
+  # first-time TE build, producing log noise + spurious worker process spawns.
+  RAY_worker_register_timeout_seconds: "1800"
+  # Critical: RAY_OVERRIDE_JOB_RUNTIME_ENV=1 replaces SkyRL's prepare_runtime_environment
+  # env vars with ours. Re-add all the ones SkyRL would have set (utils.py:615-647)
+  # so vLLM/megatron behave correctly.
+  RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
+  NVTE_FUSED_ATTN: "0"
+  VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true"
+  VLLM_ALLOW_INSECURE_SERIALIZATION: "1"
+  VLLM_DISABLE_COMPILE_CACHE: "1"
+  VLLM_USE_V1: "1"
+  VLLM_ENABLE_V1_MULTIPROCESSING: "0"
+  _SKYRL_USE_NEW_INFERENCE: "1"
+  # Cache HF downloads in shared FS so model doesn't redownload on restarts / per-worker.
+  HF_HOME: "/mnt/cluster_storage/hf_cache"
+  HF_HUB_ENABLE_HF_TRANSFER: "1"
+  # Lets these YAML env_vars coexist with SkyRL's runtime env instead of conflicting (consequence: see the "Critical" note above; replace-vs-merge semantics — TODO confirm).
+  RAY_OVERRIDE_JOB_RUNTIME_ENV: "1"
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue  # pod label; presumably routes the pods to this Kueue queue — confirm
+  head_node:
+    required_resources:
+      CPU: 8
+      memory: 32Gi  # the MAX_JOBS comment in env_vars sizes build parallelism against this 32 GiB
+  worker_nodes:
+    - required_resources:
+        CPU: 184
+        memory: 1920Gi
+        GPU: 8
+      required_labels:
+        ray.io/accelerator-type: H100
+      min_nodes: 2  # min == max: exactly 2 nodes x 8 GPUs = 16 H100s (env_vars comment mentions TP=16)
+      max_nodes: 2
diff --git a/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml b/examples/train/megatron/anyscale_qwen3_30b_2nodes.yaml
@@ -0,0 +1,72 @@
+name: qwen3-30b-validate  # NOTE(review): no nv_pkg bootstrap here, yet env_vars below point at /mnt/cluster_storage/nv_pkg — it must already exist
+entrypoint: |  # appends to the same head_diag.log as the 235B job; the run-30b- RUN_TAG prefix tells the entries apart
+  set -e
+  DIAG_LOG=/mnt/cluster_storage/head_diag.log
+  RUN_TAG="run-30b-$(date -u +%Y%m%dT%H%M%SZ)-pid$$"
+  echo "===== [$RUN_TAG] HEAD ENTRYPOINT START $(date -u) =====" | tee -a "$DIAG_LOG"
+  trap 'rc=$?; echo "[$RUN_TAG] TRAP EXIT code=$rc at $(date -u)" | tee -a "$DIAG_LOG"; exit $rc' EXIT
+  bash examples/train/megatron/run_megatron_qwen3_30b_2nodes.sh
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8  # tag names Ray 2.51.1 — keep in step with ray_version below
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .  # the entrypoint runs the relative path examples/train/megatron/...; presumably submit from the repo root — confirm
+max_retries: 0
+
+# Use the shared pre-built venv on NFS for all Ray worker actors.
+# This eliminates per-actor `uv run` which was causing 16 redundant installs.
+py_executable: "/mnt/cluster_storage/.skyrl-venv/bin/python"
+
+env_vars:
+  # Disable Ray's auto "uv run" propagation — we are using a pre-built venv directly.
+  # NOTE: do NOT set RAY_RUNTIME_ENV_HOOK to empty string; Ray's load_class()
+  # treats that as "load class named ''" and crashes. It has to be unset in shell instead.
+  # NOTE(review): this file's entrypoint has no `unset`; presumably run_megatron_qwen3_30b_2nodes.sh does it — confirm.
+  RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0"
+  UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
+  UV_LINK_MODE: "copy"
+  MAX_JOBS: "4"
+  NINJA_JOBS: "4"
+  CMAKE_BUILD_PARALLEL_LEVEL: "4"
+  RAY_worker_register_timeout_seconds: "1800"
+  SKYRL_RAY_PG_TIMEOUT_IN_S: "1800"
+  SKYRL_WAIT_UNTIL_INFERENCE_SERVER_HEALTHY_TIMEOUT_S: "3600"
+  HF_HOME: "/mnt/cluster_storage/hf_cache"
+  HF_HUB_ENABLE_HF_TRANSFER: "1"
+  RAY_OVERRIDE_JOB_RUNTIME_ENV: "1"
+  RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0"
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
+  NVTE_FUSED_ATTN: "0"
+  VLLM_ALLOW_RUNTIME_LORA_UPDATING: "true"
+  VLLM_ALLOW_INSECURE_SERIALIZATION: "1"
+  VLLM_DISABLE_COMPILE_CACHE: "1"
+  VLLM_USE_V1: "1"
+  VLLM_ENABLE_V1_MULTIPROCESSING: "0"
+  _SKYRL_USE_NEW_INFERENCE: "1"
+  CUDA_HOME: "/usr/local/cuda"
+  # Point to our pre-installed cuDNN/NCCL/cublas etc. on shared NFS.
+  # /opt/cudnn doesn't exist in this image (Dockerfile.megatron paths don't apply).
+  CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
+  C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
+  LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"  # NOTE(review): no cublas/lib here although LIBRARY_PATH has it — confirm intentional
+  CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue  # pod label; presumably routes the pods to this Kueue queue — confirm
+  head_node:
+    required_resources:
+      CPU: 8
+      memory: 32Gi
+  worker_nodes:
+    - required_resources:
+        CPU: 184
+        memory: 1920Gi
+        GPU: 8
+      required_labels:
+        ray.io/accelerator-type: H100
+      min_nodes: 2  # min == max: exactly 2 nodes x 8 GPUs = 16 H100s
+      max_nodes: 2
diff --git a/examples/train/megatron/build_shared_venv.yaml b/examples/train/megatron/build_shared_venv.yaml
@@ -0,0 +1,55 @@
+name: build-skyrl-shared-venv
+entrypoint: |
+  set -e
+  VENV=/mnt/cluster_storage/.skyrl-venv
+  STAMP=$VENV/.built-ok
+  # Import smoke test, used twice: the reuse check runs inside `if` (failure -> rebuild); the post-build check runs bare (failure -> job fails via set -e).
+  CHECK="import torch, megatron.core, transformer_engine.pytorch, vllm, skyrl_gym, skyrl.train"
+  if [ -f "$STAMP" ]; then
+    echo "[setup] venv already built at $VENV — verifying imports"
+    if "$VENV/bin/python" -c "$CHECK; print('OK')"; then
+      echo "[setup] no rebuild needed."
+      exit 0
+    fi
+    echo "[setup] stamped venv failed the import check — rebuilding"
+  fi
+  echo "[setup] building venv at $VENV  ($(date -u))"
+  rm -rf "$VENV"
+  # uv sync needs pyproject.toml + the skyrl / skyrl-gym sources, i.e. the uploaded working_dir this job starts in.
+  echo "[setup] working_dir = $(pwd)"
+  uv venv --python 3.12 "$VENV"
+  # --no-editable installs the workspace packages (skyrl, skyrl-gym) as regular packages; an editable
+  # install would point at sources in this ephemeral pod and break when training runs elsewhere.
+  UV_PROJECT_ENVIRONMENT="$VENV" uv sync --extra megatron --no-editable
+  "$VENV/bin/python" -c "$CHECK; print('IMPORTS OK')"
+  touch "$STAMP"
+  echo "[setup] DONE at $(date -u). venv at $VENV"
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8  # tag names Ray 2.51.1 — keep in step with ray_version below
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .  # must contain pyproject.toml: the entrypoint runs `uv sync` from here
+max_retries: 0
+
+env_vars:
+  UV_CACHE_DIR: "/mnt/cluster_storage/.uv_cache"
+  UV_LINK_MODE: "copy"
+  MAX_JOBS: "8"  # NOTE(review): the training YAMLs use 4 and estimate ~12 GiB peak for that; 8 on this 32Gi head may be tight — confirm
+  NINJA_JOBS: "8"
+  CMAKE_BUILD_PARALLEL_LEVEL: "8"
+  # NOTE(review): every path below lives under /mnt/cluster_storage/nv_pkg, which this job's entrypoint never creates — it must already exist.
+  CPATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include:/mnt/cluster_storage/nv_pkg/nvidia/cublas/include"
+  C_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  CPLUS_INCLUDE_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/include:/mnt/cluster_storage/nv_pkg/nvidia/nccl/include"
+  LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib:/mnt/cluster_storage/nv_pkg/nvidia/cublas/lib"
+  LD_LIBRARY_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn/lib:/mnt/cluster_storage/nv_pkg/nvidia/nccl/lib"
+  CUDNN_PATH: "/mnt/cluster_storage/nv_pkg/nvidia/cudnn"
+  CUDA_HOME: "/usr/local/cuda"
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:  # head only: no worker_nodes are requested, so the build runs on this node
+    required_resources:
+      CPU: 8
+      memory: 32Gi
diff --git a/examples/train/megatron/clear_uv_cache.yaml b/examples/train/megatron/clear_uv_cache.yaml
@@ -0,0 +1,20 @@
+name: clear-uv-cache  # maintenance job: deletes the shared uv cache at /mnt/cluster_storage/.uv_cache
+entrypoint: |  # fails the job (exit 1) if the removal fails, instead of always printing "Done."
+  echo "Before:"; du -sh /mnt/cluster_storage/.uv_cache 2>/dev/null || echo "(no .uv_cache present)"
+  rm -rf /mnt/cluster_storage/.uv_cache 2>&1 || { echo "FAILED to remove /mnt/cluster_storage/.uv_cache" >&2; exit 1; }
+  echo "Done."
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 2
+      memory: 4Gi
diff --git a/examples/train/megatron/clear_venv.yaml b/examples/train/megatron/clear_venv.yaml
@@ -0,0 +1,19 @@
+name: clear-skyrl-venv  # maintenance job: deletes the shared venv at /mnt/cluster_storage/.skyrl-venv
+entrypoint: |  # fails the job (exit 1) if the removal fails, instead of always printing "Done."
+  echo "Before:"; du -sh /mnt/cluster_storage/.skyrl-venv 2>/dev/null || echo "(no .skyrl-venv present)"
+  rm -rf /mnt/cluster_storage/.skyrl-venv 2>&1 || { echo "FAILED to remove /mnt/cluster_storage/.skyrl-venv" >&2; exit 1; }
+  echo "Done."
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .
+max_retries: 0
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 2
+      memory: 4Gi
diff --git a/examples/train/megatron/download_model.yaml b/examples/train/megatron/download_model.yaml
@@ -0,0 +1,36 @@
+name: qwen-model-prefetch
+entrypoint: |
+  set -e
+  export HF_HOME=/mnt/cluster_storage/hf_cache HF_HUB_ENABLE_HF_TRANSFER=1
+  MODEL_DIR="$HF_HOME/hub/models--Qwen--Qwen3-235B-A22B-Instruct-2507"
+  /home/ray/anaconda3/bin/python3 -m pip install -q --upgrade huggingface_hub hf_transfer
+  /home/ray/anaconda3/bin/python3 - <<'PY'
+  import time
+  from huggingface_hub import snapshot_download
+  started = time.time()
+  snapshot_path = snapshot_download(
+      repo_id="Qwen/Qwen3-235B-A22B-Instruct-2507",
+      cache_dir="/mnt/cluster_storage/hf_cache/hub",
+      max_workers=8,
+      allow_patterns=["*.json", "*.txt", "*.safetensors", "tokenizer*"],
+  )
+  print(f"\n[prefetch] Downloaded to {snapshot_path} in {time.time()-started:.1f}s")
+  PY
+  echo "[prefetch] DONE"
+  ls -lh "$MODEL_DIR"/snapshots/*/ | head -5
+  du -sh "$MODEL_DIR/"
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: '2.51.1'
+working_dir: .
+max_retries: 0
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:
+    required_resources:
+      CPU: 4
+      memory: 16Gi
diff --git a/examples/train/megatron/dump_diag.yaml b/examples/train/megatron/dump_diag.yaml
@@ -0,0 +1,31 @@
+name: dump-head-diag  # inspection job: lists cluster storage, the HF cache, the uv cache, and the tail of head_diag.log
+entrypoint: |  # `set -x` traces each command; there is no `set -e`, so a failing probe does not stop the ones after it
+  set -x
+  echo "===== cluster_storage layout ====="
+  ls -la /mnt/cluster_storage/ 2>&1
+  echo "===== hf_cache layout ====="
+  ls -la /mnt/cluster_storage/hf_cache/ 2>&1 || echo "(no hf_cache)"
+  echo "===== model directories ====="
+  find /mnt/cluster_storage/hf_cache/ -maxdepth 4 -type d 2>/dev/null | head -50
+  echo "===== safetensors files (sizes) ====="
+  find /mnt/cluster_storage/hf_cache/ -name "*.safetensors" -o -name "*.safetensors.tmp*" 2>/dev/null | xargs -I{} ls -lh {} 2>/dev/null | head -100
+  echo "===== uv cache size ====="
+  du -sh /mnt/cluster_storage/.uv_cache 2>/dev/null
+  ls /mnt/cluster_storage/.uv_cache/ 2>&1 | head -20
+  echo "===== tail of head_diag.log ====="
+  tail -30 /mnt/cluster_storage/head_diag.log 2>&1 || echo "(no diag log)"
+image_uri: novaskyai/skyrl-train-ray-2.51.1-py3.12-cu12.8
+cloud: rkn-gpu-cloud
+ray_version: "2.51.1"
+working_dir: .  # NOTE(review): the entrypoint uses only absolute /mnt/cluster_storage paths, so the upload looks unnecessary — confirm
+max_retries: 0
+
+compute_config:
+  advanced_instance_config:
+    metadata:
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+  head_node:  # head only: no worker_nodes are requested
+    required_resources:
+      CPU: 2
+      memory: 4Gi