From bd14735434809b55d9de4cdd96b33948fa2cc01f Mon Sep 17 00:00:00 2001 From: Rishabh Baghel Date: Fri, 6 Mar 2026 09:38:08 +0000 Subject: [PATCH] add qwen on slurm --- .../4node-FP8CS-GBS1024/recipe/README.md | 2 +- .../16node-BF16-GBS4096/recipe/README.md | 100 ++++++++++++ .../recipe/launch_script.sh | 148 ++++++++++++++++++ .../recipe/sbatch_script.sh | 52 ++++++ 4 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md create mode 100644 training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh create mode 100644 training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh diff --git a/training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md b/training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md index 16a8b58f..eef7244b 100644 --- a/training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md +++ b/training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md @@ -39,7 +39,7 @@ Set the environment variables to match your environment: export PROJECT_ID= export CLUSTER_REGION= export CLUSTER_NAME= - gcloud compute ssh $CLUSTER_NAME --project supercomputer-testing --zone $CLUSTER_REGION -- -o Hostname=nic0.$CLUSTER_NAME.$CLUSTER_REGION.c.$PROJECT_ID$.internal.gcpnode.com + gcloud compute ssh $CLUSTER_NAME --project $PROJECT_ID --zone $CLUSTER_REGION -- -o Hostname=nic0.$CLUSTER_NAME.$CLUSTER_REGION.c.$PROJECT_ID.internal.gcpnode.com ``` diff --git a/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md b/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md new file mode 100644 index 00000000..b7cab4bc --- /dev/null +++ 
b/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md @@ -0,0 +1,100 @@ + +# Pretrain Qwen 3 235B workloads on A4 Slurm Cluster with Nvidia Megatron-Bridge + +This recipe outlines the steps for running a Qwen 3 235B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/) +- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview) + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- A4 Slurm Cluster (16 nodes, 128 GPUs) +- Machine Type: `a4-highgpu-8g` +- Lustre Filesystem + +Please follow the instructions in the [Cluster Toolkit A4 Example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g) to provision an A4 Slurm cluster. + + +## Docker container image + +This recipe uses the following container images: + +- `nvcr.io/nvidia/nemo:25.11` + +## Run the recipe + + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + gcloud compute ssh $CLUSTER_NAME --project $PROJECT_ID --zone $CLUSTER_REGION -- -o Hostname=nic0.$CLUSTER_NAME.$CLUSTER_REGION.c.$PROJECT_ID.internal.gcpnode.com + + ``` + +Replace the following values: + + - `<PROJECT_ID>`: your Google Cloud project ID. + - `<CLUSTER_REGION>`: the region where your cluster is located. + - `<CLUSTER_NAME>`: the name of your SLURM cluster. 
+ +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +From your cluster login node, complete the following steps: + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. + +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe +cd $RECIPE_ROOT +``` + +### Submit a pretraining job + +``` +# set your HF_TOKEN inside launch_script.sh +export HF_TOKEN="YOUR_HF_TOKEN" # Replace with your Hugging Face token. + +cd .. +sbatch ./recipe/sbatch_script.sh +``` + +### Monitor the job + +To check the status of your job, run the following command: + +``` +squeue --me +``` + + +To get the logs for the job, run the following command: + +``` +tail -f slurm-{jobID}.out +``` + +### Cancel the job + +```bash +scancel -u $USER +``` diff --git a/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh b/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh new file mode 100644 index 00000000..523219e4 --- /dev/null +++ b/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh @@ -0,0 +1,148 @@ +usage() +{ +cat << EOF +usage: bash ./launch_script.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [[ "$1" != "" ]]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [[ -z "${config_overrides[*]}" ]]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib:$NCCL_PLUGIN_PATH:$LD_LIBRARY_PATH" +ldconfig "$LD_LIBRARY_PATH" +echo "Added $LD_LIBRARY_PATH to ldconfig:" +ldconfig -p | grep libcuda | sed 's/^/ /' +echo "" + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp "${TOKENIZER_PATH}"/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +# Create the nsys directory. 
+mkdir -p "${explicit_log_dir}/nsys" + +# Collect diagnostics to a single line +kv="\"kernel_version\": \"$(uname --kernel-release)\"" +if command -v nvidia-smi &> /dev/null; then + cuda_v=$(nvidia-smi -q -x | grep -Po '(?<=<cuda_version>).*(?=</cuda_version>)' || true) + driver_v=$(nvidia-smi -q -x | grep -Po '(?<=<driver_version>).*(?=</driver_version>)' || true) + vbios_v=$(nvidia-smi -q -x | grep -Po '(?<=<vbios_version>).*(?=</vbios_version>)' | head -n1 || true) + kv="${kv}, \"cuda_version\": \"${cuda_v}\"" + kv="${kv}, \"driver_version\": \"${driver_v}\"" + kv="${kv}, \"vbios_version\": \"${vbios_v}\"" +fi +echo "VERSION_DIAGNOSTICS: {${kv}}" + + +export HF_TOKEN="${HF_TOKEN:-}" + +cd /opt +rm -rf Megatron-Bridge +git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +cd Megatron-Bridge +git checkout 7695d4acbfac19353d20e456509117efe4733d6b +ls + + + +worker_command=$(cat <<- EOM + if [ "\$RANK" -eq "0" ]; then + echo "Worker 0 is stalling for a few seconds.." ; + sleep 3 ; + echo "The detected environment within worker rank 0 is:" ; + env | sed 's/^/ /' ; + fi ; + + cd /opt/Megatron-Bridge ; + + numactl \ + --cpunodebind=\$((LOCAL_RANK/4)) \ + --membind=\$((LOCAL_RANK/4)) nsys profile \ + -t nvtx,cuda \ + --cuda-event-trace=false \ + --sample=none \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --kill none \ + -o "/${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK" \ + --force-overwrite true \ + --session-new "nsys-\$RANDOM-\$RANK" \ + nice -10 \ + python scripts/performance/run_script.py \ + --gpu b200 \ + --model_family_name qwen \ + --model_recipe_name qwen3_235b_a22b \ + --gpus_per_node 8 \ + --num_gpus 128 \ + --seq_length 4096 \ + --compute_dtype bf16 \ + --global_batch_size 4096 \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 8 \ + --virtual_pipeline_model_parallel_size 4 \ + --expert_model_parallel_size 8 \ + --expert_tensor_parallel_size 1 \ + --moe_a2a_overlap True \ + --max_steps 30 + +EOM +) + +echo "$worker_command" > worker_command.sh +chmod 777 worker_command.sh + +torchrun \ +--nproc-per-node="8" \ 
+--nnodes="16" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +--no-python bash worker_command.sh + + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p "${ARTIFACT_DIR}" + cp -r "${explicit_log_dir}"/* "${ARTIFACT_DIR}/" + env > "${ARTIFACT_DIR}/environ.txt" + ls "${ARTIFACT_DIR}" +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" diff --git a/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh b/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh new file mode 100644 index 00000000..e290555d --- /dev/null +++ b/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh @@ -0,0 +1,52 @@ +#!/bin/bash +#SBATCH --job-name=qwen3_235b_bf16_b200_128gpus-8jje +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:8 +#SBATCH --mem=0 + +# Exit early on failures +set -e + +# Validate that the recipe location is setup correctly. +# Recipe is expected to be in "recipe" folder inside current working directory +RECIPE_DIR="$(pwd)/recipe" +LAUNCH_SCRIPT="${RECIPE_DIR}/launch_script.sh" +if [[ ! -f "${LAUNCH_SCRIPT}" ]]; then + echo "Error: Recipe is not located correctly. The recipe is expected to be in "recipe" folder inside current working directory. We could not find the launch script there." >&2 + exit 1 +fi +chmod +x "${LAUNCH_SCRIPT}" + +# Enroot the image if it is not already enrooted. +export ENROOT_CONFIG_PATH=${HOME}/.config/enroot +ORIG_IMAGE=nvcr.io#nvidia/nemo:25.11 +SQSH_IMAGE_PATH=${RECIPE_DIR}/sqsh/nvcr.io_nvidia_nemo:25.11 +if [[ ! 
-f "${SQSH_IMAGE_PATH}" ]]; then + mkdir -p "$(dirname "${SQSH_IMAGE_PATH}")" + echo "enrooting $ORIG_IMAGE to ${SQSH_IMAGE_PATH}" + enroot import --output "${SQSH_IMAGE_PATH}" -- "docker://${ORIG_IMAGE}" +fi + +# get the master node +master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +master_port=29500 + +ARTIFACT_DIR_HOME="/home/$USER/job_artifacts/${SLURM_JOB_ID}" +mkdir -p "$ARTIFACT_DIR_HOME" + +export NNODES=$SLURM_NNODES +export MASTER_ADDR=$master_addr +export MASTER_PORT=$master_port +export ARTIFACT_DIR=/artifacts +export JOB_NAME=qwen3_235b_bf16_b200_128gpus-8jje +export JOB_IDENTIFIER=qwen3_235b_bf16_b200_128gpus-8jje + + + +srun --container-image="$SQSH_IMAGE_PATH" \ + --container-mounts="${RECIPE_DIR}:/recipe:mkdir,${ARTIFACT_DIR_HOME}:${ARTIFACT_DIR}:mkdir" \ + --container-workdir=/recipe \ + --container-writable \ + bash -c 'export JOB_COMPLETION_INDEX=$SLURM_NODEID; ./launch_script.sh' + \ No newline at end of file