From b2713448fe1b770840f49362ef3df2b23058cebd Mon Sep 17 00:00:00 2001 From: Aman Seervi Date: Wed, 4 Mar 2026 19:58:48 +0000 Subject: [PATCH 1/3] Added slurm megatron-bridge recipes for llama3.1-405b on A4 --- .../16node-FP8DS-GBS1024/README.md | 107 ++++++++++++++++++ .../16node-FP8DS-GBS1024/submit.slurm | 103 +++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md new file mode 100644 index 00000000..c1fb5371 --- /dev/null +++ b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md @@ -0,0 +1,107 @@ + +# Pretrain Llama 3.1 405B workloads on A4 Slurm Cluster with Nvidia Megatron-Bridge + +This recipe outlines the steps for running a Llama 3.1 405B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). 
## Orchestration and deployment tools

For this recipe, the following setup is used:

- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/)
- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview)

## Test environment

This recipe has been optimized for and tested with the following configuration:

- A4 Slurm Cluster (16 nodes, 128 GPUs)
- Machine Type: `a4-highgpu-8g`
- Lustre Filesystem

Please follow the instructions in the [Cluster Toolkit A4 Example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g/README.md) to provision an A4 High Slurm cluster.

## Docker container image

This recipe uses the following container image:

- `nvcr.io/nvidia/nemo:25.09`

## Run the recipe

From your cluster login node, complete the following steps:

### Configure environment settings

#### Setup Enroot for Megatron

We recommend setting this up on Lustre.

```bash
# Here, /home is a lustre filesystem
export BASE_DIR=/home/${USER}
export LOCAL_SSD_DIR=/mnt/localssd
cd ${BASE_DIR}

# Configure Enroot
export ENROOT_CONFIG_PATH=${HOME}/.config/enroot
mkdir -p ${ENROOT_CONFIG_PATH}

# Authenticate with Google Cloud Docker registry
gcloud auth configure-docker us-docker.pkg.dev

# Import the NVIDIA NeMo container
mkdir -p ${BASE_DIR}/sqsh
enroot import --output ${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh -- docker://nvcr.io#nvidia/nemo:25.09
```

### Get the recipe

Clone the Megatron-Bridge repository:

```bash
git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
cd Megatron-Bridge
git checkout minor_cfg_updates_2509
cd ${BASE_DIR}
```

### Configure and submit a pretraining job

#### Using 16 nodes (128 GPUs) FP8 precision

The `submit.slurm` script is provided to run the training job. Ensure you are in your `${BASE_DIR}` and copy/create the `submit.slurm` script there.
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name=llama3-405b-pretrain
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=8
#SBATCH --mem=0
#SBATCH --output=logs/%x_%u_%j.out
#SBATCH --time=24:00:00
#SBATCH --open-mode=append

# Pretrain Llama 3.1 405B with NVIDIA Megatron-Bridge on 16 A4 nodes
# (8 GPUs each, 128 total) using FP8 with the current-scaling (cs) recipe.
#
# Prerequisites (see README):
#   BASE_DIR       - exported; holds sqsh/ image and the Megatron-Bridge checkout
#   LOCAL_SSD_DIR  - exported; node-local SSD used for compiler caches
#   ${BASE_DIR}/logs must exist before sbatch: Slurm opens the --output
#   file before this script runs, so we cannot mkdir it here.
set -euo pipefail

die() { printf '%s\n' "$*" >&2; exit 1; }

# ---- Fail fast on missing inputs (${VAR:-} keeps set -u happy) ----
[[ -d "${BASE_DIR:-}" ]] || die "Error: BASE_DIR '${BASE_DIR:-}' not found. Please ensure it exists."
[[ -n "${LOCAL_SSD_DIR:-}" ]] || die "Error: LOCAL_SSD_DIR is not set (expected e.g. /mnt/localssd)."

readonly CONTAINER="${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh"
readonly RUN_SCRIPT="${BASE_DIR}/Megatron-Bridge/scripts/performance/run_script.py"
readonly CONFIG_FILE="${BASE_DIR}/Megatron-Bridge/scripts/performance/configs/llama31/llama31_405b_llm_pretrain.yaml"
[[ -f "${CONTAINER}" ]]  || die "Error: container image ${CONTAINER} not found (run the enroot import step in the README)."
[[ -f "${RUN_SCRIPT}" ]] || die "Error: ${RUN_SCRIPT} not found (clone Megatron-Bridge into BASE_DIR)."
[[ -f "${CONFIG_FILE}" ]] || die "Error: config file ${CONFIG_FILE} not found."
readonly TOTAL_GPUS=$(( SLURM_JOB_NUM_NODES * 8 ))

# ---- Master address: first node of the allocation hosts the rendezvous ----
mapfile -t nodes < <(scontrol show hostnames "${SLURM_JOB_NODELIST}")
export MASTER_ADDR="${nodes[0]}"
export MASTER_PORT=6002
echo "Master Node: ${MASTER_ADDR}"

# ---- Infrastructure config ----
# NOTE(review): management NIC name is image-specific — confirm enp0s19 on your nodes.
export MGMT_IFACE=enp0s19
export PMIX_MCA_gds=^ds12

# Pin control-plane (UCX) and Gloo traffic to the management interface so
# they stay off the GPU data-plane NICs.
export UCX_TLS=tcp,sm,self
export UCX_NET_DEVICES="${MGMT_IFACE}"
export GLOO_SOCKET_IFNAME="${MGMT_IFACE}"
export GLOO_TIMEOUT_SECONDS=1200

# Performance & debug
export PYTHONUNBUFFERED=1
export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'

# ---- Per-job compiler caches on node-local SSD ----
readonly JOB_CACHE_DIR="${LOCAL_SSD_DIR}/triton_cache/${SLURM_JOBID}"
readonly JOB_INDUCTOR_DIR="${LOCAL_SSD_DIR}/torchinductor_cache/${SLURM_JOBID}"

echo "Submitting job on ${SLURM_JOB_NUM_NODES} nodes (${TOTAL_GPUS} GPUs)..."

# ---- Execution ----
# Fix: LOCAL_SSD_DIR is now mounted into the container — the original only
# mounted BASE_DIR and /usr/local/gib, so the Triton/Inductor cache paths
# below did not exist inside the container.
srun \
  --container-image "${CONTAINER}" \
  --container-mounts "${BASE_DIR}:${BASE_DIR},/usr/local/gib:/usr/local/gib,${LOCAL_SSD_DIR}:${LOCAL_SSD_DIR}" \
  --container-workdir "${BASE_DIR}" \
  --no-container-mount-home \
  --container-writable \
  --gres=gpu:8 \
  -l \
  --mpi=pmix \
  bash -c "
    # Clean slate inside the container: drop any NCCL settings inherited
    # from the submit environment; set_nccl_env.sh supplies the right ones.
    unset NCCL_SOCKET_IFNAME
    unset NCCL_IB_DISABLE
    unset NCCL_GPUDIRECT_TCPX_FORCE_ACK

    # Distributed ranks derived from Slurm task layout
    export WORLD_SIZE=\${SLURM_NTASKS}
    export RANK=\${SLURM_PROCID}
    export LOCAL_RANK=\${SLURM_LOCALID}
    export NODE_RANK=\${SLURM_NODEID}

    # Per-node cache dirs to avoid cross-node cache clashes on shared paths
    export TRITON_CACHE_DIR=${JOB_CACHE_DIR}/node_\${SLURM_NODEID}
    export TORCHINDUCTOR_CACHE_DIR=${JOB_INDUCTOR_DIR}/node_\${SLURM_NODEID}
    mkdir -p \${TRITON_CACHE_DIR} \${TORCHINDUCTOR_CACHE_DIR}

    export LD_LIBRARY_PATH=/usr/local/gib/lib64:\${LD_LIBRARY_PATH}

    # gIB NCCL environment for the A4 fabric, when present in the image
    if [ -f /usr/local/gib/scripts/set_nccl_env.sh ]; then
      source /usr/local/gib/scripts/set_nccl_env.sh
    fi

    echo \"Rank \${RANK} starting on host \$(hostname)...\"

    python ${RUN_SCRIPT} \
      --config_file ${CONFIG_FILE} \
      --model_name llama31 \
      --model_size 405b \
      --compute_dtype fp8 \
      --fp8_recipe cs \
      --gpu b200 \
      --use_tokendrop True \
      -a dummy -p dummy \
      -ng ${TOTAL_GPUS} \
      train.manual_gc=true \
      train.manual_gc_interval=100
  "
8fe9bada083ac234e05adfba52284e6de53de487 Mon Sep 17 00:00:00 2001 From: Aman Seervi Date: Wed, 4 Mar 2026 20:05:28 +0000 Subject: [PATCH 2/3] Corrected recipe type to fp8cs in directory name --- .../16node-FP8CS-GBS1024/README.md | 107 ++++++++++++++++++ .../16node-FP8CS-GBS1024/submit.slurm | 103 +++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/submit.slurm diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md new file mode 100644 index 00000000..c1fb5371 --- /dev/null +++ b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md @@ -0,0 +1,107 @@ + +# Pretrain Llama 3.1 405B workloads on A4 Slurm Cluster with Nvidia Megatron-Bridge + +This recipe outlines the steps for running a Llama 3.1 405B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). 
## Orchestration and deployment tools

For this recipe, the following setup is used:

- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/)
- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview)

## Test environment

This recipe has been optimized for and tested with the following configuration:

- A4 Slurm Cluster (16 nodes, 128 GPUs)
- Machine Type: `a4-highgpu-8g`
- Lustre Filesystem

Please follow the instructions in the [Cluster Toolkit A4 Example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g/README.md) to provision an A4 High Slurm cluster.

## Docker container image

This recipe uses the following container image:

- `nvcr.io/nvidia/nemo:25.09`

## Run the recipe

From your cluster login node, complete the following steps:

### Configure environment settings

#### Setup Enroot for Megatron

We recommend setting this up on Lustre.

```bash
# Here, /home is a lustre filesystem
export BASE_DIR=/home/${USER}
export LOCAL_SSD_DIR=/mnt/localssd
cd ${BASE_DIR}

# Configure Enroot
export ENROOT_CONFIG_PATH=${HOME}/.config/enroot
mkdir -p ${ENROOT_CONFIG_PATH}

# Authenticate with Google Cloud Docker registry
gcloud auth configure-docker us-docker.pkg.dev

# Import the NVIDIA NeMo container
mkdir -p ${BASE_DIR}/sqsh
enroot import --output ${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh -- docker://nvcr.io#nvidia/nemo:25.09
```

### Get the recipe

Clone the Megatron-Bridge repository:

```bash
git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
cd Megatron-Bridge
git checkout minor_cfg_updates_2509
cd ${BASE_DIR}
```

### Configure and submit a pretraining job

#### Using 16 nodes (128 GPUs) FP8 precision

The `submit.slurm` script is provided to run the training job. Ensure you are in your `${BASE_DIR}` and copy/create the `submit.slurm` script there.
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name=llama3-405b-pretrain
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=8
#SBATCH --mem=0
#SBATCH --output=logs/%x_%u_%j.out
#SBATCH --time=24:00:00
#SBATCH --open-mode=append

# Pretrain Llama 3.1 405B with NVIDIA Megatron-Bridge on 16 A4 nodes
# (8 GPUs each, 128 total) using FP8 with the current-scaling (cs) recipe.
#
# Prerequisites (see README):
#   BASE_DIR       - exported; holds sqsh/ image and the Megatron-Bridge checkout
#   LOCAL_SSD_DIR  - exported; node-local SSD used for compiler caches
#   ${BASE_DIR}/logs must exist before sbatch: Slurm opens the --output
#   file before this script runs, so we cannot mkdir it here.
set -euo pipefail

die() { printf '%s\n' "$*" >&2; exit 1; }

# ---- Fail fast on missing inputs (${VAR:-} keeps set -u happy) ----
[[ -d "${BASE_DIR:-}" ]] || die "Error: BASE_DIR '${BASE_DIR:-}' not found. Please ensure it exists."
[[ -n "${LOCAL_SSD_DIR:-}" ]] || die "Error: LOCAL_SSD_DIR is not set (expected e.g. /mnt/localssd)."

readonly CONTAINER="${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh"
readonly RUN_SCRIPT="${BASE_DIR}/Megatron-Bridge/scripts/performance/run_script.py"
readonly CONFIG_FILE="${BASE_DIR}/Megatron-Bridge/scripts/performance/configs/llama31/llama31_405b_llm_pretrain.yaml"
[[ -f "${CONTAINER}" ]]  || die "Error: container image ${CONTAINER} not found (run the enroot import step in the README)."
[[ -f "${RUN_SCRIPT}" ]] || die "Error: ${RUN_SCRIPT} not found (clone Megatron-Bridge into BASE_DIR)."
[[ -f "${CONFIG_FILE}" ]] || die "Error: config file ${CONFIG_FILE} not found."
readonly TOTAL_GPUS=$(( SLURM_JOB_NUM_NODES * 8 ))

# ---- Master address: first node of the allocation hosts the rendezvous ----
mapfile -t nodes < <(scontrol show hostnames "${SLURM_JOB_NODELIST}")
export MASTER_ADDR="${nodes[0]}"
export MASTER_PORT=6002
echo "Master Node: ${MASTER_ADDR}"

# ---- Infrastructure config ----
# NOTE(review): management NIC name is image-specific — confirm enp0s19 on your nodes.
export MGMT_IFACE=enp0s19
export PMIX_MCA_gds=^ds12

# Pin control-plane (UCX) and Gloo traffic to the management interface so
# they stay off the GPU data-plane NICs.
export UCX_TLS=tcp,sm,self
export UCX_NET_DEVICES="${MGMT_IFACE}"
export GLOO_SOCKET_IFNAME="${MGMT_IFACE}"
export GLOO_TIMEOUT_SECONDS=1200

# Performance & debug
export PYTHONUNBUFFERED=1
export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'

# ---- Per-job compiler caches on node-local SSD ----
readonly JOB_CACHE_DIR="${LOCAL_SSD_DIR}/triton_cache/${SLURM_JOBID}"
readonly JOB_INDUCTOR_DIR="${LOCAL_SSD_DIR}/torchinductor_cache/${SLURM_JOBID}"

echo "Submitting job on ${SLURM_JOB_NUM_NODES} nodes (${TOTAL_GPUS} GPUs)..."

# ---- Execution ----
# Fix: LOCAL_SSD_DIR is now mounted into the container — the original only
# mounted BASE_DIR and /usr/local/gib, so the Triton/Inductor cache paths
# below did not exist inside the container.
srun \
  --container-image "${CONTAINER}" \
  --container-mounts "${BASE_DIR}:${BASE_DIR},/usr/local/gib:/usr/local/gib,${LOCAL_SSD_DIR}:${LOCAL_SSD_DIR}" \
  --container-workdir "${BASE_DIR}" \
  --no-container-mount-home \
  --container-writable \
  --gres=gpu:8 \
  -l \
  --mpi=pmix \
  bash -c "
    # Clean slate inside the container: drop any NCCL settings inherited
    # from the submit environment; set_nccl_env.sh supplies the right ones.
    unset NCCL_SOCKET_IFNAME
    unset NCCL_IB_DISABLE
    unset NCCL_GPUDIRECT_TCPX_FORCE_ACK

    # Distributed ranks derived from Slurm task layout
    export WORLD_SIZE=\${SLURM_NTASKS}
    export RANK=\${SLURM_PROCID}
    export LOCAL_RANK=\${SLURM_LOCALID}
    export NODE_RANK=\${SLURM_NODEID}

    # Per-node cache dirs to avoid cross-node cache clashes on shared paths
    export TRITON_CACHE_DIR=${JOB_CACHE_DIR}/node_\${SLURM_NODEID}
    export TORCHINDUCTOR_CACHE_DIR=${JOB_INDUCTOR_DIR}/node_\${SLURM_NODEID}
    mkdir -p \${TRITON_CACHE_DIR} \${TORCHINDUCTOR_CACHE_DIR}

    export LD_LIBRARY_PATH=/usr/local/gib/lib64:\${LD_LIBRARY_PATH}

    # gIB NCCL environment for the A4 fabric, when present in the image
    if [ -f /usr/local/gib/scripts/set_nccl_env.sh ]; then
      source /usr/local/gib/scripts/set_nccl_env.sh
    fi

    echo \"Rank \${RANK} starting on host \$(hostname)...\"

    python ${RUN_SCRIPT} \
      --config_file ${CONFIG_FILE} \
      --model_name llama31 \
      --model_size 405b \
      --compute_dtype fp8 \
      --fp8_recipe cs \
      --gpu b200 \
      --use_tokendrop True \
      -a dummy -p dummy \
      -ng ${TOTAL_GPUS} \
      train.manual_gc=true \
      train.manual_gc_interval=100
  "
f507880fde2ed2d3ae1ed19adc8cfd2bd85d7675 Mon Sep 17 00:00:00 2001 From: Aman Seervi Date: Wed, 4 Mar 2026 20:08:42 +0000 Subject: [PATCH 3/3] Remove old FP8DS directory files --- .../16node-FP8DS-GBS1024/README.md | 107 ------------------ .../16node-FP8DS-GBS1024/submit.slurm | 103 ----------------- 2 files changed, 210 deletions(-) delete mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md delete mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md deleted file mode 100644 index c1fb5371..00000000 --- a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md +++ /dev/null @@ -1,107 +0,0 @@ - -# Pretrain Llama 3.1 405B workloads on A4 Slurm Cluster with Nvidia Megatron-Bridge - -This recipe outlines the steps for running a Llama 3.1 405B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). - -## Orchestration and deployment tools - -For this recipe, the following setup is used: - -- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/) -- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview) - -## Test environment - -This recipe has been optimized for and tested with the following configuration: - -- A4 Slurm Cluster (16 nodes, 128 GPUs) -- Machine Type: `a4-highgpu-8g` -- Lustre Filesystem - -Please follow the instructions in the [Cluster Toolkit A4 Example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g/README.md) to provision an A4 High Slurm cluster. 
- -## Docker container image - -This recipe uses the following container images: - -- `nvcr.io/nvidia/nemo:25.09` - -## Run the recipe - -From your cluster login node, complete the following steps: - -### Configure environment settings - -#### Setup Enroot for Megatron - -We recommend setting this up on Lustre. - -```bash - # Here, /home is a lustre filesystem -export BASE_DIR=/home/${USER} -export LOCAL_SSD_DIR=/mnt/localssd -cd ${BASE_DIR} - -# Configure Enroot -export ENROOT_CONFIG_PATH=${HOME}/.config/enroot -mkdir -p ${ENROOT_CONFIG_PATH} - -# Authenticate with Google Cloud Docker registry -gcloud auth configure-docker us-docker.pkg.dev - -# Import the NVIDIA NeMo container -mkdir -p ${BASE_DIR}/sqsh -enroot import --output ${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh -- docker://nvcr.io#nvidia/nemo:25.09 -``` - -### Get the recipe - -Clone the Megatron-Bridge repository: - -```bash -git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git -cd Megatron-Bridge -git checkout minor_cfg_updates_2509 -cd ${BASE_DIR} -``` - -### Configure and submit a pretraining job - -#### Using 16 nodes (64 GPUs) FP8 precision - -The `submit.slurm` script is provided to run the training job. Ensure you are in your `${BASE_DIR}` and copy/create the `submit.slurm` script there. 
- -Create a logs directory to store the job output: - -```bash -mkdir -p ${BASE_DIR}/logs -``` - -To execute the job with the default settings, run the following command: - -```bash -sbatch ${BASE_DIR}/submit.slurm -``` - - -### Monitor the job - -To check the status of jobs in your queue, run the following command: - -```bash -squeue -``` - -To view the output logs, use `tail` on the output file generated by Slurm (replace `` with your actual job ID): - -```bash -tail -f logs/__.out -``` - -### Cancel the job - -To cancel a running job: - -```bash -scancel -``` \ No newline at end of file diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm deleted file mode 100644 index 524adfdf..00000000 --- a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash -#SBATCH --exclusive -#SBATCH --job-name=llama3-405b-pretrain -#SBATCH --nodes=16 -#SBATCH --ntasks-per-node=8 -#SBATCH --mem=0 -#SBATCH --output=logs/%x_%u_%j.out -#SBATCH --time=24:00:00 -#SBATCH --open-mode=append - -set -e - -# MASTER ADDRESS SETUP -nodes=( $( scontrol show hostnames ${SLURM_JOB_NODELIST} ) ) -head_node=${nodes[0]} -export MASTER_ADDR=$head_node -export MASTER_PORT=6002 -echo "Master Node: $MASTER_ADDR" - -# Infrastructure Config -export MGMT_IFACE=enp0s19 -export PMIX_MCA_gds=^ds12 - -# Control Plane -export UCX_TLS=tcp,sm,self -export UCX_NET_DEVICES=$MGMT_IFACE - -# Gloo -export GLOO_SOCKET_IFNAME=$MGMT_IFACE -export GLOO_TIMEOUT_SECONDS=1200 - -# Performance & Debug -export PYTHONUNBUFFERED=1 -export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' - -# PATHS - Modify these if your setup differs -if [[ ! -d "${BASE_DIR}" ]]; then - echo "Error: BASE_DIR ${BASE_DIR} not found. Please ensure it exists." 
- exit 1 -fi -CONTAINER="${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh" -RUN_SCRIPT="${BASE_DIR}/Megatron-Bridge/scripts/performance/run_script.py" -CONFIG_FILE="${BASE_DIR}/Megatron-Bridge/scripts/performance/configs/llama31/llama31_405b_llm_pretrain.yaml" -TOTAL_GPUS=$(( SLURM_JOB_NUM_NODES * 8 )) - -# CACHE SETUP -CACHE_ROOT="${LOCAL_SSD_DIR}/triton_cache" -INDUCTOR_ROOT="${LOCAL_SSD_DIR}/torchinductor_cache" -JOB_CACHE_DIR="${CACHE_ROOT}/${SLURM_JOBID}" -JOB_INDUCTOR_DIR="${INDUCTOR_ROOT}/${SLURM_JOBID}" - -echo "Submitting job on $SLURM_JOB_NUM_NODES nodes ($TOTAL_GPUS GPUs)..." - -# EXECUTION -srun \ - --container-image "${CONTAINER}" \ - --container-mounts "${BASE_DIR}:${BASE_DIR},/usr/local/gib:/usr/local/gib" \ - --container-workdir "${BASE_DIR}" \ - --no-container-mount-home \ - --container-writable \ - --gres=gpu:8 \ - -l \ - --mpi=pmix \ - bash -c " - # Ensure clean slate inside container - unset NCCL_SOCKET_IFNAME - unset NCCL_IB_DISABLE - unset NCCL_GPUDIRECT_TCPX_FORCE_ACK - - # Distributed Ranks - export WORLD_SIZE=\${SLURM_NTASKS} - export RANK=\${SLURM_PROCID} - export LOCAL_RANK=\${SLURM_LOCALID} - export NODE_RANK=\${SLURM_NODEID} - - - export TRITON_CACHE_DIR=${JOB_CACHE_DIR}/node_\${SLURM_NODEID} - export TORCHINDUCTOR_CACHE_DIR=${JOB_INDUCTOR_DIR}/node_\${SLURM_NODEID} - mkdir -p \${TRITON_CACHE_DIR} \${TORCHINDUCTOR_CACHE_DIR} - - - export LD_LIBRARY_PATH=/usr/local/gib/lib64:\${LD_LIBRARY_PATH} - - # Source environment scripts - if [ -f /usr/local/gib/scripts/set_nccl_env.sh ]; then - source /usr/local/gib/scripts/set_nccl_env.sh - fi - - echo \"Rank \${RANK} starting on host \$(hostname)...\" - - python ${RUN_SCRIPT} \ - --config_file ${CONFIG_FILE} \ - --model_name llama31 \ - --model_size 405b \ - --compute_dtype fp8 \ - --fp8_recipe cs \ - --gpu b200 \ - --use_tokendrop True \ - -a dummy -p dummy \ - -ng ${TOTAL_GPUS} \ - train.manual_gc=true \ - train.manual_gc_interval=100 - " \ No newline at end of file