From b2713448fe1b770840f49362ef3df2b23058cebd Mon Sep 17 00:00:00 2001 From: Aman Seervi Date: Wed, 4 Mar 2026 19:58:48 +0000 Subject: [PATCH 1/3] Added slurm megatron-bridge recipes for llama3.1-405b on A4 --- .../16node-FP8DS-GBS1024/README.md | 107 ++++++++++++++++++ .../16node-FP8DS-GBS1024/submit.slurm | 103 +++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md new file mode 100644 index 00000000..c1fb5371 --- /dev/null +++ b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md @@ -0,0 +1,107 @@ + +# Pretrain Llama 3.1 405B workloads on A4 Slurm Cluster with Nvidia Megatron-Bridge + +This recipe outlines the steps for running a Llama 3.1 405B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). 
## Orchestration and deployment tools

For this recipe, the following setup is used:

- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/)
- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview)

## Test environment

This recipe has been optimized for and tested with the following configuration:

- A4 Slurm Cluster (16 nodes, 128 GPUs)
- Machine Type: `a4-highgpu-8g`
- Lustre Filesystem

Please follow the instructions in the [Cluster Toolkit A4 Example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g/README.md) to provision an A4 High Slurm cluster.

## Docker container image

This recipe uses the following container image:

- `nvcr.io/nvidia/nemo:25.09`

## Run the recipe

From your cluster login node, complete the following steps:

### Configure environment settings

#### Setup Enroot for Megatron

We recommend setting this up on Lustre.

```bash
# Here, /home is a lustre filesystem
export BASE_DIR=/home/${USER}
export LOCAL_SSD_DIR=/mnt/localssd
cd ${BASE_DIR}

# Configure Enroot
export ENROOT_CONFIG_PATH=${HOME}/.config/enroot
mkdir -p ${ENROOT_CONFIG_PATH}

# Authenticate with Google Cloud Docker registry
gcloud auth configure-docker us-docker.pkg.dev

# Import the NVIDIA NeMo container
mkdir -p ${BASE_DIR}/sqsh
enroot import --output ${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh -- docker://nvcr.io#nvidia/nemo:25.09
```

### Get the recipe

Clone the Megatron-Bridge repository:

```bash
git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
cd Megatron-Bridge
git checkout minor_cfg_updates_2509
cd ${BASE_DIR}
```

### Configure and submit a pretraining job

#### Using 16 nodes (128 GPUs) FP8 precision

The `submit.slurm` script is provided to run the training job. Ensure you are in your `${BASE_DIR}` and copy/create the `submit.slurm` script there.
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name=llama3-405b-pretrain
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=8
#SBATCH --mem=0
#SBATCH --output=logs/%x_%u_%j.out
#SBATCH --time=24:00:00
#SBATCH --open-mode=append

# Pretrain Llama 3.1 405B with NVIDIA Megatron-Bridge on 16 A4 nodes
# (8 GPUs each, 128 total) using FP8 with the current-scaling (cs) recipe.
#
# Prerequisites (see README):
#   BASE_DIR       - exported; holds sqsh/ image and the Megatron-Bridge checkout
#   LOCAL_SSD_DIR  - exported; node-local SSD used for compiler caches
#   ${BASE_DIR}/logs must exist before sbatch: Slurm opens the --output
#   file before this script runs, so we cannot mkdir it here.
set -euo pipefail

die() { printf '%s\n' "$*" >&2; exit 1; }

# ---- Fail fast on missing inputs (${VAR:-} keeps set -u happy) ----
[[ -d "${BASE_DIR:-}" ]] || die "Error: BASE_DIR '${BASE_DIR:-}' not found. Please ensure it exists."
[[ -n "${LOCAL_SSD_DIR:-}" ]] || die "Error: LOCAL_SSD_DIR is not set (expected e.g. /mnt/localssd)."

readonly CONTAINER="${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh"
readonly RUN_SCRIPT="${BASE_DIR}/Megatron-Bridge/scripts/performance/run_script.py"
readonly CONFIG_FILE="${BASE_DIR}/Megatron-Bridge/scripts/performance/configs/llama31/llama31_405b_llm_pretrain.yaml"
[[ -f "${CONTAINER}" ]]  || die "Error: container image ${CONTAINER} not found (run the enroot import step in the README)."
[[ -f "${RUN_SCRIPT}" ]] || die "Error: ${RUN_SCRIPT} not found (clone Megatron-Bridge into BASE_DIR)."
[[ -f "${CONFIG_FILE}" ]] || die "Error: config file ${CONFIG_FILE} not found."
readonly TOTAL_GPUS=$(( SLURM_JOB_NUM_NODES * 8 ))

# ---- Master address: first node of the allocation hosts the rendezvous ----
mapfile -t nodes < <(scontrol show hostnames "${SLURM_JOB_NODELIST}")
export MASTER_ADDR="${nodes[0]}"
export MASTER_PORT=6002
echo "Master Node: ${MASTER_ADDR}"

# ---- Infrastructure config ----
# NOTE(review): management NIC name is image-specific — confirm enp0s19 on your nodes.
export MGMT_IFACE=enp0s19
export PMIX_MCA_gds=^ds12

# Pin control-plane (UCX) and Gloo traffic to the management interface so
# they stay off the GPU data-plane NICs.
export UCX_TLS=tcp,sm,self
export UCX_NET_DEVICES="${MGMT_IFACE}"
export GLOO_SOCKET_IFNAME="${MGMT_IFACE}"
export GLOO_TIMEOUT_SECONDS=1200

# Performance & debug
export PYTHONUNBUFFERED=1
export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'

# ---- Per-job compiler caches on node-local SSD ----
readonly JOB_CACHE_DIR="${LOCAL_SSD_DIR}/triton_cache/${SLURM_JOBID}"
readonly JOB_INDUCTOR_DIR="${LOCAL_SSD_DIR}/torchinductor_cache/${SLURM_JOBID}"

echo "Submitting job on ${SLURM_JOB_NUM_NODES} nodes (${TOTAL_GPUS} GPUs)..."

# ---- Execution ----
# Fix: LOCAL_SSD_DIR is now mounted into the container — the original only
# mounted BASE_DIR and /usr/local/gib, so the Triton/Inductor cache paths
# below did not exist inside the container.
srun \
  --container-image "${CONTAINER}" \
  --container-mounts "${BASE_DIR}:${BASE_DIR},/usr/local/gib:/usr/local/gib,${LOCAL_SSD_DIR}:${LOCAL_SSD_DIR}" \
  --container-workdir "${BASE_DIR}" \
  --no-container-mount-home \
  --container-writable \
  --gres=gpu:8 \
  -l \
  --mpi=pmix \
  bash -c "
    # Clean slate inside the container: drop any NCCL settings inherited
    # from the submit environment; set_nccl_env.sh supplies the right ones.
    unset NCCL_SOCKET_IFNAME
    unset NCCL_IB_DISABLE
    unset NCCL_GPUDIRECT_TCPX_FORCE_ACK

    # Distributed ranks derived from Slurm task layout
    export WORLD_SIZE=\${SLURM_NTASKS}
    export RANK=\${SLURM_PROCID}
    export LOCAL_RANK=\${SLURM_LOCALID}
    export NODE_RANK=\${SLURM_NODEID}

    # Per-node cache dirs to avoid cross-node cache clashes on shared paths
    export TRITON_CACHE_DIR=${JOB_CACHE_DIR}/node_\${SLURM_NODEID}
    export TORCHINDUCTOR_CACHE_DIR=${JOB_INDUCTOR_DIR}/node_\${SLURM_NODEID}
    mkdir -p \${TRITON_CACHE_DIR} \${TORCHINDUCTOR_CACHE_DIR}

    export LD_LIBRARY_PATH=/usr/local/gib/lib64:\${LD_LIBRARY_PATH}

    # gIB NCCL environment for the A4 fabric, when present in the image
    if [ -f /usr/local/gib/scripts/set_nccl_env.sh ]; then
      source /usr/local/gib/scripts/set_nccl_env.sh
    fi

    echo \"Rank \${RANK} starting on host \$(hostname)...\"

    python ${RUN_SCRIPT} \
      --config_file ${CONFIG_FILE} \
      --model_name llama31 \
      --model_size 405b \
      --compute_dtype fp8 \
      --fp8_recipe cs \
      --gpu b200 \
      --use_tokendrop True \
      -a dummy -p dummy \
      -ng ${TOTAL_GPUS} \
      train.manual_gc=true \
      train.manual_gc_interval=100
  "
8fe9bada083ac234e05adfba52284e6de53de487 Mon Sep 17 00:00:00 2001 From: Aman Seervi Date: Wed, 4 Mar 2026 20:05:28 +0000 Subject: [PATCH 2/3] Corrected recipe type to fp8cs in directory name --- .../16node-FP8CS-GBS1024/README.md | 107 ++++++++++++++++++ .../16node-FP8CS-GBS1024/submit.slurm | 103 +++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md create mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/submit.slurm diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md new file mode 100644 index 00000000..c1fb5371 --- /dev/null +++ b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8CS-GBS1024/README.md @@ -0,0 +1,107 @@ + +# Pretrain Llama 3.1 405B workloads on A4 Slurm Cluster with Nvidia Megatron-Bridge + +This recipe outlines the steps for running a Llama 3.1 405B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). 
## Orchestration and deployment tools

For this recipe, the following setup is used:

- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/)
- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview)

## Test environment

This recipe has been optimized for and tested with the following configuration:

- A4 Slurm Cluster (16 nodes, 128 GPUs)
- Machine Type: `a4-highgpu-8g`
- Lustre Filesystem

Please follow the instructions in the [Cluster Toolkit A4 Example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g/README.md) to provision an A4 High Slurm cluster.

## Docker container image

This recipe uses the following container image:

- `nvcr.io/nvidia/nemo:25.09`

## Run the recipe

From your cluster login node, complete the following steps:

### Configure environment settings

#### Setup Enroot for Megatron

We recommend setting this up on Lustre.

```bash
# Here, /home is a lustre filesystem
export BASE_DIR=/home/${USER}
export LOCAL_SSD_DIR=/mnt/localssd
cd ${BASE_DIR}

# Configure Enroot
export ENROOT_CONFIG_PATH=${HOME}/.config/enroot
mkdir -p ${ENROOT_CONFIG_PATH}

# Authenticate with Google Cloud Docker registry
gcloud auth configure-docker us-docker.pkg.dev

# Import the NVIDIA NeMo container
mkdir -p ${BASE_DIR}/sqsh
enroot import --output ${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh -- docker://nvcr.io#nvidia/nemo:25.09
```

### Get the recipe

Clone the Megatron-Bridge repository:

```bash
git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
cd Megatron-Bridge
git checkout minor_cfg_updates_2509
cd ${BASE_DIR}
```

### Configure and submit a pretraining job

#### Using 16 nodes (128 GPUs) FP8 precision

The `submit.slurm` script is provided to run the training job. Ensure you are in your `${BASE_DIR}` and copy/create the `submit.slurm` script there.
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name=llama3-405b-pretrain
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=8
#SBATCH --mem=0
#SBATCH --output=logs/%x_%u_%j.out
#SBATCH --time=24:00:00
#SBATCH --open-mode=append

# Pretrain Llama 3.1 405B with NVIDIA Megatron-Bridge on 16 A4 nodes
# (8 GPUs each, 128 total) using FP8 with the current-scaling (cs) recipe.
#
# Prerequisites (see README):
#   BASE_DIR       - exported; holds sqsh/ image and the Megatron-Bridge checkout
#   LOCAL_SSD_DIR  - exported; node-local SSD used for compiler caches
#   ${BASE_DIR}/logs must exist before sbatch: Slurm opens the --output
#   file before this script runs, so we cannot mkdir it here.
set -euo pipefail

die() { printf '%s\n' "$*" >&2; exit 1; }

# ---- Fail fast on missing inputs (${VAR:-} keeps set -u happy) ----
[[ -d "${BASE_DIR:-}" ]] || die "Error: BASE_DIR '${BASE_DIR:-}' not found. Please ensure it exists."
[[ -n "${LOCAL_SSD_DIR:-}" ]] || die "Error: LOCAL_SSD_DIR is not set (expected e.g. /mnt/localssd)."

readonly CONTAINER="${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh"
readonly RUN_SCRIPT="${BASE_DIR}/Megatron-Bridge/scripts/performance/run_script.py"
readonly CONFIG_FILE="${BASE_DIR}/Megatron-Bridge/scripts/performance/configs/llama31/llama31_405b_llm_pretrain.yaml"
[[ -f "${CONTAINER}" ]]  || die "Error: container image ${CONTAINER} not found (run the enroot import step in the README)."
[[ -f "${RUN_SCRIPT}" ]] || die "Error: ${RUN_SCRIPT} not found (clone Megatron-Bridge into BASE_DIR)."
[[ -f "${CONFIG_FILE}" ]] || die "Error: config file ${CONFIG_FILE} not found."
readonly TOTAL_GPUS=$(( SLURM_JOB_NUM_NODES * 8 ))

# ---- Master address: first node of the allocation hosts the rendezvous ----
mapfile -t nodes < <(scontrol show hostnames "${SLURM_JOB_NODELIST}")
export MASTER_ADDR="${nodes[0]}"
export MASTER_PORT=6002
echo "Master Node: ${MASTER_ADDR}"

# ---- Infrastructure config ----
# NOTE(review): management NIC name is image-specific — confirm enp0s19 on your nodes.
export MGMT_IFACE=enp0s19
export PMIX_MCA_gds=^ds12

# Pin control-plane (UCX) and Gloo traffic to the management interface so
# they stay off the GPU data-plane NICs.
export UCX_TLS=tcp,sm,self
export UCX_NET_DEVICES="${MGMT_IFACE}"
export GLOO_SOCKET_IFNAME="${MGMT_IFACE}"
export GLOO_TIMEOUT_SECONDS=1200

# Performance & debug
export PYTHONUNBUFFERED=1
export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'

# ---- Per-job compiler caches on node-local SSD ----
readonly JOB_CACHE_DIR="${LOCAL_SSD_DIR}/triton_cache/${SLURM_JOBID}"
readonly JOB_INDUCTOR_DIR="${LOCAL_SSD_DIR}/torchinductor_cache/${SLURM_JOBID}"

echo "Submitting job on ${SLURM_JOB_NUM_NODES} nodes (${TOTAL_GPUS} GPUs)..."

# ---- Execution ----
# Fix: LOCAL_SSD_DIR is now mounted into the container — the original only
# mounted BASE_DIR and /usr/local/gib, so the Triton/Inductor cache paths
# below did not exist inside the container.
srun \
  --container-image "${CONTAINER}" \
  --container-mounts "${BASE_DIR}:${BASE_DIR},/usr/local/gib:/usr/local/gib,${LOCAL_SSD_DIR}:${LOCAL_SSD_DIR}" \
  --container-workdir "${BASE_DIR}" \
  --no-container-mount-home \
  --container-writable \
  --gres=gpu:8 \
  -l \
  --mpi=pmix \
  bash -c "
    # Clean slate inside the container: drop any NCCL settings inherited
    # from the submit environment; set_nccl_env.sh supplies the right ones.
    unset NCCL_SOCKET_IFNAME
    unset NCCL_IB_DISABLE
    unset NCCL_GPUDIRECT_TCPX_FORCE_ACK

    # Distributed ranks derived from Slurm task layout
    export WORLD_SIZE=\${SLURM_NTASKS}
    export RANK=\${SLURM_PROCID}
    export LOCAL_RANK=\${SLURM_LOCALID}
    export NODE_RANK=\${SLURM_NODEID}

    # Per-node cache dirs to avoid cross-node cache clashes on shared paths
    export TRITON_CACHE_DIR=${JOB_CACHE_DIR}/node_\${SLURM_NODEID}
    export TORCHINDUCTOR_CACHE_DIR=${JOB_INDUCTOR_DIR}/node_\${SLURM_NODEID}
    mkdir -p \${TRITON_CACHE_DIR} \${TORCHINDUCTOR_CACHE_DIR}

    export LD_LIBRARY_PATH=/usr/local/gib/lib64:\${LD_LIBRARY_PATH}

    # gIB NCCL environment for the A4 fabric, when present in the image
    if [ -f /usr/local/gib/scripts/set_nccl_env.sh ]; then
      source /usr/local/gib/scripts/set_nccl_env.sh
    fi

    echo \"Rank \${RANK} starting on host \$(hostname)...\"

    python ${RUN_SCRIPT} \
      --config_file ${CONFIG_FILE} \
      --model_name llama31 \
      --model_size 405b \
      --compute_dtype fp8 \
      --fp8_recipe cs \
      --gpu b200 \
      --use_tokendrop True \
      -a dummy -p dummy \
      -ng ${TOTAL_GPUS} \
      train.manual_gc=true \
      train.manual_gc_interval=100
  "
f507880fde2ed2d3ae1ed19adc8cfd2bd85d7675 Mon Sep 17 00:00:00 2001 From: Aman Seervi Date: Wed, 4 Mar 2026 20:08:42 +0000 Subject: [PATCH 3/3] Remove old FP8DS directory files --- .../16node-FP8DS-GBS1024/README.md | 107 ------------------ .../16node-FP8DS-GBS1024/submit.slurm | 103 ----------------- 2 files changed, 210 deletions(-) delete mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md delete mode 100644 training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md deleted file mode 100644 index c1fb5371..00000000 --- a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/README.md +++ /dev/null @@ -1,107 +0,0 @@ - -# Pretrain Llama 3.1 405B workloads on A4 Slurm Cluster with Nvidia Megatron-Bridge - -This recipe outlines the steps for running a Llama 3.1 405B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge). - -## Orchestration and deployment tools - -For this recipe, the following setup is used: - -- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/) -- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview) - -## Test environment - -This recipe has been optimized for and tested with the following configuration: - -- A4 Slurm Cluster (16 nodes, 128 GPUs) -- Machine Type: `a4-highgpu-8g` -- Lustre Filesystem - -Please follow the instructions in the [Cluster Toolkit A4 Example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g/README.md) to provision an A4 High Slurm cluster. 
- -## Docker container image - -This recipe uses the following container images: - -- `nvcr.io/nvidia/nemo:25.09` - -## Run the recipe - -From your cluster login node, complete the following steps: - -### Configure environment settings - -#### Setup Enroot for Megatron - -We recommend setting this up on Lustre. - -```bash - # Here, /home is a lustre filesystem -export BASE_DIR=/home/${USER} -export LOCAL_SSD_DIR=/mnt/localssd -cd ${BASE_DIR} - -# Configure Enroot -export ENROOT_CONFIG_PATH=${HOME}/.config/enroot -mkdir -p ${ENROOT_CONFIG_PATH} - -# Authenticate with Google Cloud Docker registry -gcloud auth configure-docker us-docker.pkg.dev - -# Import the NVIDIA NeMo container -mkdir -p ${BASE_DIR}/sqsh -enroot import --output ${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh -- docker://nvcr.io#nvidia/nemo:25.09 -``` - -### Get the recipe - -Clone the Megatron-Bridge repository: - -```bash -git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git -cd Megatron-Bridge -git checkout minor_cfg_updates_2509 -cd ${BASE_DIR} -``` - -### Configure and submit a pretraining job - -#### Using 16 nodes (64 GPUs) FP8 precision - -The `submit.slurm` script is provided to run the training job. Ensure you are in your `${BASE_DIR}` and copy/create the `submit.slurm` script there. 
- -Create a logs directory to store the job output: - -```bash -mkdir -p ${BASE_DIR}/logs -``` - -To execute the job with the default settings, run the following command: - -```bash -sbatch ${BASE_DIR}/submit.slurm -``` - - -### Monitor the job - -To check the status of jobs in your queue, run the following command: - -```bash -squeue -``` - -To view the output logs, use `tail` on the output file generated by Slurm (replace `` with your actual job ID): - -```bash -tail -f logs/__.out -``` - -### Cancel the job - -To cancel a running job: - -```bash -scancel -``` \ No newline at end of file diff --git a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm b/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm deleted file mode 100644 index 524adfdf..00000000 --- a/training/a4/llama3-1-405b/megatron-bridge-pretraining-slurm/16node-FP8DS-GBS1024/submit.slurm +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash -#SBATCH --exclusive -#SBATCH --job-name=llama3-405b-pretrain -#SBATCH --nodes=16 -#SBATCH --ntasks-per-node=8 -#SBATCH --mem=0 -#SBATCH --output=logs/%x_%u_%j.out -#SBATCH --time=24:00:00 -#SBATCH --open-mode=append - -set -e - -# MASTER ADDRESS SETUP -nodes=( $( scontrol show hostnames ${SLURM_JOB_NODELIST} ) ) -head_node=${nodes[0]} -export MASTER_ADDR=$head_node -export MASTER_PORT=6002 -echo "Master Node: $MASTER_ADDR" - -# Infrastructure Config -export MGMT_IFACE=enp0s19 -export PMIX_MCA_gds=^ds12 - -# Control Plane -export UCX_TLS=tcp,sm,self -export UCX_NET_DEVICES=$MGMT_IFACE - -# Gloo -export GLOO_SOCKET_IFNAME=$MGMT_IFACE -export GLOO_TIMEOUT_SECONDS=1200 - -# Performance & Debug -export PYTHONUNBUFFERED=1 -export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' - -# PATHS - Modify these if your setup differs -if [[ ! -d "${BASE_DIR}" ]]; then - echo "Error: BASE_DIR ${BASE_DIR} not found. Please ensure it exists." 
- exit 1 -fi -CONTAINER="${BASE_DIR}/sqsh/nvidia+nemo+25.09.sqsh" -RUN_SCRIPT="${BASE_DIR}/Megatron-Bridge/scripts/performance/run_script.py" -CONFIG_FILE="${BASE_DIR}/Megatron-Bridge/scripts/performance/configs/llama31/llama31_405b_llm_pretrain.yaml" -TOTAL_GPUS=$(( SLURM_JOB_NUM_NODES * 8 )) - -# CACHE SETUP -CACHE_ROOT="${LOCAL_SSD_DIR}/triton_cache" -INDUCTOR_ROOT="${LOCAL_SSD_DIR}/torchinductor_cache" -JOB_CACHE_DIR="${CACHE_ROOT}/${SLURM_JOBID}" -JOB_INDUCTOR_DIR="${INDUCTOR_ROOT}/${SLURM_JOBID}" - -echo "Submitting job on $SLURM_JOB_NUM_NODES nodes ($TOTAL_GPUS GPUs)..." - -# EXECUTION -srun \ - --container-image "${CONTAINER}" \ - --container-mounts "${BASE_DIR}:${BASE_DIR},/usr/local/gib:/usr/local/gib" \ - --container-workdir "${BASE_DIR}" \ - --no-container-mount-home \ - --container-writable \ - --gres=gpu:8 \ - -l \ - --mpi=pmix \ - bash -c " - # Ensure clean slate inside container - unset NCCL_SOCKET_IFNAME - unset NCCL_IB_DISABLE - unset NCCL_GPUDIRECT_TCPX_FORCE_ACK - - # Distributed Ranks - export WORLD_SIZE=\${SLURM_NTASKS} - export RANK=\${SLURM_PROCID} - export LOCAL_RANK=\${SLURM_LOCALID} - export NODE_RANK=\${SLURM_NODEID} - - - export TRITON_CACHE_DIR=${JOB_CACHE_DIR}/node_\${SLURM_NODEID} - export TORCHINDUCTOR_CACHE_DIR=${JOB_INDUCTOR_DIR}/node_\${SLURM_NODEID} - mkdir -p \${TRITON_CACHE_DIR} \${TORCHINDUCTOR_CACHE_DIR} - - - export LD_LIBRARY_PATH=/usr/local/gib/lib64:\${LD_LIBRARY_PATH} - - # Source environment scripts - if [ -f /usr/local/gib/scripts/set_nccl_env.sh ]; then - source /usr/local/gib/scripts/set_nccl_env.sh - fi - - echo \"Rank \${RANK} starting on host \$(hostname)...\" - - python ${RUN_SCRIPT} \ - --config_file ${CONFIG_FILE} \ - --model_name llama31 \ - --model_size 405b \ - --compute_dtype fp8 \ - --fp8_recipe cs \ - --gpu b200 \ - --use_tokendrop True \ - -a dummy -p dummy \ - -ng ${TOTAL_GPUS} \ - train.manual_gc=true \ - train.manual_gc_interval=100 - " \ No newline at end of file