gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
To execute the job with the default settings, run the following command from
your client:

```bash
cd $RECIPE_ROOT
export WORKLOAD_NAME=$USER-a4x-llama3-1-405b-64node
helm install $WORKLOAD_NAME . -f values.yaml \
--set-file workload_launcher=launcher.sh \
--set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py \
--set workload.image=nvcr.io/nvidia/nemo:25.07 \
--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
--set volumes.gcsMounts[0].mountPath=/job-logs \
--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
--set queue=${KUEUE_NAME}
```

**Examples**

- To set the number of training steps to 100, run the following command from
your client:

```bash
cd $RECIPE_ROOT
export WORKLOAD_NAME=$USER-a4x-llama3-1-405b-64node
helm install $WORKLOAD_NAME . -f values.yaml \
--set-file workload_launcher=launcher.sh \
--set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py \
--set workload.image=nvcr.io/nvidia/nemo:25.07 \
--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
--set volumes.gcsMounts[0].mountPath=/job-logs \
--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
--set queue=${KUEUE_NAME} \
--set workload.arguments[0]="trainer.max_steps=100"
```
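Entries in `workload.arguments` are forwarded to the NeMo launch command as dotted-path overrides (they presumably surface in the launcher as `${config_overrides}`). As a rough illustration of how an override string such as `trainer.max_steps=100` updates a nested config, here is a minimal sketch; `apply_override` is a hypothetical helper written for this illustration, not part of the recipe or of nemo_run:

```python
# Sketch: apply one "a.b.c=value" override to a nested, attribute-style config.
from types import SimpleNamespace


def apply_override(config, override: str) -> None:
    """Apply a dotted-path override in place (illustration only)."""
    path, _, raw = override.partition("=")
    *parents, leaf = path.split(".")
    node = config
    for name in parents:
        node = getattr(node, name)  # walk down to the enclosing section
    # Naive type coercion: treat integer-looking values as ints.
    value = int(raw) if raw.lstrip("-").isdigit() else raw
    setattr(node, leaf, value)


# Mirror of the example above: cap training at 100 steps.
cfg = SimpleNamespace(trainer=SimpleNamespace(max_steps=20))
apply_override(cfg, "trainer.max_steps=100")
print(cfg.trainer.max_steps)  # -> 100
```

The real launcher hands these strings to nemo_run's CLI, which performs the equivalent (and far more complete) parsing and type handling.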

### Monitor the job

To check the status of pods in your job, run the following command:

```bash
kubectl get pods | grep $USER-a4x-llama3-1-405b-64node
```

Replace the following:

- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4x-llama3-1-405b-64node.

To get the logs for one of the pods, run the following command:

Information about the training job's progress, including crucial details such as
loss, step count, and step time, is generated by the rank 0 process.
This process runs on the pod whose name begins with
`JOB_NAME_PREFIX-workload-0-0`.
For example: `$USER-a4x-llama3-1-405b-64node-workload-0-0-s9zrv`.
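Rank 0 lands on the `-workload-0-0` pod because torchrun lays out global ranks contiguously per node. A small sketch of that mapping, assuming the contiguous layout and this recipe's 4 GPUs per A4X node:

```python
# Contiguous rank layout: node i, local rank j -> global rank
# i * nproc_per_node + j, so node 0 / local rank 0 is global rank 0.
def global_rank(node_index: int, local_rank: int, nproc_per_node: int = 4) -> int:
    return node_index * nproc_per_node + local_rank


print(global_rank(0, 0))   # -> 0: the rank that emits loss/step-time logs
print(global_rank(63, 3))  # -> 255: last rank in a 64-node x 4-GPU job
```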

### Uninstall the Helm release

You can delete the job and other resources created by the Helm chart. To
uninstall Helm, run the following command from your client:

```bash
helm uninstall $USER-a4x-llama3-1-405b-64node
```
else
echo " ${config_overrides}"
fi

if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then
  export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH"
  ldconfig $LD_LIBRARY_PATH
  echo "Added $LD_LIBRARY_PATH to ldconfig:"
  ldconfig -p | grep libcuda | sed 's/^/ /'
  echo ""
fi

if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then
explicit_log_dir=${EXPLICIT_LOG_DIR}
echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes"

pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger


# Export the nemo2 config to yaml.
python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \
trainer.num_nodes="$NNODES" \
log.explicit_log_dir="${explicit_log_dir}" \
trainer.max_steps=20 \
trainer.num_nodes=64 \
trainer.devices=4 \
${config_overrides} \
--to-yaml exported_nemo_config.yaml

# Create the nsys directory.
mkdir -p ${explicit_log_dir}/nsys

OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \
/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \
-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \
--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \
--wait all \
torchrun \
--nproc-per-node="4" \
--nnodes="${NNODES}" \
--node_rank="${JOB_COMPLETION_INDEX}" \
--rdzv_id="${JOB_IDENTIFIER}" \
--master_addr="${MASTER_ADDR}" \
--master_port="${MASTER_PORT}" \
${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \
trainer.num_nodes="$NNODES" \
log.explicit_log_dir="${explicit_log_dir}" \
trainer.max_steps=20 \
trainer.num_nodes=64 \
trainer.devices=4 \
${config_overrides}

if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
  mkdir -p ${ARTIFACT_DIR}
  cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/
  cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py
  cp dllogger.json ${ARTIFACT_DIR}/dllogger.json
  cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml
  env > ${ARTIFACT_DIR}/environ.txt
  ls ${ARTIFACT_DIR}
fi
"""Nemo2 pretraining recipe for Llama 3.1 405B model."""

from nemo.collections import llm
from nemo.collections.llm.recipes import llama31_405b
from nemo.lightning.pytorch.callbacks import NsysCallback
from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback
from nemo.utils.loggers.dllogger import DLLogger
import nemo_run as run
from scripts.performance.helpers import (
    set_primary_perf_configs,
)
from scripts.performance.utils import get_comm_overlap_callback_idx


def recipe(
    profile_enabled: bool = False,
    profile_start_step: int = 0,
    profile_end_step: int = 0,
    profile_ranks: str = "0",
) -> run.Partial:
    """Returns a Nemo2 training recipe for Llama 3.1 405B model.

    Args:
        profile_enabled: Whether to enable Nsys profiling.
        profile_start_step: The step to start profiling.
        profile_end_step: The step to end profiling.
        profile_ranks: The ranks to profile, comma separated.

    Returns:
        A Nemo2 training recipe.
    """
    # Start from the Nemo standard recipe.
    pretrain = llama31_405b.pretrain_recipe(performance_mode=True)

    num_nodes = 64
    num_gpus_per_node = 4
    mbs = 1
    gbs = 2048
    max_steps = 20
    tp_size = 2
    pp_size = 1
    cp_size = 1
    vp_size = 1  # Virtual Pipeline Parallelism
    ep_size = 1  # Expert Parallelism
    activation_offload_layers = 95
    enable_cuda_graphs = False
    compute_dtype = "fp8"
    fp8_recipe = "cs"
    nccl_communicator_config_path = None
    use_mcore_fsdp = True
    use_fsdp_double_buffer = False
    use_user_buffer_registration = False
    use_sharp = False
    keep_fsdp_fp8_transpose_cache = False

    pretrain = set_primary_perf_configs(
        pretrain,
        "pre_train",
        num_nodes=num_nodes,
        num_gpus_per_node=num_gpus_per_node,
        mbs=mbs,
        gbs=gbs,
        max_steps=max_steps,
        tp_size=tp_size,
        pp_size=pp_size,
        cp_size=cp_size,
        vp_size=vp_size,
        ep_size=ep_size,
        activation_offload_layers=activation_offload_layers,
        enable_cuda_graphs=enable_cuda_graphs,
        compute_dtype=compute_dtype,
        fp8_recipe=fp8_recipe,
        nccl_communicator_config_path=nccl_communicator_config_path,
        use_mcore_fsdp=use_mcore_fsdp,
        use_fsdp_double_buffer=use_fsdp_double_buffer,
        use_user_buffer_registration=use_user_buffer_registration,
        use_sharp=use_sharp,
        keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache,
    )

    # Sequence length (model and data).
    pretrain.model.config.seq_length = 8192
    pretrain.data.seq_length = 8192

    # Set the number of steps to 50 for a quicker benchmark.
    pretrain.trainer.max_steps = 50

    # Disable validation batches.
    pretrain.trainer.limit_val_batches = 0.0
    pretrain.trainer.val_check_interval = 30

    # Add the Nsys profiling callback if enabled.
    if profile_enabled:
        pretrain.trainer.callbacks.append(
            run.Config(
                NsysCallback,
                start_step=profile_start_step,
                end_step=profile_end_step,
                ranks=[int(x) for x in profile_ranks.split(",")],
                gen_shape=False,
            )
        )

    # Add the FLOPs measurement callback.
    pretrain.trainer.callbacks.append(
        run.Config(
            FLOPsMeasurementCallback,
            model_name="llama31-405b",
            model_config=pretrain.model.config,
            data_config=pretrain.data,
        )
    )

    comm_overlap_callback_idx = get_comm_overlap_callback_idx(
        pretrain.trainer.callbacks
    )
    pretrain.trainer.callbacks[
        comm_overlap_callback_idx
    ].tp_comm_bootstrap_backend = "nccl"

    # Disable checkpointing.
    pretrain.log.ckpt = None
    pretrain.trainer.enable_checkpointing = False

    # Log every step.
    pretrain.trainer.log_every_n_steps = 1

    # Enable DLLogger.
    dllogger_config = run.Config(
        DLLogger,
        verbose=True,
        stdout=True,
        json_file="dllogger.json",
    )
    pretrain.log.extra_loggers = [dllogger_config]

    return pretrain


if __name__ == "__main__":
    run.cli.main(llm.pretrain, default_factory=recipe)