From 4de97345fec0413412cb5916caf295c3fcac71cf Mon Sep 17 00:00:00 2001
From: Mutian Zhu <mutianzhu@google.com>
Date: Thu, 26 Feb 2026 20:09:56 +0000
Subject: [PATCH] Add Llama3.1-405b FP8 recipe for 64 nodes with GBS 2048
 generated by ubench

---
 .../64node-FP8CS-GBS2048/recipe/README.md     |  52 ++---
 .../64node-FP8CS-GBS2048/recipe/launcher.sh   |  41 +++-
 .../llama3-1-405b-fp8cs-gbs2048-gpu256.py     | 141 ++++++++++++
 .../llama3-1-405b-fp8cs-gbs2048-gpus256.py    | 207 ------------------
 .../recipe/recipe_launch_command.sh           |   1 +
 .../templates/workload-config-configmap.yaml  |   2 +
 .../recipe/templates/workload-job.yaml        |  29 ++-
 .../64node-FP8CS-GBS2048/recipe/values.yaml   |  63 +-----
 8 files changed, 226 insertions(+), 310 deletions(-)
 create mode 100644 training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py
 delete mode 100644 training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py
 create mode 100644 training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh

diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md
index 454a1ba6..b73cd914 100644
--- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md
+++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md
@@ -87,47 +87,49 @@ gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
 To execute the job with the default settings, run the following command from
 your client:
 
-    cd $RECIPE_ROOT
-    export WORKLOAD_NAME=$USER-a4x-llama3-1-405b
-    helm install $WORKLOAD_NAME . -f values.yaml \
-    --set-file workload_launcher=launcher.sh \
-    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus256.py \
-    --set workload.image=nvcr.io/nvidia/nemo:25.07 \
-    --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
-    --set volumes.gcsMounts[0].mountPath=/job-logs \
-    --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
-    --set queue=${KUEUE_NAME}
-
-**Examples**
-
--   To set the number of training steps to 100, run the following command from
-    your client:
-
 ```bash
 cd $RECIPE_ROOT
-export WORKLOAD_NAME=$USER-a4x-llama3-1-405b
+export WORKLOAD_NAME=$USER-a4x-llama3-1-405b-64node
 helm install $WORKLOAD_NAME . -f values.yaml \
 --set-file workload_launcher=launcher.sh \
---set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus256.py \
+--set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py \
 --set workload.image=nvcr.io/nvidia/nemo:25.07 \
 --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
 --set volumes.gcsMounts[0].mountPath=/job-logs \
 --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
---set queue=${KUEUE_NAME} \
---set workload.arguments[0]="max_steps=100"
+--set queue=${KUEUE_NAME}
 ```
 
+**Examples**
+
+-   To set the number of training steps to 100, run the following command from
+    your client:
+
+    ```bash
+    cd $RECIPE_ROOT
+    export WORKLOAD_NAME=$USER-a4x-llama3-1-405b-64node
+    helm install $WORKLOAD_NAME . -f values.yaml \
+    --set-file workload_launcher=launcher.sh \
+    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py \
+    --set workload.image=nvcr.io/nvidia/nemo:25.07 \
+    --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
+    --set volumes.gcsMounts[0].mountPath=/job-logs \
+    --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
+    --set queue=${KUEUE_NAME} \
+    --set workload.arguments[0]="trainer.max_steps=100"
+    ```
+
 ### Monitor the job
 
 To check the status of pods in your job, run the following command:
 
 ```
-kubectl get pods | grep $USER-a4x-llama3-1-405b
+kubectl get pods | grep $USER-a4x-llama3-1-405b-64node
 ```
 
 Replace the following:
 
-- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4x-llama3-1-405b.
+- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4x-llama3-1-405b-64node.
 
 To get the logs for one of the pods, run the following command:
 
@@ -139,7 +141,7 @@ Information about the training job's progress, including crucial details such as
 loss, step count, and step time, is generated by the rank 0 process.
 This process runs on the pod whose name begins with
 `JOB_NAME_PREFIX-workload-0-0`.
-For example: `$USER-a4x-llama3-1-405b-workload-0-0-s9zrv`.
+For example: `$USER-a4x-llama3-1-405b-64node-workload-0-0-s9zrv`.
 
 ### Uninstall the Helm release
 
@@ -147,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. To
 uninstall Helm, run the following command from your client:
 
 ```bash
-helm uninstall $USER-a4x-llama3-1-405b
+helm uninstall $USER-a4x-llama3-1-405b-64node
 ```
diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh
index c77d3142..66200540 100644
--- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh
+++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh
@@ -32,11 +32,13 @@ else
   echo "  ${config_overrides}"
 fi
 
-export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH"
-ldconfig $LD_LIBRARY_PATH
-echo "Added $LD_LIBRARY_PATH to ldconfig:"
-ldconfig -p | grep libcuda | sed 's/^/  /'
-echo ""
+if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then
+  export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH"
+  ldconfig $LD_LIBRARY_PATH
+  echo "Added $LD_LIBRARY_PATH to ldconfig:"
+  ldconfig -p | grep libcuda | sed 's/^/  /'
+  echo ""
+fi
 
 if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then
   explicit_log_dir=${EXPLICIT_LOG_DIR}
@@ -56,23 +58,44 @@ echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of
 
 pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger
 
+
+# Export the nemo2 config to yaml; num_nodes follows $NNODES to match torchrun.
+python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \
+trainer.num_nodes="$NNODES" \
+log.explicit_log_dir="${explicit_log_dir}" \
+trainer.max_steps=20 \
+trainer.devices=4 \
+${config_overrides} \
+--to-yaml exported_nemo_config.yaml
+
 # Create the nsys directory.
 mkdir -p ${explicit_log_dir}/nsys
 
-torchrun --no-python \
---nproc-per-node="${GPUS_PER_NODE}" \
+OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \
+/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \
+-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \
+--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \
+--wait all \
+torchrun \
+--nproc-per-node="4" \
 --nnodes="${NNODES}" \
 --node_rank="${JOB_COMPLETION_INDEX}" \
 --rdzv_id="${JOB_IDENTIFIER}" \
 --master_addr="${MASTER_ADDR}" \
 --master_port="${MASTER_PORT}" \
-bash -c "numactl --cpunodebind=\$((LOCAL_RANK/2)) --membind=\$((LOCAL_RANK/2)) python ${NEMO_LAUNCH_SCRIPT} ${config_overrides}"
+${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \
+trainer.num_nodes="$NNODES" \
+log.explicit_log_dir="${explicit_log_dir}" \
+trainer.max_steps=20 \
+trainer.devices=4 \
+${config_overrides}
 
 if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
   mkdir -p ${ARTIFACT_DIR}
   cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/
   cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py
   cp dllogger.json ${ARTIFACT_DIR}/dllogger.json
+  cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml
   env > ${ARTIFACT_DIR}/environ.txt
   ls ${ARTIFACT_DIR}
 fi
diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py
new file mode 100644
index 00000000..4a78d757
--- /dev/null
+++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py
@@ -0,0 +1,141 @@
+"""Nemo2 pretraining recipe for Llama 3.1 405B model."""
+
+from nemo.collections import llm
+from nemo.collections.llm.recipes import llama31_405b
+from nemo.lightning.pytorch.callbacks import NsysCallback
+from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback
+from nemo.utils.loggers.dllogger import DLLogger
+import nemo_run as run
+from scripts.performance.helpers import (
+    set_primary_perf_configs,
+)
+from scripts.performance.utils import get_comm_overlap_callback_idx
+
+
+def recipe(
+    profile_enabled: bool = False,
+    profile_start_step: int = 0,
+    profile_end_step: int = 0,
+    profile_ranks: str = "0",
+) -> run.Partial:
+  """Returns a Nemo2 training recipe for Llama 3.1 405B model.
+
+  Args:
+      profile_enabled: Whether to enable Nsys profiling.
+      profile_start_step: The step to start profiling.
+      profile_end_step: The step to end profiling.
+      profile_ranks: The ranks to profile, comma separated.
+
+  Returns:
+      A Nemo2 training recipe.
+  """
+  # Start from the Nemo standard recipe.
+  pretrain = llama31_405b.pretrain_recipe(performance_mode=True)
+
+  num_nodes = 64
+  num_gpus_per_node = 4
+  mbs = 1
+  gbs = 2048
+  max_steps = 20
+  tp_size = 2
+  pp_size = 1
+  cp_size = 1
+  vp_size = 1  # Virtual Pipeline Parallelism
+  ep_size = 1  # Expert Parallelism
+  activation_offload_layers = 95
+  enable_cuda_graphs = False
+  compute_dtype = "fp8"
+  fp8_recipe = "cs"
+  nccl_communicator_config_path = None
+  use_mcore_fsdp = True
+  use_fsdp_double_buffer = False
+  use_user_buffer_registration = False
+  use_sharp = False
+  keep_fsdp_fp8_transpose_cache = False
+
+  pretrain = set_primary_perf_configs(
+      pretrain,
+      "pre_train",
+      num_nodes=num_nodes,
+      num_gpus_per_node=num_gpus_per_node,
+      mbs=mbs,
+      gbs=gbs,
+      max_steps=max_steps,
+      tp_size=tp_size,
+      pp_size=pp_size,
+      cp_size=cp_size,
+      vp_size=vp_size,
+      ep_size=ep_size,
+      activation_offload_layers=activation_offload_layers,
+      enable_cuda_graphs=enable_cuda_graphs,
+      compute_dtype=compute_dtype,
+      fp8_recipe=fp8_recipe,
+      nccl_communicator_config_path=nccl_communicator_config_path,
+      use_mcore_fsdp=use_mcore_fsdp,
+      use_fsdp_double_buffer=use_fsdp_double_buffer,
+      use_user_buffer_registration=use_user_buffer_registration,
+      use_sharp=use_sharp,
+      keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache,
+  )
+
+  # Sequence Length (model and data)
+  pretrain.model.config.seq_length = 8192
+  pretrain.data.seq_length = 8192
+
+  # Keep the trainer on the same step count that was given to the perf helper.
+  pretrain.trainer.max_steps = max_steps
+
+  # Disable validation batches.
+  pretrain.trainer.limit_val_batches = 0.0
+  pretrain.trainer.val_check_interval = 30
+
+  # Add the Nsys profiling callback if enabled.
+  if profile_enabled:
+    pretrain.trainer.callbacks.append(
+        run.Config(
+            NsysCallback,
+            start_step=profile_start_step,
+            end_step=profile_end_step,
+            ranks=[int(x) for x in profile_ranks.split(",")],
+            gen_shape=False,
+        )
+    )
+
+  # Add the FLOPs measurement callback.
+  pretrain.trainer.callbacks.append(
+      run.Config(
+          FLOPsMeasurementCallback,
+          model_name="llama31-405b",
+          model_config=pretrain.model.config,
+          data_config=pretrain.data,
+      )
+  )
+
+  overlap_idx = get_comm_overlap_callback_idx(pretrain.trainer.callbacks)
+  assert (
+      overlap_idx is not None
+  ), "MegatronCommOverlapCallback missing. Required for performance."
+  # Bootstrap the tensor-parallel comm overlap over NCCL.
+  pretrain.trainer.callbacks[overlap_idx].tp_comm_bootstrap_backend = "nccl"
+
+  # Disable checkpointing.
+  pretrain.log.ckpt = None
+  pretrain.trainer.enable_checkpointing = False
+
+  # Log every step.
+  pretrain.trainer.log_every_n_steps = 1
+
+  # Enable DLLogger
+  dllogger_config = run.Config(
+      DLLogger,
+      verbose=True,
+      stdout=True,
+      json_file="dllogger.json",
+  )
+  pretrain.log.extra_loggers = [dllogger_config]
+
+  return pretrain
+
+
+if __name__ == "__main__":
+  run.cli.main(llm.pretrain, default_factory=recipe)
diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py
deleted file mode 100644
index 2af09422..00000000
--- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py
+++ /dev/null
@@ -1,207 +0,0 @@
-"""Nemo2 pretraining recipe for Llama 3.1 405B model."""
-
-# hack for relative imports
-
-from os.path import basename, splitext
-import random
-import fiddle as fdl
-import fiddle._src.experimental.dataclasses as fdl_dc
-from nemo.collections.llm.recipes import llama31_405b
-from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192
-from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
-from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
-from nemo.utils.loggers.dllogger import DLLogger
-import nemo_run as run
-from scripts.performance.argument_parser import parse_cli_args
-from scripts.performance.helpers import args_sanity_check
-from scripts.performance.helpers import get_user_configs
-from scripts.performance.helpers import set_exp_logging_configs
-from scripts.performance.helpers import set_primary_perf_configs
-from scripts.performance.utils import get_comm_overlap_callback_idx, hf_tokenizer
-
-
-def recipe(
-    args,
-    num_nodes,
-    mbs,
-    gbs,
-    tp_size,
-    pp_size,
-    cp_size,
-    vp_size,
-    ep_size,
-    enable_cuda_graphs,
-    use_mcore_fsdp,
-    recompute_layers,
-    activation_offload_layers,
-) -> run.Partial:
-  """Returns a Nemo2 training recipe for Llama 3.1 405B model."""
-  # Start from the Nemo standard recipe.
-  pretrain = llama31_405b.pretrain_recipe(performance_mode=True)
-
-  pretrain = set_primary_perf_configs(
-      pretrain,
-      "pre_train",
-      num_nodes=num_nodes,
-      num_gpus_per_node=4,
-      mbs=mbs,
-      gbs=gbs,
-      max_steps=args.max_steps,
-      tp_size=tp_size,
-      pp_size=pp_size,
-      cp_size=cp_size,
-      vp_size=vp_size,
-      ep_size=ep_size,
-      enable_cuda_graphs=enable_cuda_graphs,
-      activation_offload_layers=activation_offload_layers,
-      compute_dtype=args.compute_dtype,
-      fp8_recipe=args.fp8_recipe,
-      nccl_communicator_config_path=args.nccl_communicator_config_path,
-      use_mcore_fsdp=use_mcore_fsdp,
-      recompute_layers=recompute_layers,
-      use_fsdp_double_buffer=args.use_fsdp_double_buffer,
-      use_user_buffer_registration=args.use_user_buffer_registration,
-      use_sharp=args.use_sharp,
-      keep_fsdp_fp8_transpose_cache=args.keep_fsdp_fp8_transpose_cache,
-  )
-  comm_overlap_callback_idx = get_comm_overlap_callback_idx(
-      pretrain.trainer.callbacks
-  )
-  pretrain.trainer.callbacks[
-      comm_overlap_callback_idx
-  ].tp_comm_bootstrap_backend = "nccl"
-
-  pretrain = set_exp_logging_configs(
-      pretrain,
-      "pre_train",
-      "llm",
-      "llama3",
-      args.tensorboard,
-      args.wandb,
-      args.wandb_prj_name,
-      args.wandb_job_name,
-  )
-
-  if args.use_hf_tokenizer:
-    pretrain.data.tokenizer = hf_tokenizer("meta-llama/Llama-3.1-405B")
-  else:
-    pretrain.data.tokenizer = run.Config(
-        get_nmt_tokenizer,
-        library="null",
-        model_name="NullTokenizer",
-        vocab_size=128256,
-    )
-  pretrain.model.tokenizer = pretrain.data.tokenizer
-
-  comm_overlap_callback_idx = get_comm_overlap_callback_idx(
-      pretrain.trainer.callbacks
-  )
-  assert (
-      comm_overlap_callback_idx is not None
-  ), "MegatronCommOverlapCallback missing. Required for performance."
-
-  tp_comm_overlap_cfg = fdl.cast(
-      run.Config,
-      fdl_dc.convert_dataclasses_to_configs(
-          userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192
-      ),
-  )
-  pretrain.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap_cfg = (
-      tp_comm_overlap_cfg
-  )
-
-  if use_mcore_fsdp:
-    pretrain.trainer.strategy.num_distributed_optimizer_instances = (
-        num_nodes * 4
-    ) // 64
-
-  # Enable DLLogger
-  dllogger_config = run.Config(
-      DLLogger,
-      verbose=True,
-      stdout=True,
-      json_file="dllogger.json",
-  )
-  pretrain.log.extra_loggers = [dllogger_config]
-
-  return pretrain
-
-
-if __name__ == "__main__":
-  args = parse_cli_args().parse_args()
-  args_sanity_check(args)
-
-  kwargs = get_user_configs(
-      args.gpu.lower(), "pre_train", "llama31", "405b", args
-  )
-  (
-      num_nodes,
-      mbs,
-      gbs,
-      tp_size,
-      pp_size,
-      cp_size,
-      vp_size,
-      ep_size,
-      _,
-      enable_cuda_graphs,
-      use_mcore_fsdp,
-      recompute_layers,
-      activation_offload_layers,
-  ) = kwargs[:13]
-
-  recipe = recipe(
-      args,
-      num_nodes,
-      mbs,
-      gbs,
-      tp_size,
-      pp_size,
-      cp_size,
-      vp_size,
-      ep_size,
-      enable_cuda_graphs,
-      use_mcore_fsdp,
-      recompute_layers,
-      activation_offload_layers,
-  )
-
-  exp_config = (
-      f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}_{mbs}mbs_{gbs}gbs-{random.randint(0, 100000)}"
-  )
-  exp_name = (
-      f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"
-  )
-
-  if use_mcore_fsdp:
-    # Needed to enable CuDNN LN for FSDP overlap
-    env_vars = {"NVTE_NORM_FWD_USE_CUDNN": "1", "NVTE_NORM_BWD_USE_CUDNN": "1"}
-  else:
-    env_vars = {}
-
-  executor = run.LocalExecutor()
-
-  plugins = [
-      PerfEnvPlugin(
-          enable_vboost=False,
-          nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
-          gpu_sm100_or_newer=True,
-      )
-  ]
-  if args.enable_nsys:
-    plugins.append(
-        NsysPlugin(start_step=10, end_step=13, ranks=list(range(0, 1)))
-    )
-  if args.enable_memory_profile:
-    assert args.memory_profile_out_path is not None
-    plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path))
-
-  with run.Experiment(exp_name) as exp:
-    exp.add(
-        recipe,
-        executor=executor,
-        name=exp_name,
-        plugins=plugins,
-    )
-
-    exp.run(sequential=True, direct=True, detach=False)
diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh
new file mode 100644
index 00000000..4958878d
--- /dev/null
+++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh
@@ -0,0 +1 @@
+helm install mutianzhu-ubench-4tcg . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set 'volumes.gcsMounts[0].bucketName=ubench-logs' --set 'volumes.gcsMounts[0].mountPath=/job-logs' --set 'workload.envs[0].value=/job-logs/mutianzhu-ubench-4tcg' --set queue=tas-lq
diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml
index f34b5080..a1d54cee 100644
--- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml
+++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+{{- if .Values.workload.configFile }}
 apiVersion: v1
 kind: ConfigMap
 metadata:
@@ -24,3 +25,4 @@ data:
 {{- else }}
 {{ "config: null" | nindent 4 }}
 {{- end }}
+{{- end }}
diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml
index 2362c6e0..e2b6d544 100644
--- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml
+++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml
@@ -96,44 +96,34 @@ spec:
               {{- end }}
               {{- end }}
           spec:
-            nodeSelector:
-            {{- toYaml .Values.workload.nodeSelector | nindent 14 }}
             {{- if $root.Values.network.hostNetwork }}
             hostNetwork: true
             dnsPolicy: ClusterFirstWithHostNet
             {{- end }}
             subdomain: "{{.Release.Name}}"
             restartPolicy: Never
-            {{- if or $root.Values.targetNodes $root.Values.avoidNodes $root.Values.targetNodepools }}
+            {{- if or $root.Values.targetNodes $root.Values.avoidNodes }}
             affinity:
               nodeAffinity:
                 requiredDuringSchedulingIgnoredDuringExecution:
                   nodeSelectorTerms:
                   - matchExpressions:
                     {{- if $root.Values.targetNodes }}
                     - key: kubernetes.io/hostname
                       operator: "In"
                       values:
                       {{- range $hostname := $root.Values.targetNodes }}
                       - {{ $hostname }}
                       {{- end }}
                     {{- end }}
                     {{- if $root.Values.avoidNodes }}
                     - key: kubernetes.io/hostname
                       operator: "NotIn"
                       values:
                       {{- range $hostname := $root.Values.avoidNodes }}
                       - {{ $hostname }}
                       {{- end }}
                     {{- end }}
-                    {{- if $root.Values.targetNodepools }}
-                    - key: cloud.google.com/gke-nodepool
-                      operator: "In"
-                      values:
-                      {{- range $nodepool := $root.Values.targetNodepools }}
-                      - {{ $nodepool }}
-                      {{- end }}
-                    {{- end }}
             {{- end }}
             tolerations:
             - operator: "Exists"
@@ -151,12 +141,14 @@ spec:
               emptyDir: {}
             {{ end }}
 
+            {{- if $root.Values.workload.configFile }}
             - name: workload-configuration
               configMap:
                 name: "{{.Release.Name}}-config"
                 items:
                 - key: workload-configuration
                   path: {{ $root.Values.workload.configFile | default "workload-configuration" }}
+            {{- end }}
 
             - name: workload-launcher
               configMap:
@@ -322,8 +314,10 @@ spec:
                   mountPath: /usr/local/gib
                 {{ end }}
 
+                {{- if $root.Values.workload.configFile }}
                 - name: workload-configuration
                   mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }}
+                {{- end }}
 
                 - name: workload-launcher
                   mountPath: /workload/launcher
diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml
index 41839c24..8a6b110b 100644
--- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml
+++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml
@@ -5,78 +5,31 @@ network:
   hostNetwork: true
   ncclSettings:
   - name: NCCL_DEBUG
-    value: VERSION
-  - name: NCCL_ALGO
-    value: "Ring,Tree"
-  - name: NCCL_NET_GDR_LEVEL
-    value: PIX
-  - name: NCCL_NET_GDR_C2C
-    value: "1"
-  - name: NCCL_P2P_NET_CHUNKSIZE
-    value: "2097152"
-  - name: NCCL_NVLS_ENABLE
-    value: "0"
+    value: WARN
   subnetworks[]: null
-queue: tas-lq
-# targetNodepools: null
-targetNodepools: null
-  # - a4x-highgpu-4g-a4x-pool-0
-  # - a4x-highgpu-4g-a4x-pool-1
-
-
-# tasSettings:
-#   topologyRequest:
-#     kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname
-
+queue: null
 tasSettings:
   topologyRequest:
-    kueue.x-k8s.io/podset-required-topology: "cloud.google.com/gce-topology-block"
-    kueue.x-k8s.io/podset-slice-required-topology: "cloud.google.com/gce-topology-subblock"
-    kueue.x-k8s.io/podset-slice-size: "16"
-
+    kueue.x-k8s.io/podset-required-topology: cloud.google.com/gce-topology-block
+    kueue.x-k8s.io/podset-slice-required-topology: cloud.google.com/gce-topology-subblock
+    kueue.x-k8s.io/podset-slice-size: '16'
 volumes:
   gcsMounts:
   - bucketName: null
     mountPath: null
   gcsVolumes: true
   psVolumes: false
-  ssdMountPath: "/ssd"
 workload:
-  nodeSelector:
-    cloud.google.com/gke-accelerator: nvidia-gb200
   arguments[]: null
-  configFile: llama3-1-405b-fp8cs-gbs2048-gpus256.py
+  configFile: llama3-1-405b-fp8cs-gbs2048-gpu256.py
   configPath: /workload/configs/
-  defaultArguments:
-  - --account=none
-  - --partition=none
-  - --gpu=gb200
-  - --num_gpus=256
-  - --compute_dtype=fp8
-  - --fp8_recipe=cs
-  - --global_batch_size=2048
-  - --max_steps=30
-  - --micro_batch_size=1
-  - --tensor_parallel_size=2
-  - --context_parallel_size=1
-  - --expert_parallel_size=1
-  - --expert_tensor_parallel_size=1
-  - --pipeline_parallel_size=1
-  - --virtual_pipeline_parallel_size=1
-  - --use_mcore_fsdp=1
-  - --cuda_graphs=0
-  - --activation_offload_layers=95
-  - --log_dir=/job-logs/nemo-logs
+  defaultArguments[]: null
   envs:
   - name: ARTIFACT_DIR
     value: null
-  - name: PL_TORCH_DISTRIBUTED_BACKEND
-    value: "nccl"
   - name: GLOO_SOCKET_IFNAME
     value: eth0
-  - name: TORCH_NCCL_HIGH_PRIORITY
-    value: "1"
   - name: NEMO_LAUNCH_SCRIPT
-    value: /workload/configs/llama3-1-405b-fp8cs-gbs2048-gpus256.py
+    value: /workload/configs/llama3-1-405b-fp8cs-gbs2048-gpu256.py
   gpus: 256
   image: nvcr.io/nvidia/nemo:25.07