diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/Chart.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md new file mode 100644 index 00000000..aa487339 --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md @@ -0,0 +1,151 @@ + +# Pretrain deepseek_v3-bf16-gbs2048-gpus256 workloads on a4 GKE Node pools with Megatron-Bridge + +This recipe outlines the steps for running a deepseek_v3 pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA Megatron-Bridge framework](https://github.com/NVIDIA-NeMo/Megatron-Bridge). 
+
+## Orchestration and deployment tools
+
+For this recipe, the following setup is used:
+
+- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine)
+- Pretraining job configuration and deployment - A Helm chart is used to configure and deploy the Kubernetes JobSet resource, which manages the execution of the [Megatron-Bridge pretraining workload](https://github.com/NVIDIA-NeMo/Megatron-Bridge).
+
+## Test environment
+
+This recipe has been optimized for and tested with the following configuration:
+
+- GKE cluster: Follow the Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) to create your a4 GKE cluster.
+- Node configuration: 32 nodes (8 GPUs per node, 256 GPUs total).
+- GPU architecture: NVIDIA Blackwell (B200).
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the [Megatron-Bridge dataset utilities](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/scripts/performance/utils/datasets.py).
+
+## Docker container image
+
+This recipe uses the following Docker images:
+
+- `nvcr.io/nvidia/nemo:25.11.01`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+```bash
+export PROJECT_ID=<PROJECT_ID>
+export CLUSTER_REGION=<CLUSTER_REGION>
+export CLUSTER_NAME=<CLUSTER_NAME>
+export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+export KUEUE_NAME=<KUEUE_NAME>
+```
+
+Replace the following values:
+
+- `<PROJECT_ID>`: your Google Cloud project ID.
+- `<CLUSTER_REGION>`: the region where your cluster is located.
+- `<CLUSTER_NAME>`: the name of your GKE cluster.
+- `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+- `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the Cluster Toolkit is `a4`.
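+Before running any `gcloud` or `helm` commands, it can help to verify that every
+required variable is set. The snippet below is an optional pre-flight sketch;
+the `check_required_vars` helper is ours and not part of the recipe:
+
+```bash
+check_required_vars() {
+  local missing=0 var
+  for var in PROJECT_ID CLUSTER_REGION CLUSTER_NAME GCS_BUCKET KUEUE_NAME; do
+    # ${!var} expands the variable whose name is stored in var.
+    if [[ -z "${!var:-}" ]]; then
+      echo "ERROR: ${var} is not set" >&2
+      missing=1
+    fi
+  done
+  # Catch the common mistake of including the gs:// prefix in GCS_BUCKET.
+  if [[ "${GCS_BUCKET:-}" == gs://* ]]; then
+    echo "ERROR: GCS_BUCKET must not include the gs:// prefix" >&2
+    missing=1
+  fi
+  return "${missing}"
+}
+
+check_required_vars || echo "Fix the variables above before continuing."
+```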
+
+Set the default project:
+
+```bash
+gcloud config set project $PROJECT_ID
+```
+
+### Get cluster credentials
+
+```bash
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+
+```bash
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=`git rev-parse --show-toplevel`
+export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe
+cd $RECIPE_ROOT
+```
+
+### Configure and submit a pretraining job
+
+#### Using 32 nodes (256 GPUs) with BF16 precision
+
+To execute the job with the default settings, run the following command from your client:
+
+```bash
+cd $RECIPE_ROOT
+export WORKLOAD_NAME=$USER-deepseek-v3-32node-bf16-seq4096-gbs2048
+helm install $WORKLOAD_NAME . -f values.yaml \
+--set-file workload_launcher=launcher.sh \
+--set-file workload_config=custom_setup_experiment.py \
+--set workload.image=nvcr.io/nvidia/nemo:25.11.01 \
+--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
+--set volumes.gcsMounts[0].mountPath=/job-logs \
+--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
+--set queue=${KUEUE_NAME}
+```
+
+**Examples**
+
+- To set the number of training steps to 100, run the following command from
+  your client:
+
+  ```bash
+  cd $RECIPE_ROOT
+  export WORKLOAD_NAME=$USER-deepseek-v3-32node-bf16-seq4096-gbs2048
+  helm install $WORKLOAD_NAME . -f values.yaml \
+  --set-file workload_launcher=launcher.sh \
+  --set-file workload_config=custom_setup_experiment.py \
+  --set workload.image=nvcr.io/nvidia/nemo:25.11.01 \
+  --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
+  --set volumes.gcsMounts[0].mountPath=/job-logs \
+  --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
+  --set queue=${KUEUE_NAME} \
+  --set workload.arguments[0]="trainer.max_steps=100"
+  ```
+
+### Monitor the job
+
+To check the status of pods in your job, run the following command:
+
+```bash
+kubectl get pods | grep JOB_NAME_PREFIX
+```
+
+Replace the following:
+
+- `JOB_NAME_PREFIX`: your job name prefix. For example, `$USER-deepseek-v3-32node-bf16-seq4096-gbs2048`.
+
+To get the logs for one of the pods, run the following command:
+
+```bash
+kubectl logs POD_NAME
+```
+
+Replace `POD_NAME` with the name of a pod from the previous step.
+
+Information about the training job's progress, including crucial details such as
+loss, step count, and step time, is generated by the rank 0 process.
+This process runs on the pod whose name begins with
+`JOB_NAME_PREFIX-workload-0-0`.
+For example: `$USER-deepseek-v3-32node-bf16-seq4096-gbs2048-workload-0-0-s9zrv`.
+
+### Uninstall the Helm release
+
+You can delete the job and other resources created by the Helm chart. To
+uninstall the Helm release, run the following command from your client:
+
+```bash
+helm uninstall $USER-deepseek-v3-32node-bf16-seq4096-gbs2048
+```
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/custom_setup_experiment.py b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/custom_setup_experiment.py
new file mode 100644
index 00000000..32173cbc
--- /dev/null
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/custom_setup_experiment.py
@@ -0,0 +1,233 @@
+import glob
+import logging
+import os
+from pathlib import Path
+import sys
+import time
+from typing import Any, Dict, List, Optional
+
+import nemo_run as run
+from nemo_run.config import get_nemorun_home
+
+
+try:
+    from argument_parser import parse_cli_args
+    from utils.evaluate import calc_convergence_and_performance
+    from utils.executors import dgxc_executor, slurm_executor
+except (ImportError, ModuleNotFoundError):
+    from .argument_parser import parse_cli_args
+    from .utils.evaluate import calc_convergence_and_performance
+    from .utils.executors import dgxc_executor, slurm_executor
+
+try:
+    import wandb
+
+    HAVE_WANDB = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_WANDB = False
+
+try:
+    from perf_plugins import NsysPlugin, PerfEnvPlugin
+    from resiliency_plugins import FaultTolerancePlugin
+except (ImportError, ModuleNotFoundError):
+    from .perf_plugins import NsysPlugin, PerfEnvPlugin
+    from .resiliency_plugins import FaultTolerancePlugin
+
+
+SCRIPT_DIR = Path(__file__).parent.resolve()
+ENTRYPOINT_PEFORMANCE = "run_script.py"
+ENTRYPOINT_RECIPE = "run_recipe.py"
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+def main(
+    use_recipes: bool,
+    model_family_name: str,
+    model_recipe_name: str,
+    task: str,
+    compute_dtype: str,
+    gpu: str,
+    hf_token: str,
+    detach: bool,
+    dryrun: bool,
+    enable_vboost: bool,
+    enable_nsys: bool,
+    moe_a2a_overlap: bool,
+    tp_size: Optional[int],
+    pp_size: Optional[int],
+    cp_size: Optional[int],
+    wandb_key: str,
+    wandb_project_name: str,
+    wandb_experiment_name: str,
+    wandb_entity_name: str,
+    profiling_start_step: int,
+    profiling_stop_step: int,
+    profiling_gpu_metrics: bool,
+    profiling_ranks: Optional[List[int]],
+    nemo_home: str,
+    account: str,
+    partition: str,
+    log_dir: str,
+    gpus_per_node: int,
+    time_limit: str,
+    container_image: str,
+    custom_mounts: List[str],
+    custom_env_vars: List[str],
+    custom_srun_args: List[str],
+    pretrained_checkpoint: Optional[str],
+    num_gpus: int,
+    is_long_convergence_run: bool,
+    additional_slurm_params: Optional[Dict[str, Any]],
+    golden_values_path: str,
+    convergence_params: Dict[str, Any],
+    performance_params: Dict[str, Any],
+    max_retries: int,
+    dgxc_base_url: str,
+    dgxc_cluster: str,
+    dgxc_kube_apiserver_url: str,
+    dgxc_app_id: str,
+    dgxc_app_secret: str,
+    dgxc_project_name: str,
+    dgxc_pvc_claim_name: str,
+    dgxc_pvc_mount_path: str,
+):
+    logger.info("Setting up the Megatron-Bridge experiment")
+
+    rank = os.environ["RANK"]
+
+    exp_name = f"{model_recipe_name}_{model_family_name}"
+    exp_name += f"_worker{rank}"
+    if use_recipes:
+        script_name = ENTRYPOINT_RECIPE
+    else:
+        script_name = ENTRYPOINT_PEFORMANCE
+
+    run_script_path = SCRIPT_DIR / script_name
+    logger.info(f"Run script path: {run_script_path}")
+    if not run_script_path.is_file():
+        logger.error(f"Specified run script not found: {run_script_path}")
+        sys.exit(1)
+
+    nemorun_script = run.Script(
+        path=str(run_script_path),
+        entrypoint="python",
+        env={"PYTHONPATH": f"{SCRIPT_DIR}:$PYTHONPATH"},
+        args=list(sys.argv[1:]),
+    )
+
+    plugins = []
+
+    if not use_recipes:
+        plugins.append(
+            PerfEnvPlugin(
+                enable_vboost=enable_vboost,
+                moe_a2a_overlap=moe_a2a_overlap,
+                tp_size=tp_size,
+                pp_size=pp_size,
+                cp_size=cp_size,
+                model_family_name=model_family_name,
+                model_recipe_name=model_recipe_name,
+                gpu=gpu,
+                compute_dtype=compute_dtype,
+                train_task=task,
+            )
+        )
+
+    if enable_nsys:
+        plugins.append(
+            NsysPlugin(
+                profile_step_start=profiling_start_step,
+                profile_step_end=profiling_stop_step,
+                nsys_gpu_metrics=profiling_gpu_metrics,
+                profile_ranks=profiling_ranks,
+            )
+        )
+
+    executor = run.LocalExecutor()
+    # Honor the dryrun/detach values passed in by the caller instead of
+    # hard-coding them to False.
+    run.run(
+        nemorun_script,
+        executor=executor,
+        plugins=plugins,
+        dryrun=dryrun,
+        detach=detach,
+        name=exp_name,
+    )
+
+
+if __name__ == "__main__":
+    parser = parse_cli_args()
+    args, unknown_args = parser.parse_known_args()
+
+    # Unknown arguments are not parsed here, but they are still forwarded to
+    # the run script via sys.argv, so warn to make typos visible.
+    if unknown_args:
+        logger.warning(f"Unrecognized arguments (forwarded unparsed): {' '.join(unknown_args)}")
+
+    main(
+        use_recipes=args.use_recipes,
+        model_family_name=args.model_family_name,
+        model_recipe_name=args.model_recipe_name,
+        task=args.task,
+        compute_dtype=args.compute_dtype,
+        gpu=args.gpu,
+        hf_token=args.hf_token,
+        detach=args.detach,
+        dryrun=args.dryrun,
+        enable_vboost=args.enable_vboost,
+        enable_nsys=args.enable_nsys,
+        moe_a2a_overlap=args.moe_a2a_overlap,
+        tp_size=args.tensor_model_parallel_size,
+        pp_size=args.pipeline_model_parallel_size,
+        cp_size=args.context_parallel_size,
+        wandb_key=args.wandb_key,
+        wandb_project_name=args.wandb_project_name,
+        wandb_experiment_name=args.wandb_experiment_name,
+        wandb_entity_name=args.wandb_entity_name,
+        profiling_start_step=args.profiling_start_step,
+        profiling_stop_step=args.profiling_stop_step,
+        profiling_gpu_metrics=args.profiling_gpu_metrics,
+        profiling_ranks=args.profiling_ranks,
+        nemo_home=args.nemo_home,
+        account=args.account,
+        partition=args.partition,
+        log_dir=args.log_dir,
+        gpus_per_node=args.gpus_per_node,
+        time_limit=args.time_limit,
+        container_image=args.container_image,
+        custom_mounts=args.custom_mounts,
+        custom_env_vars=args.custom_env_vars,
+        custom_srun_args=args.custom_srun_args,
+        pretrained_checkpoint=args.pretrained_checkpoint,
+        num_gpus=args.num_gpus,
+        is_long_convergence_run=args.is_long_convergence_run,
+        additional_slurm_params=args.additional_slurm_params,
+        golden_values_path=args.golden_values_path,
+        convergence_params={
+            "correlation_threshold": args.correlation_threshold,
+            "high_loss_tolerance": args.high_loss_tolerance,
+            "medium_loss_tolerance": args.medium_loss_tolerance,
+            "low_loss_tolerance": args.low_loss_tolerance,
+            "final_loss_tolerance": args.final_loss_tolerance,
+            "max_outlier_ratio": args.max_outlier_ratio,
+            "outlier_threshold": args.outlier_threshold,
+            "skip_first_percent_loss": args.skip_first_percent_loss,
+        },
+        performance_params={
+            "timing_threshold": args.timing_threshold,
+            "skip_first_percent_time": args.skip_first_percent_time,
+        },
+        max_retries=args.max_retries,
+        dgxc_base_url=args.dgxc_base_url,
+        dgxc_cluster=args.dgxc_cluster,
+        dgxc_kube_apiserver_url=args.dgxc_kube_apiserver_url,
+        dgxc_app_id=args.dgxc_app_id,
+        dgxc_app_secret=args.dgxc_app_secret,
+        dgxc_project_name=args.dgxc_project_name,
+        dgxc_pvc_claim_name=args.dgxc_pvc_claim_name,
+        dgxc_pvc_mount_path=args.dgxc_pvc_mount_path,
+    )
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/launcher.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/launcher.sh
new file mode 100644
index 00000000..cabe3575
--- /dev/null
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/launcher.sh
@@ -0,0 +1,150 @@
+usage()
+{
+cat << EOF
+usage: bash ./launcher.sh [config-override [config-override ...]]
+config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000.
+EOF +} + +parse_args() { + while [[ "$1" != "" ]]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [[ -z "${config_overrides[*]}" ]]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib:$NCCL_PLUGIN_PATH:$LD_LIBRARY_PATH" +ldconfig "$LD_LIBRARY_PATH" +echo "Added $LD_LIBRARY_PATH to ldconfig:" +ldconfig -p | grep libcuda | sed 's/^/ /' +echo "" + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp "${TOKENIZER_PATH}"/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +# Create the nsys directory. 
+mkdir -p "${explicit_log_dir}/nsys" + +# Collect diagnostics to a single line +kv="\"kernel_version\": \"$(uname --kernel-release)\"" +if command -v nvidia-smi &> /dev/null; then + cuda_v=$(nvidia-smi -q -x | grep -Po '(?<=).*(?=)' || true) + driver_v=$(nvidia-smi -q -x | grep -Po '(?<=).*(?=)' || true) + vbios_v=$(nvidia-smi -q -x | grep -Po '(?<=).*(?=)' | head -n1 || true) + kv="${kv}, \"cuda_version\": \"${cuda_v}\"" + kv="${kv}, \"driver_version\": \"${driver_v}\"" + kv="${kv}, \"vbios_version\": \"${vbios_v}\"" +fi +echo "VERSION_DIAGNOSTICS: {${kv}}" + + +export HF_TOKEN= + +cd /opt +rm -rf Megatron-Bridge +git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +cd Megatron-Bridge +git checkout 7695d4acbfac19353d20e456509117efe4733d6b +sed -i -e '/pretrain(config=recipe/i \ recipe.dist.distributed_timeout_minutes = 120' scripts/performance/run_script.py +ls + +cp $CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH scripts/performance/ + +worker_command=$(cat <<- EOM + if [ "\$RANK" -eq "0" ]; then + echo "Worker 0 is stalling for a few seconds.." 
; + sleep 3 ; + echo "The detected environment within worker rank 0 is:" ; + env | sed 's/^/ /' ; + fi ; + + cd /opt/Megatron-Bridge ; + + numactl \ + --cpunodebind=\$((LOCAL_RANK/4)) \ + --membind=\$((LOCAL_RANK/4)) nsys profile \ + -t nvtx,cuda \ + --cuda-event-trace=false \ + --sample=none \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --kill none \ + -o "/${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK" \ + --force-overwrite true \ + --session-new "nsys-\$RANDOM-\$RANK" \ + nice -10 \ + python scripts/performance/custom_setup_experiment.py \ + --gpu b200 \ + --model_family_name deepseek \ + --model_recipe_name deepseek_v3 \ + --gpus_per_node 8 \ + --num_gpus 256 \ + --global_batch_size 2048 \ + --micro_batch_size 1 \ + --seq_length 4096 \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 16 \ + --context_parallel_size 1 \ + --virtual_pipeline_model_parallel_size None \ + --expert_model_parallel_size 8 \ + --compute_dtype bf16 \ + --max_steps 30 +EOM +) + +echo "$worker_command" > worker_command.sh +chmod 777 worker_command.sh + +torchrun \ +--nproc-per-node="8" \ +--nnodes="32" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--rdzv_conf="timeout=7200" \ +--rdzv_conf="join_timeout=7200" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +--no-python bash worker_command.sh + + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p "${ARTIFACT_DIR}" + cp -r "${explicit_log_dir}"/* "${ARTIFACT_DIR}/" + env > "${ARTIFACT_DIR}/environ.txt" + ls "${ARTIFACT_DIR}" +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/recipe_launch_command.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/recipe_launch_command.sh new file mode 100644 index 00000000..02092414 --- 
/dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install joeywan-ubench-5cgs . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=/tmp/ubench_recipe/joeywan-ubench-5cgs/custom_setup_experiment.py --set workload.image=nvcr.io/nvidia/nemo:25.11.01 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/joeywan-ubench-5cgs --set queue=a4 \ No newline at end of file diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-config-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-job.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-job.yaml new file mode 100644 index 00000000..54efbb6b --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + kueue.x-k8s.io/podset-preferred-topology: {{ .Values.tasSettings.topologyRequest | default "kubernetes.io/hostname" }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + 
{"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + {"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: 
gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ $gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-launcher-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-svc.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/values.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/values.yaml new file mode 100644 index 00000000..91fe2b93 --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: custom_setup_experiment.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH + value: /workload/configs/custom_setup_experiment.py + gpus: 256 + image: nvcr.io/nvidia/nemo:25.11.01 diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/Chart.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md new file mode 100644 index 00000000..fc9352fb --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md @@ -0,0 +1,151 @@ + +# Pretrain deepseek_v3-bf16-gbs2048-gpus256 workloads on a4 GKE Node pools with Megatron-Bridge + +This recipe outlines the steps for running a deepseek_v3 pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA Megatron-Bridge framework](https://github.com/NVIDIA-NeMo/Megatron-Bridge). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to configure and deploy the Kubernetes Jobset resource which manages the execution of the [Megatron-Bridge pretraining workload](https://github.com/NVIDIA-NeMo/Megatron-Bridge). 
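
For reference, the chart's `workload-job.yaml` template renders a Kubernetes JobSet with one indexed pod per node. A trimmed, illustrative sketch of the kind of resource the chart produces (field values here are examples, not the chart's exact output):

```yaml
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: my-workload                 # taken from the Helm release name
  labels:
    kueue.x-k8s.io/queue-name: a4   # set only when a Kueue queue is configured
spec:
  replicatedJobs:
    - name: workload
      replicas: 1
      template:
        spec:
          parallelism: 32           # one pod per a4 node (workload.gpus / 8)
          completions: 32
          completionMode: Indexed   # JOB_COMPLETION_INDEX becomes the node rank
```

Each pod requests 8 GPUs via `nvidia.com/gpu`, so 32 indexed pods cover the full 256-GPU job.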
+ +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster: Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) to create your a4 GKE cluster. +- Node Configuration: 32 nodes (8 GPUs per node, 256 GPUs total). +- GPU Architecture: NVIDIA Blackwell (B200). + +## Training dataset + +This recipe uses a mock pretraining dataset provided by [Megatron Bridge Framework Datasets utils](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/scripts/performance/utils/datasets.py) + +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:26.02` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + +```bash +export PROJECT_ID= +export CLUSTER_REGION= +export CLUSTER_NAME= +export GCS_BUCKET= # Note: path should not be prefixed with gs:// +export KUEUE_NAME= +``` + +Replace the following values: + +- ``: your Google Cloud project ID. +- ``: the region where your cluster is located. +- ``: the name of your GKE cluster. +- ``: the name of your Cloud Storage bucket. Don't include the gs:// prefix. +- ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is a4. + +Set the default project: + +```bash +gcloud config set project $PROJECT_ID +``` + +### Get cluster credentials + +```bash +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe +cd $RECIPE_ROOT +``` + +### Configure and submit a pretraining job + +#### Using 32 nodes (256 gpus) bf16 precision + +To execute the job with the default settings, run the following command from your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-deepseek-v3-32node-bf16-seq4096-gbs2048 +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=custom_setup_experiment.py \ +--set workload.image=nvcr.io/nvidia/nemo:26.02 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-deepseek-v3-32node-bf16-seq4096-gbs2048 + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=custom_setup_experiment.py \ + --set workload.image=nvcr.io/nvidia/nemo:26.02 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-deepseek-v3-32node-bf16-seq4096-gbs2048 +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-deepseek-v3-32node-bf16-seq4096-gbs2048. 
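
Since pods in the JobSet are named from the Helm release name with a `-workload-<job-index>-<pod-index>-<random-suffix>` pattern, a small helper can build the grep pattern for the rank 0 pod; `rank0_prefix` is a hypothetical convenience function, not part of the recipe:

```shell
# Hypothetical helper: build the name prefix of the rank 0 pod for a release.
# Pods are named <release>-workload-0-0-<suffix> for the rank 0 process.
rank0_prefix() {
  echo "$1-workload-0-0"
}

# Example usage (requires cluster access, so shown commented out):
# kubectl logs -f "$(kubectl get pods -o name | grep "$(rank0_prefix "$WORKLOAD_NAME")" | head -n 1)"
```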
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-deepseek-v3-32node-bf16-seq4096-gbs2048-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall the Helm release, run the following command from your client: + +```bash +helm uninstall $USER-deepseek-v3-32node-bf16-seq4096-gbs2048 +``` diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py new file mode 100644 index 00000000..369cfa0a --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py @@ -0,0 +1,583 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import glob +import logging +import os +import sys +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +import nemo_run as run +from nemo_run.config import get_nemorun_home + + +try: + from argument_parser import parse_cli_args + from utils.evaluate import calc_convergence_and_performance + from utils.executors import dgxc_executor, slurm_executor + from utils.utils import get_exp_name_config, select_config_variant_interactive +except (ImportError, ModuleNotFoundError): + from .argument_parser import parse_cli_args + from .utils.evaluate import calc_convergence_and_performance + from .utils.executors import dgxc_executor, slurm_executor + from .utils.utils import get_exp_name_config, select_config_variant_interactive + +try: + import wandb + + HAVE_WANDB = True +except (ImportError, ModuleNotFoundError): + HAVE_WANDB = False + +try: + from perf_plugins import NsysPlugin, PerfEnvPlugin, PyTorchProfilerPlugin + from resiliency_plugins import FaultTolerancePlugin +except (ImportError, ModuleNotFoundError): + from .perf_plugins import NsysPlugin, PerfEnvPlugin, PyTorchProfilerPlugin + from .resiliency_plugins import FaultTolerancePlugin + + +SCRIPT_DIR = Path(__file__).parent.resolve() +ENTRYPOINT_PEFORMANCE = "run_script.py" +ENTRYPOINT_RECIPE = "run_recipe.py" + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + + +def check_training_finished(log_file_path: str) -> bool: + """Check if training is finished.""" + with open(log_file_path, "r") as f: + log_lines = f.readlines() + log = "\n".join(log_lines) + return "StopIteration" in log or "after training is done" in log or "exiting program at iteration" in log + + +def check_slurm_timeout(log_file_path: str) -> bool: + """Check if Slurm job timed out.""" + with open(log_file_path, "r") as f: + log_lines = f.readlines() + log = "\n".join(log_lines) + return "DUE TO TIME LIMIT" in log + + +def is_flaky_failure(log_file_path: str) -> bool: + """Check if 
Slurm job failed due to flaky failure.""" + with open(log_file_path, "r") as f: + log_lines = f.readlines() + log = "\n".join(log_lines) + + return ( + "The server socket has failed to listen on any local network address." in log + or "Some NCCL operations have failed or timed out." in log + or "uncorrectable ECC error encountered" in log + or "illegal memory access" in log + or "illegal instruction" in log + or "torch.distributed.DistNetworkError" in log + or "Segmentation fault" in log + or "found NaN in" in log + or "For debugging consider passing CUDA_LAUNCH_BLOCKING=1" in log + or "double free or corruption" in log + or "Call to CUDA function failed." in log + or "Connection reset by peer" in log + or "invalid pointer" in log + or "malloc(): unaligned tcache chunk detected" in log + or "zmq.error.ZMQError: Address already in use" in log + or "We couldn't connect to 'https://huggingface.co'" in log + or "Unpack failed: incomplete input" in log + or "unspecified launch failure" in log + or "free(): corrupted unsorted chunks" in log + or "Segfault encountered" in log + or "Fatal glibc error" in log + or "EOFError: No data left in file" in log + ) + + +def build_performance_config(args) -> Optional[Dict[str, Any]]: + """Build performance configuration from command-line arguments. + + Args: + args: Parsed command-line arguments + + Returns: + Dictionary with performance configuration or None if performance is disabled + """ + config = {} + + performance_params = { + "timing_threshold": args.timing_threshold, + "skip_first_percent_time": args.skip_first_percent_time, + } + + for key, value in performance_params.items(): + if value is not None: + config[key] = value + + return config if config else None + + +def ensure_logs_where_written(log_file_paths: List[str]): + """Ensure logs were written to disk.""" + if len(log_file_paths) != 1: + raise FileNotFoundError( + f"Unexpected number of log files found: {log_file_paths}. 
Expected 1, got {len(log_file_paths)}" + ) + + +def get_job_dir_and_status_from_run(exp_name: str): + """Get job directory and status from run.""" + result_dict = run.Experiment.from_title(exp_name).status(return_dict=True) + _, job_dict = list(result_dict.items())[0] + job_dir = job_dict["local_dir"] + job_status = str(job_dict["status"]) + return job_dir, job_status + + +def maybe_increase_n_attempts_on_flaky_failure( + n_attempts: int, + max_retries: int, + is_finished_experiment: bool, + is_long_convergence_run: bool, + log_file_paths: List[str], +): + """Maybe increase number of attempts.""" + if not is_finished_experiment and not is_long_convergence_run: + if is_flaky_failure(log_file_paths[-1]): + n_attempts += 1 + else: + n_attempts = max_retries # On non-flaky failures, we don't need to restart the experiment. + + return n_attempts + + +def main( + use_recipes: bool, + model_family_name: str, + model_recipe_name: str, + task: str, + compute_dtype: str, + gpu: str, + hf_token: str, + detach: bool, + dryrun: bool, + enable_vboost: bool, + enable_nsys: bool, + pytorch_profiler: bool, + moe_a2a_overlap: bool, + tp_size: Optional[int], + pp_size: Optional[int], + cp_size: Optional[int], + ep_size: Optional[int], + wandb_key: str, + wandb_project_name: str, + wandb_experiment_name: str, + wandb_entity_name: str, + profiling_start_step: int, + profiling_stop_step: int, + record_memory_history: bool, + profiling_gpu_metrics: bool, + profiling_ranks: Optional[List[int]], + nsys_trace: Optional[List[str]], + nsys_extra_args: Optional[List[str]], + nemo_home: str, + account: str, + partition: str, + log_dir: str, + gpus_per_node: int, + time_limit: str, + container_image: str, + custom_mounts: List[str], + custom_env_vars: Dict[str, str], + custom_srun_args: List[str], + custom_bash_cmds: List[List[str]], + nccl_ub: bool, + pretrained_checkpoint: Optional[str], + num_gpus: int, + is_long_convergence_run: bool, + additional_slurm_params: Optional[Dict[str, Any]], + 
golden_values_path: str, + convergence_params: Dict[str, Any], + performance_params: Dict[str, Any], + memory_params: Dict[str, Any], + max_retries: int, + dgxc_base_url: str, + dgxc_cluster: str, + dgxc_kube_apiserver_url: str, + dgxc_app_id: str, + dgxc_app_secret: str, + dgxc_project_name: str, + dgxc_pvc_claim_name: str, + dgxc_pvc_mount_path: str, + config_variant: str = "v1", +): + """Sets up the experiment and runs it.""" + if ( + model_family_name in ["qwen3"] + and model_recipe_name + in [ + "qwen3_30b_a3b", + "qwen3_235b_a22b", + ] + and task == "pretrain" + ): + assert hf_token is not None, "HF token is required for Qwen3 tokenizer. NullTokenizer to be used soon." + + if wandb_key is not None: + assert wandb_project_name is not None and wandb_experiment_name is not None, ( + "both wandb_project_name and wandb_experiment_name are required for logging with WandB" + ) + + if use_recipes: + script_name = ENTRYPOINT_RECIPE + exp_name = ( + wandb_experiment_name + if wandb_experiment_name is not None + else f"{model_recipe_name}_{task}_{num_gpus}gpu_{gpu}" + ) + + else: + script_name = ENTRYPOINT_PEFORMANCE + exp_config = get_exp_name_config( + args, model_family_name, model_recipe_name, gpu, compute_dtype, task, config_variant + ) + exp_name = ( + wandb_experiment_name + if wandb_experiment_name is not None + else f"{task}_{model_recipe_name}_{compute_dtype}_{exp_config}" + ) + + if pretrained_checkpoint is not None: + custom_mounts.append(f"{pretrained_checkpoint}:{pretrained_checkpoint}") + + import os + rank = os.environ.get('RANK', '0') + exp_name += f'_worker{rank}' + + run_script_path = SCRIPT_DIR / script_name + logger.info(f"Run script path: {run_script_path}") + if not run_script_path.is_file(): + logger.error(f"Specified run script not found: {run_script_path}") + sys.exit(1) + + custom_mounts.extend( + [ + f"{run_script_path}:{run_script_path}", + f"{SCRIPT_DIR}:{SCRIPT_DIR}", + ] + ) + + if nccl_ub: + custom_env_vars.update({"NCCL_NVLS_ENABLE": 
"1", "NCCL_CTA_POLICY": "1"}) + + executor = run.LocalExecutor() + + plugins = [] + + if not use_recipes: + plugins.append( + PerfEnvPlugin( + enable_vboost=enable_vboost, + moe_a2a_overlap=moe_a2a_overlap, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + ep_size=ep_size, + model_family_name=model_family_name, + model_recipe_name=model_recipe_name, + gpu=gpu, + compute_dtype=compute_dtype, + train_task=task, + config_variant=config_variant, + ) + ) + + if enable_nsys: + plugins.append( + NsysPlugin( + profile_step_start=profiling_start_step, + profile_step_end=profiling_stop_step, + nsys_gpu_metrics=profiling_gpu_metrics, + profile_ranks=profiling_ranks, + nsys_trace=args.nsys_trace, + nsys_extra_args=args.nsys_extra_args, + ) + ) + if pytorch_profiler: + plugins.append( + PyTorchProfilerPlugin( + profile_step_start=profiling_start_step, + profile_step_end=profiling_stop_step, + profile_ranks=profiling_ranks, + record_memory_history=record_memory_history, + ) + ) + + nemorun_script = run.Script( + path=str(run_script_path), + entrypoint="python", + env={"PYTHONPATH": f"{SCRIPT_DIR}:$PYTHONPATH"}, + args=list(sys.argv[1:]), + ) + + logger.info("Will launch the following command with Nemo-Run: %s", " ".join(nemorun_script.to_command())) + + is_finished_experiment = False # An experiment might consist of multiple training runs, due to restarts. + is_testing_passed = False # Whether the testing passed convergence and performance validation. + error_msg = None + n_attempts = 0 + exp_name = ( + exp_name[:37] if dgxc_cluster is not None else exp_name + ) # Some k8s clusters have a limit on the length of the experiment name. 
+ wandb_run_id = None + while n_attempts <= max_retries: + while is_finished_experiment is False: + if HAVE_WANDB: + wandb_run_id = ( + (wandb_run_id or wandb.util.generate_id()) if is_long_convergence_run else wandb.util.generate_id() + ) + executor.env_vars.update( + { + "WANDB_RUN_ID": wandb_run_id, + "WANDB_RESUME": "allow", + } + ) + if wandb_key is not None: + executor.env_vars["WANDB_API_KEY"] = wandb_key + + run.run( + nemorun_script, + executor=executor, + plugins=plugins, + dryrun=dryrun, + detach=detach, + name=exp_name, + ) + if dryrun: + logger.info("dryrun requested: exiting") + return + + def _copy_logs_to_gcp(job_dir_path): + import shutil + import glob + + artifact_dir = os.environ.get("ARTIFACT_DIR", "/tmp/artifacts") + dest_logs_dir = os.path.join(artifact_dir, "logs") + os.makedirs(dest_logs_dir, exist_ok=True) + + try: + log_files = glob.glob(f"{job_dir_path}/log-*.out") + glob.glob(f"{job_dir_path}/log-*.err") + for log_f in log_files: + shutil.copy(log_f, dest_logs_dir) + msg = f"Copied {log_f} to {dest_logs_dir}" + print(msg) + logger.info(msg) + except Exception as e: + print(f"Failed to copy logs to GCP: {e}") + logger.error(f"Failed to copy logs to GCP: {e}") + + + job_dir, job_status = get_job_dir_and_status_from_run(exp_name) + + if job_status not in ["SUCCEEDED", "SUBMITTED", "PENDING", "RUNNING"]: + _copy_logs_to_gcp(job_dir) + raise Exception(f"Experiment failed for {exp_name} with status: {job_status}.") + + if detach: + is_finished_experiment = True + is_testing_passed = True + break + + log_file_paths = list(Path(f"{job_dir}").glob("log-*_0.out")) + ensure_logs_where_written(log_file_paths) + + is_finished_experiment = ( + check_training_finished(log_file_paths[-1]) if is_long_convergence_run else (job_status == "SUCCEEDED") + ) + + n_attempts = maybe_increase_n_attempts_on_flaky_failure( + n_attempts=n_attempts, + max_retries=max_retries, + is_finished_experiment=is_finished_experiment, + 
is_long_convergence_run=is_long_convergence_run, + log_file_paths=log_file_paths, + ) + + if not is_finished_experiment and n_attempts <= max_retries: + logger.error(f"Starting attempt {n_attempts + 1} of {max_retries + 1} for {exp_name}") + + if not is_finished_experiment: + break + + if is_finished_experiment is True and detach is False: + log_paths = sorted( + list(glob.glob(f"{get_nemorun_home()}/experiments/{exp_name}/{exp_name}_*/{exp_name}/log-*_0.out")) + ) + + if not is_long_convergence_run: + log_paths = [log_paths[-1]] + + logger.info(f"Starting convergence check for {model_family_name}_{model_recipe_name}") + wandb_run = None + if HAVE_WANDB and wandb_key: + wandb_run = wandb.init( + project=wandb_project_name, entity=wandb_entity_name, id=wandb_run_id, resume="allow" + ) + + logger.info("Waiting 10 seconds for I/O to settle") + time.sleep(10) + + is_testing_passed, error_msg = calc_convergence_and_performance( + model_family_name=model_family_name, + model_recipe_name=model_recipe_name, + assets_dir=os.path.join(job_dir, exp_name), + log_paths=log_paths, + loss_metric="lm loss", + timing_metric="elapsed time per iteration (ms)", + alloc_metric="alloc", + max_alloc_metric="max_alloc", + golden_values_path=golden_values_path, + convergence_config=convergence_params, + performance_config=performance_params, + memory_config=memory_params, + wandb_run=wandb_run, + ) + + if wandb_run: + wandb_run.finish() + wandb.teardown(exit_code=int(not is_testing_passed)) + + if not is_long_convergence_run: + n_attempts = max_retries + is_finished_experiment = True + if not is_testing_passed: + _copy_logs_to_gcp(job_dir) + break + + if is_finished_experiment and is_testing_passed: + break + + if not is_testing_passed and error_msg is not None: + raise AssertionError(error_msg) + if is_testing_passed and error_msg is not None: + logger.warning(error_msg) + + if not is_finished_experiment: + _copy_logs_to_gcp(job_dir) + raise Exception("Megatron-Bridge CI test job failed") 
+ elif is_finished_experiment and not detach: + logger.info("Megatron-Bridge CI test job completed successfully!") + + +if __name__ == "__main__": + parser = parse_cli_args() + args, unknown_args = parser.parse_known_args() + + assert not (args.enable_nsys and args.pytorch_profiler), ( + "Both NSys and PyTorch profiler cannot be enabled at the same time" + ) + + # probably better to use parser.parse_args() and make unknowns an error, + # but for now we'll just issue a warning. + if unknown_args: + logger.warning(f"Ignoring unrecognized arguments: {' '.join(unknown_args)}") + + # Handle --list_config_variants: show available variants and interactively select + config_variant = args.config_variant + if args.list_config_variants: + config_variant = select_config_variant_interactive( + model_family_name=args.model_family_name, + model_recipe_name=args.model_recipe_name, + gpu=args.gpu, + compute_dtype=args.compute_dtype, + task=args.task, + ) + + main( + use_recipes=args.use_recipes, + model_family_name=args.model_family_name, + model_recipe_name=args.model_recipe_name, + task=args.task, + compute_dtype=args.compute_dtype, + gpu=args.gpu, + hf_token=args.hf_token, + detach=args.detach, + dryrun=args.dryrun, + enable_vboost=args.enable_vboost, + enable_nsys=args.enable_nsys, + pytorch_profiler=args.pytorch_profiler, + moe_a2a_overlap=args.moe_a2a_overlap, + tp_size=args.tensor_model_parallel_size, + pp_size=args.pipeline_model_parallel_size, + cp_size=args.context_parallel_size, + ep_size=args.expert_model_parallel_size, + wandb_key=args.wandb_key, + wandb_project_name=args.wandb_project_name, + wandb_experiment_name=args.wandb_experiment_name, + wandb_entity_name=args.wandb_entity_name, + profiling_start_step=args.profiling_start_step, + profiling_stop_step=args.profiling_stop_step, + record_memory_history=args.record_memory_history, + profiling_gpu_metrics=args.profiling_gpu_metrics, + profiling_ranks=args.profiling_ranks, + nsys_trace=args.nsys_trace, + 
nsys_extra_args=args.nsys_extra_args, + nemo_home=args.nemo_home, + account=args.account, + partition=args.partition, + log_dir=args.log_dir, + gpus_per_node=args.gpus_per_node, + time_limit=args.time_limit, + container_image=args.container_image, + custom_mounts=args.custom_mounts, + custom_env_vars=args.custom_env_vars, + custom_srun_args=args.custom_srun_args, + custom_bash_cmds=args.custom_bash_cmds, + nccl_ub=args.nccl_ub, + pretrained_checkpoint=args.pretrained_checkpoint, + num_gpus=args.num_gpus, + is_long_convergence_run=args.is_long_convergence_run, + additional_slurm_params=args.additional_slurm_params, + golden_values_path=args.golden_values_path, + convergence_params={ + "correlation_threshold": args.correlation_threshold, + "high_loss_tolerance": args.high_loss_tolerance, + "medium_loss_tolerance": args.medium_loss_tolerance, + "low_loss_tolerance": args.low_loss_tolerance, + "final_loss_tolerance": args.final_loss_tolerance, + "max_outlier_ratio": args.max_outlier_ratio, + "outlier_threshold": args.outlier_threshold, + "skip_first_percent_loss": args.skip_first_percent_loss, + }, + performance_params={ + "timing_threshold": args.timing_threshold, + "skip_first_percent_time": args.skip_first_percent_time, + }, + memory_params={ + "memory_threshold": args.memory_threshold, + }, + max_retries=args.max_retries, + dgxc_base_url=args.dgxc_base_url, + dgxc_cluster=args.dgxc_cluster, + dgxc_kube_apiserver_url=args.dgxc_kube_apiserver_url, + dgxc_app_id=args.dgxc_app_id, + dgxc_app_secret=args.dgxc_app_secret, + dgxc_project_name=args.dgxc_project_name, + dgxc_pvc_claim_name=args.dgxc_pvc_claim_name, + dgxc_pvc_mount_path=args.dgxc_pvc_mount_path, + config_variant=config_variant, + ) \ No newline at end of file diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh new file 
mode 100644 index 00000000..3cb08b61 --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh @@ -0,0 +1,147 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. +EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH:/usr/local/nvidia/lib64" +ldconfig $LD_LIBRARY_PATH +echo "Added $LD_LIBRARY_PATH to ldconfig:" +ldconfig -p | grep libcuda | sed 's/^/ /' +echo "" + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + echo "--- DEBUG libnccl-env.so ---" + ls -la /usr/local/gib/lib/libnccl-env.so || echo "libnccl-env.so not found" + ls -lh /usr/local/gib/lib + echo "----------------------------" +fi + +cd /opt +rm -rf Megatron-Bridge +git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +cd Megatron-Bridge +git checkout f7a9428f301fa17ac374d5e7166a63b0aa4771af +git submodule update --init --recursive +sed -i -e '/return config/i \ config.dist.distributed_timeout_minutes = 30' scripts/performance/run_recipe.py +ls + +cp $CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH scripts/performance/ + +worker_command=$(cat <<- EOM + if [ "\$RANK" -eq "0" ]; then + echo "--- LOCATING MEGATRON LIBRARIES ---" ; + python -c "import megatron.core; print('megatron.core:', megatron.core.__file__)" || echo "megatron.core not found" ; + python -c "import megatron.bridge; print('megatron.bridge:', megatron.bridge.__file__)" || echo "megatron.bridge not found" ; + echo "-----------------------------------" ; + echo "Worker 0 is stalling for a few seconds.." 
; + sleep 3 ; + echo "The detected environment within worker rank 0 is:" ; + env | sed 's/^/ /' ; + fi ; + + cd /opt/Megatron-Bridge ; + export PYTHONPATH="/opt/Megatron-Bridge:/opt/Megatron-Bridge/3rdparty/Megatron-LM:\$PYTHONPATH" ; + + exec numactl \ + --cpunodebind=\$((LOCAL_RANK/4)) \ + --membind=\$((LOCAL_RANK/4)) nsys profile \ + -t nvtx,cuda \ + --cuda-event-trace=false \ + --sample=none \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --kill none \ + -o /${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK \ + --force-overwrite true \ + --session-new "nsys-\$RANDOM-\$RANK" \ + nice -10 \ + python scripts/performance/custom_setup_experiment.py \ + --gpu b200 \ + --model_family_name deepseek \ + --model_recipe_name deepseek_v3 \ + --gpus_per_node 8 \ + --num_gpus 256 \ + --global_batch_size 2048 \ + --micro_batch_size 1 \ + --seq_length 4096 \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 16 \ + --context_parallel_size 1 \ + --virtual_pipeline_model_parallel_size None \ + --expert_model_parallel_size 8 \ + --compute_dtype bf16 \ + --max_steps 30 dist.distributed_timeout_minutes=30 + +EOM +) + +echo "$worker_command" > worker_command.sh +chmod 777 worker_command.sh + +torchrun \ +--nproc-per-node="8" \ +--nnodes="32" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +--no-python bash worker_command.sh + + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh 
b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh new file mode 100644 index 00000000..892961cb --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install joeywan-ubench-6wsw . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=/tmp/ubench_recipe/joeywan-ubench-6wsw/custom_setup_experiment.py --set workload.image=nvcr.io/nvidia/nemo:26.02 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/joeywan-ubench-6wsw --set queue=a4 \ No newline at end of file diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-config-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml new file mode 100644 index 00000000..54efbb6b --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + kueue.x-k8s.io/podset-preferred-topology: {{ .Values.tasSettings.topologyRequest | default "kubernetes.io/hostname" }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + 
{"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + {"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: 
gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ $gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-launcher-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-svc.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml new file mode 100644 index 00000000..05e98e12 --- /dev/null +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml @@ -0,0 +1,34 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: INFO + - name: NCCL_TIMEOUT + value: '7200000' + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: custom_setup_experiment.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH + value: /workload/configs/custom_setup_experiment.py + gpus: 256 + image: nvcr.io/nvidia/nemo:26.02
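Editor's note on the parallelism layout: the launcher pins TP=1, PP=16, CP=1, EP=8 across 256 GPUs with GBS=2048 and MBS=1. When adapting this recipe to a different node count, the divisibility constraints behind those flags can be sanity-checked with a short snippet like the one below. This is a sketch, not part of the recipe; all variable names are illustrative.

```shell
# Sanity-check the parallelism arithmetic implied by the launcher flags.
# Values are copied from custom_setup_experiment.py arguments above.
GPUS=256; TP=1; PP=16; CP=1; EP=8; MBS=1; GBS=2048

MODEL_PARALLEL=$((TP * PP * CP))   # GPUs per model replica: 16
DP=$((GPUS / MODEL_PARALLEL))      # data-parallel size: 256 / 16 = 16
GA=$((GBS / (DP * MBS)))           # gradient-accumulation steps: 2048 / 16 = 128

[ $((GPUS % MODEL_PARALLEL)) -eq 0 ] || { echo "GPUs not divisible by TP*PP*CP" >&2; exit 1; }
[ $((GBS % (DP * MBS))) -eq 0 ]      || { echo "GBS not divisible by DP*MBS" >&2; exit 1; }
[ $((DP % EP)) -eq 0 ]               || { echo "EP must divide DP" >&2; exit 1; }

echo "DP=$DP GA=$GA"
```

For this configuration the script prints `DP=16 GA=128`, i.e. 16 data-parallel replicas each accumulating 128 micro-batches per step; the EP-divides-DP check reflects the usual Megatron constraint for expert parallelism.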