From 4de97345fec0413412cb5916caf295c3fcac71cf Mon Sep 17 00:00:00 2001 From: Mutian Zhu Date: Thu, 26 Feb 2026 20:09:56 +0000 Subject: [PATCH] Add Llama3.1-405b FP8 recipe for 64 nodes with GBS 2048 generated by ubench --- .../64node-FP8CS-GBS2048/recipe/README.md | 52 ++--- .../64node-FP8CS-GBS2048/recipe/launcher.sh | 41 +++- .../llama3-1-405b-fp8cs-gbs2048-gpu256.py | 141 ++++++++++++ .../llama3-1-405b-fp8cs-gbs2048-gpus256.py | 207 ------------------ .../recipe/recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 2 + .../recipe/templates/workload-job.yaml | 29 ++- .../64node-FP8CS-GBS2048/recipe/values.yaml | 63 +----- 8 files changed, 226 insertions(+), 310 deletions(-) create mode 100644 training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py delete mode 100644 training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py create mode 100644 training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md index 454a1ba6..b73cd914 100644 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/README.md @@ -87,47 +87,49 @@ gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION To execute the job with the default settings, run the following command from your client: - cd $RECIPE_ROOT - export WORKLOAD_NAME=$USER-a4x-llama3-1-405b - helm install $WORKLOAD_NAME . -f values.yaml \ - --set-file workload_launcher=launcher.sh \ - --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus256.py \ - --set workload.image=nvcr.io/nvidia/nemo:25.07 \ - --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ - --set volumes.gcsMounts[0].mountPath=/job-logs \ - --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ - --set queue=${KUEUE_NAME} - -**Examples** - -- To set the number of training steps to 100, run the following command from - your client: - ```bash cd $RECIPE_ROOT -export WORKLOAD_NAME=$USER-a4x-llama3-1-405b +export WORKLOAD_NAME=$USER-a4x-llama3-1-405b-64node helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ ---set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus256.py \ +--set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py \ --set workload.image=nvcr.io/nvidia/nemo:25.07 \ --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ --set volumes.gcsMounts[0].mountPath=/job-logs \ --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ ---set queue=${KUEUE_NAME} \ ---set workload.arguments[0]="max_steps=100" +--set queue=${KUEUE_NAME} ``` +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4x-llama3-1-405b-64node + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.07 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + ### Monitor the job To check the status of pods in your job, run the following command: ``` -kubectl get pods | grep $USER-a4x-llama3-1-405b +kubectl get pods | grep $USER-a4x-llama3-1-405b-64node ``` Replace the following: -- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4x-llama3-1-405b. +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4x-llama3-1-405b-64node. To get the logs for one of the pods, run the following command: @@ -139,7 +141,7 @@ Information about the training job's progress, including crucial details such as loss, step count, and step time, is generated by the rank 0 process. This process runs on the pod whose name begins with `JOB_NAME_PREFIX-workload-0-0`. -For example: `$USER-a4x-llama3-1-405b-workload-0-0-s9zrv`. +For example: `$USER-a4x-llama3-1-405b-64node-workload-0-0-s9zrv`. ### Uninstall the Helm release @@ -147,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. To uninstall Helm, run the following command from your client: ```bash -helm uninstall $USER-a4x-llama3-1-405b -``` +helm uninstall $USER-a4x-llama3-1-405b-64node +``` \ No newline at end of file diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh index c77d3142..66200540 100644 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/launcher.sh @@ -32,11 +32,13 @@ else echo " ${config_overrides}" fi -export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" -ldconfig $LD_LIBRARY_PATH -echo "Added $LD_LIBRARY_PATH to ldconfig:" -ldconfig -p | grep libcuda | sed 's/^/ /' -echo "" +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then explicit_log_dir=${EXPLICIT_LOG_DIR} @@ -56,23 +58,46 @@ echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=20 \ +trainer.num_nodes=64 \ +trainer.devices=4 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + # Create the nsys directory. mkdir -p ${explicit_log_dir}/nsys -torchrun --no-python \ ---nproc-per-node="${GPUS_PER_NODE}" \ +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="4" \ --nnodes="${NNODES}" \ --node_rank="${JOB_COMPLETION_INDEX}" \ --rdzv_id="${JOB_IDENTIFIER}" \ --master_addr="${MASTER_ADDR}" \ --master_port="${MASTER_PORT}" \ -bash -c "numactl --cpunodebind=\$((LOCAL_RANK/2)) --membind=\$((LOCAL_RANK/2)) python ${NEMO_LAUNCH_SCRIPT} ${config_overrides}" +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=20 \ +trainer.num_nodes=64 \ +trainer.devices=4 \ +${config_overrides} if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then mkdir -p ${ARTIFACT_DIR} cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml env > ${ARTIFACT_DIR}/environ.txt ls ${ARTIFACT_DIR} fi diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py new file mode 100644 index 00000000..4a78d757 --- /dev/null +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpu256.py @@ -0,0 +1,141 @@ +"""Nemo2 pretraining recipe for Llama 3.1 405B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_405b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 405B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_405b.pretrain_recipe(performance_mode=True) + + num_nodes = 64 + num_gpus_per_node = 4 + mbs = 1 + gbs = 2048 + max_steps = 20 + tp_size = 2 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + activation_offload_layers = 95 + enable_cuda_graphs = False + compute_dtype = "fp8" + fp8_recipe = "cs" + nccl_communicator_config_path = None + use_mcore_fsdp = True + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + activation_offload_layers=activation_offload_layers, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence Length (model and data) + pretrain.model.config.seq_length = 8192 + pretrain.data.seq_length = 8192 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 30 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-405b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. + pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py deleted file mode 100644 index 2af09422..00000000 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/llama3-1-405b-fp8cs-gbs2048-gpus256.py +++ /dev/null @@ -1,207 +0,0 @@ -"""Nemo2 pretraining recipe for Llama 3.1 405B model.""" - -# hack for relative imports - -from os.path import basename, splitext -import random -import fiddle as fdl -import fiddle._src.experimental.dataclasses as fdl_dc -from nemo.collections.llm.recipes import llama31_405b -from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin -from nemo.utils.loggers.dllogger import DLLogger -import nemo_run as run -from scripts.performance.argument_parser import parse_cli_args -from scripts.performance.helpers import args_sanity_check -from scripts.performance.helpers import get_user_configs -from scripts.performance.helpers import set_exp_logging_configs -from scripts.performance.helpers import set_primary_perf_configs -from scripts.performance.utils import get_comm_overlap_callback_idx, hf_tokenizer - - -def recipe( - args, - num_nodes, - mbs, - gbs, - tp_size, - pp_size, - cp_size, - vp_size, - ep_size, - enable_cuda_graphs, - use_mcore_fsdp, - recompute_layers, - activation_offload_layers, -) -> run.Partial: - """Returns a Nemo2 training recipe for Llama 3.1 405B model.""" - # Start from the Nemo standard recipe. - pretrain = llama31_405b.pretrain_recipe(performance_mode=True) - - pretrain = set_primary_perf_configs( - pretrain, - "pre_train", - num_nodes=num_nodes, - num_gpus_per_node=4, - mbs=mbs, - gbs=gbs, - max_steps=args.max_steps, - tp_size=tp_size, - pp_size=pp_size, - cp_size=cp_size, - vp_size=vp_size, - ep_size=ep_size, - enable_cuda_graphs=enable_cuda_graphs, - activation_offload_layers=activation_offload_layers, - compute_dtype=args.compute_dtype, - fp8_recipe=args.fp8_recipe, - nccl_communicator_config_path=args.nccl_communicator_config_path, - use_mcore_fsdp=use_mcore_fsdp, - recompute_layers=recompute_layers, - use_fsdp_double_buffer=args.use_fsdp_double_buffer, - use_user_buffer_registration=args.use_user_buffer_registration, - use_sharp=args.use_sharp, - keep_fsdp_fp8_transpose_cache=args.keep_fsdp_fp8_transpose_cache, - ) - comm_overlap_callback_idx = get_comm_overlap_callback_idx( - pretrain.trainer.callbacks - ) - pretrain.trainer.callbacks[ - comm_overlap_callback_idx - ].tp_comm_bootstrap_backend = "nccl" - - pretrain = set_exp_logging_configs( - pretrain, - "pre_train", - "llm", - "llama3", - args.tensorboard, - args.wandb, - args.wandb_prj_name, - args.wandb_job_name, - ) - - if args.use_hf_tokenizer: - pretrain.data.tokenizer = hf_tokenizer("meta-llama/Llama-3.1-405B") - else: - pretrain.data.tokenizer = run.Config( - get_nmt_tokenizer, - library="null", - model_name="NullTokenizer", - vocab_size=128256, - ) - pretrain.model.tokenizer = pretrain.data.tokenizer - - comm_overlap_callback_idx = get_comm_overlap_callback_idx( - pretrain.trainer.callbacks - ) - assert ( - comm_overlap_callback_idx is not None - ), "MegatronCommOverlapCallback missing. Required for performance." - - tp_comm_overlap_cfg = fdl.cast( - run.Config, - fdl_dc.convert_dataclasses_to_configs( - userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 - ), - ) - pretrain.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap_cfg = ( - tp_comm_overlap_cfg - ) - - if use_mcore_fsdp: - pretrain.trainer.strategy.num_distributed_optimizer_instances = ( - num_nodes * 4 - ) // 64 - - # Enable DLLogger - dllogger_config = run.Config( - DLLogger, - verbose=True, - stdout=True, - json_file="dllogger.json", - ) - pretrain.log.extra_loggers = [dllogger_config] - - return pretrain - - -if __name__ == "__main__": - args = parse_cli_args().parse_args() - args_sanity_check(args) - - kwargs = get_user_configs( - args.gpu.lower(), "pre_train", "llama31", "405b", args - ) - ( - num_nodes, - mbs, - gbs, - tp_size, - pp_size, - cp_size, - vp_size, - ep_size, - _, - enable_cuda_graphs, - use_mcore_fsdp, - recompute_layers, - activation_offload_layers, - ) = kwargs[:13] - - recipe = recipe( - args, - num_nodes, - mbs, - gbs, - tp_size, - pp_size, - cp_size, - vp_size, - ep_size, - enable_cuda_graphs, - use_mcore_fsdp, - recompute_layers, - activation_offload_layers, - ) - - exp_config = ( - f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}_{mbs}mbs_{gbs}gbs-{random.randint(0, 100000)}" - ) - exp_name = ( - f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}" - ) - - if use_mcore_fsdp: - # Needed to enable CuDNN LN for FSDP overlap - env_vars = {"NVTE_NORM_FWD_USE_CUDNN": "1", "NVTE_NORM_BWD_USE_CUDNN": "1"} - else: - env_vars = {} - - executor = run.LocalExecutor() - - plugins = [ - PerfEnvPlugin( - enable_vboost=False, - nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None, - gpu_sm100_or_newer=True, - ) - ] - if args.enable_nsys: - plugins.append( - NsysPlugin(start_step=10, end_step=13, ranks=list(range(0, 1))) - ) - if args.enable_memory_profile: - assert args.memory_profile_out_path is not None - plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path)) - - with run.Experiment(exp_name) as exp: - exp.add( - recipe, - executor=executor, - name=exp_name, - plugins=plugins, - ) - - exp.run(sequential=True, direct=True, detach=False) diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh new file mode 100644 index 00000000..4958878d --- /dev/null +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install mutianzhu-ubench-4tcg . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpu256.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/mutianzhu-ubench-4tcg --set queue=tas-lq \ No newline at end of file diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml index f34b5080..a1d54cee 100644 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +{{- if .Values.workload.configFile }} apiVersion: v1 kind: ConfigMap metadata: @@ -24,3 +25,4 @@ data: {{- else }} {{ "config: null" | nindent 4 }} {{- end }} +{{- end }} diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml index 2362c6e0..e2b6d544 100644 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-job.yaml @@ -96,44 +96,39 @@ spec: {{- end }} {{- end }} spec: - nodeSelector: - {{- toYaml .Values.workload.nodeSelector | nindent 14 }} {{- if $root.Values.network.hostNetwork }} hostNetwork: true dnsPolicy: ClusterFirstWithHostNet {{- end }} subdomain: "{{.Release.Name}}" restartPolicy: Never - {{- if or $root.Values.targetNodes $root.Values.avoidNodes $root.Values.targetNodepools }} + {{- if $root.Values.targetNodes }} affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - {{- if $root.Values.targetNodes }} - key: kubernetes.io/hostname operator: "In" values: {{- range $hostname := $root.Values.targetNodes }} - {{ $hostname }} {{- end }} - {{- end }} - {{- if $root.Values.avoidNodes }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: - key: kubernetes.io/hostname operator: "NotIn" values: {{- range $hostname := $root.Values.avoidNodes }} - {{ $hostname }} {{- end }} - {{- end }} - {{- if $root.Values.targetNodepools }} - - key: cloud.google.com/gke-nodepool - operator: "In" - values: - {{- range $nodepool := $root.Values.targetNodepools }} - - {{ $nodepool }} - {{- end }} - {{- end }} {{- end }} tolerations: - operator: "Exists" @@ -151,12 +146,14 @@ spec: emptyDir: {} {{ end }} + {{- if $root.Values.workload.configFile }} - name: workload-configuration configMap: name: "{{.Release.Name}}-config" items: - key: workload-configuration path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} - name: workload-launcher configMap: @@ -322,8 +319,10 @@ spec: mountPath: /usr/local/gib {{ end }} + {{- if $root.Values.workload.configFile }} - name: workload-configuration mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} - name: workload-launcher mountPath: /workload/launcher diff --git a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml index 41839c24..8a6b110b 100644 --- a/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml +++ b/training/a4x/llama3-1-405b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml @@ -5,78 +5,31 @@ network: hostNetwork: true ncclSettings: - name: NCCL_DEBUG - value: VERSION - - name: NCCL_ALGO - value: "Ring,Tree" - - name: NCCL_NET_GDR_LEVEL - value: PIX - - name: NCCL_NET_GDR_C2C - value: "1" - - name: NCCL_P2P_NET_CHUNKSIZE - value: "2097152" - - name: NCCL_NVLS_ENABLE - value: "0" + value: WARN subnetworks[]: null -queue: tas-lq -# targetNodepools: null -targetNodepools: null - # - a4x-highgpu-4g-a4x-pool-0 - # - a4x-highgpu-4g-a4x-pool-1 - - -# tasSettings: -# topologyRequest: -# kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname - +queue: null tasSettings: topologyRequest: - kueue.x-k8s.io/podset-required-topology: "cloud.google.com/gce-topology-block" - kueue.x-k8s.io/podset-slice-required-topology: "cloud.google.com/gce-topology-subblock" - kueue.x-k8s.io/podset-slice-size: "16" - + kueue.x-k8s.io/podset-required-topology: cloud.google.com/gce-topology-block + kueue.x-k8s.io/podset-slice-required-topology: cloud.google.com/gce-topology-subblock + kueue.x-k8s.io/podset-slice-size: '16' volumes: gcsMounts: - bucketName: null mountPath: null gcsVolumes: true psVolumes: false - ssdMountPath: "/ssd" workload: - nodeSelector: - cloud.google.com/gke-accelerator: nvidia-gb200 arguments[]: null - configFile: llama3-1-405b-fp8cs-gbs2048-gpus256.py + configFile: llama3-1-405b-fp8cs-gbs2048-gpu256.py configPath: /workload/configs/ - defaultArguments: - - --account=none - - --partition=none - - --gpu=gb200 - - --num_gpus=256 - - --compute_dtype=fp8 - - --fp8_recipe=cs - - --global_batch_size=2048 - - --max_steps=30 - - --micro_batch_size=1 - - --tensor_parallel_size=2 - - --context_parallel_size=1 - - --expert_parallel_size=1 - - --expert_tensor_parallel_size=1 - - --pipeline_parallel_size=1 - - --virtual_pipeline_parallel_size=1 - - --use_mcore_fsdp=1 - - --cuda_graphs=0 - - --activation_offload_layers=95 - - --log_dir=/job-logs/nemo-logs + defaultArguments[]: null envs: - name: ARTIFACT_DIR value: null - - name: PL_TORCH_DISTRIBUTED_BACKEND - value: "nccl" - name: GLOO_SOCKET_IFNAME value: eth0 - - name: TORCH_NCCL_HIGH_PRIORITY - value: "1" - name: NEMO_LAUNCH_SCRIPT - value: /workload/configs/llama3-1-405b-fp8cs-gbs2048-gpus256.py + value: /workload/configs/llama3-1-405b-fp8cs-gbs2048-gpu256.py gpus: 256 image: nvcr.io/nvidia/nemo:25.07