Skip to content

Commit aecceac

Browse files
committed
# GPU Initialization Script Update - 2026-02-10
This update modifies the `gpu/install_gpu_driver.sh` script to enhance support for newer CUDA versions and GPU architectures, including initial support for NVIDIA Blackwell.

## Changes:

1. **Expanded CUDA Version Support:**
   * Added mappings for CUDA versions 12.8, 12.9, 13.0, and 13.1 to the `DRIVER_FOR_CUDA`, `DRIVER_SUBVER`, `CUDNN_FOR_CUDA`, `NCCL_FOR_CUDA`, and `CUDA_SUBVER` arrays. This enables the script to select appropriate driver, cuDNN, and NCCL versions for these newer CUDA toolkits.
2. **Updated Default CUDA Version:**
   * Changed the `DEFAULT_CUDA_VERSION` for Dataproc 2.2 and 2.3 image versions to `13.0.1`. This makes CUDA 13 the default for newer Dataproc images, improving compatibility with the latest GPU hardware.
3. **Refined NCCL Build Flags (`NVCC_GENCODE`):**
   * The logic for setting `NVCC_GENCODE` in the `install_nvidia_nccl` function has been updated to be more granular based on the CUDA version.
   * Volta architectures (`sm_70`, `sm_72`) are now only included for CUDA versions less than 13.0.
   * The Blackwell architecture (`sm_110`) is now included for CUDA versions greater than or equal to 13.0.
   * The commented-out section for `sm_101` remains, suggesting it is not yet fully supported or tested.
4. **Script Robustness:**
   * In `install_nvidia_userspace_runfile`, the variables `local_tarball` and `gcs_tarball` are now explicitly initialized to empty strings, preventing potential unbound-variable errors.
   * The `make clean` command within the `install_nvidia_nccl` function is now non-fatal. If it fails, a warning is printed, but the script continues execution. This prevents build failures due to a missing `doc` directory, which does not affect the creation of the necessary Debian/RPM packages.
1 parent 7902695 commit aecceac

1 file changed

Lines changed: 43 additions & 18 deletions

File tree

gpu/install_gpu_driver.sh

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ readonly -A DRIVER_FOR_CUDA=(
141141
["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
142142
["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06"
143143
["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142"
144+
["12.8"]="570.211.01" ["12.9"]="575.64.05"
145+
["13.0"]="580.126.16" ["13.1"]="590.48.01"
144146
)
145147
readonly -A DRIVER_SUBVER=(
146148
["410"]="410.104" ["415"]="415.27" ["418"]="418.113"
@@ -150,7 +152,8 @@ readonly -A DRIVER_SUBVER=(
150152
["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05"
151153
["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06"
152154
["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03"
153-
["565"]="565.77"
155+
["565"]="565.77" ["570"]="570.211.01" ["575"]="575.64.05"
156+
["580"]="580.126.16" ["590"]="590.48.01"
154157
)
155158
# https://developer.nvidia.com/cudnn-downloads
156159
readonly -A CUDNN_FOR_CUDA=(
@@ -160,7 +163,8 @@ readonly -A CUDNN_FOR_CUDA=(
160163
["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17"
161164
["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5"
162165
["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18"
163-
["12.6"]="9.6.0.74"
166+
["12.6"]="9.6.0.74" ["12.8"]="9.8.0.87" ["12.9"]="9.10.2.21"
167+
["13.0"]="9.14.0.64" ["13.1"]="9.17.0.29"
164168
)
165169
# https://developer.nvidia.com/nccl/nccl-download
166170
readonly -A NCCL_FOR_CUDA=(
@@ -169,7 +173,8 @@ readonly -A NCCL_FOR_CUDA=(
169173
["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12"
170174
["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3"
171175
["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4"
172-
["12.5"]="2.22.3" ["12.6"]="2.23.4"
176+
["12.5"]="2.22.3" ["12.6"]="2.23.4" ["12.8"]="2.25.1"
177+
["12.9"]="2.27.3" ["13.0"]="2.27.7" ["13.1"]="2.29.2"
173178
)
174179
readonly -A CUDA_SUBVER=(
175180
["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
@@ -178,16 +183,17 @@ readonly -A CUDA_SUBVER=(
178183
["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
179184
["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
180185
["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
181-
["12.6"]="12.6.3"
186+
["12.6"]="12.6.3" ["12.8"]="12.8.1" ["12.9"]="12.9.1"
187+
["13.0"]="13.0.2" ["13.1"]="13.1.1"
182188
)
183189

184190
function set_cuda_version() {
185191
case "${DATAPROC_IMAGE_VERSION}" in
186192
"1.5" ) DEFAULT_CUDA_VERSION="11.6.2" ;;
187193
"2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
188194
"2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
189-
"2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
190-
"2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
195+
"2.2" ) DEFAULT_CUDA_VERSION="13.0.1" ;;
196+
"2.3" ) DEFAULT_CUDA_VERSION="13.0.1" ;;
191197
* )
192198
echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
193199
exit 1
@@ -429,6 +435,10 @@ function set_cuda_runfile_url() {
429435
["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/
430436
["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
431437
["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
438+
["12.8.0"]="570.86.10" ["12.8.1"]="570.124.06"
439+
["12.9.0"]="575.51.03" ["12.9.1"]="575.57.08"
440+
["13.0.0"]="580.65.06" ["13.0.1"]="580.82.07" ["13.0.2"]="580.95.05"
441+
["13.1.0"]="590.44.01"
432442
)
433443

434444
# Verify that the file with the indicated combination exists
@@ -741,17 +751,30 @@ function install_nvidia_nccl() {
741751
# Ada: SM_89, compute_89
742752
# Hopper: SM_90,SM_90a compute_90,compute_90a
743753
# Blackwell: SM_100, compute_100
744-
local nvcc_gencode=("-gencode=arch=compute_70,code=sm_70" "-gencode=arch=compute_72,code=sm_72"
745-
"-gencode=arch=compute_80,code=sm_80" "-gencode=arch=compute_86,code=sm_86")
754+
local nvcc_gencode=("-gencode=arch=compute_80,code=sm_80" # Ampere
755+
"-gencode=arch=compute_86,code=sm_86" # Ampere
756+
)
746757

747758
if version_gt "${CUDA_VERSION}" "11.6" ; then
748-
nvcc_gencode+=("-gencode=arch=compute_87,code=sm_87")
759+
nvcc_gencode+=("-gencode=arch=compute_87,code=sm_87") # Ampere
749760
fi
750761
if version_ge "${CUDA_VERSION}" "11.8" ; then
751-
nvcc_gencode+=("-gencode=arch=compute_89,code=sm_89")
762+
nvcc_gencode+=("-gencode=arch=compute_89,code=sm_89") # Lovelace
752763
fi
753764
if version_ge "${CUDA_VERSION}" "12.0" ; then
754-
nvcc_gencode+=("-gencode=arch=compute_90,code=sm_90" "-gencode=arch=compute_90a,code=compute_90a")
765+
nvcc_gencode+=("-gencode=arch=compute_90,code=sm_90") # Hopper
766+
fi
767+
# if version_ge "${CUDA_VERSION}" "12.8" ; then
768+
# nvcc_gencode+=("-gencode=arch=compute_101,code=sm_101") # Blackwell
769+
# fi
770+
if version_lt "${CUDA_VERSION}" "13.0" ; then
771+
nvcc_gencode+=("-gencode=arch=compute_70,code=sm_70" # Volta
772+
"-gencode=arch=compute_72,code=sm_72" # Volta
773+
)
774+
775+
fi
776+
if version_ge "${CUDA_VERSION}" "13.0" ; then
777+
nvcc_gencode+=("-gencode=arch=compute_110,code=sm_110") # Blackwell
755778
fi
756779
NVCC_GENCODE="${nvcc_gencode[*]}"
757780

@@ -769,7 +792,7 @@ function install_nvidia_nccl() {
769792
execute_with_retries make -j$(nproc) pkg.redhat.build
770793
fi
771794
tar czvf "${local_tarball}" "../${build_path}"
772-
make clean
795+
make clean || echo "WARN: 'make clean' failed in nccl build, continuing..."
773796
popd
774797
tar xzvf "${local_tarball}"
775798
${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
@@ -1408,10 +1431,10 @@ function install_nvidia_userspace_runfile() {
14081431
local runfile_hash
14091432
runfile_hash=$(echo "${runfile_sha256sum}" | awk '{print $1}')
14101433

1411-
local runfile_args
1412-
runfile_args=""
1434+
local runfile_args=""
14131435
local cache_hit="0"
1414-
local local_tarball
1436+
local local_tarball="" # Initialize local_tarball here
1437+
local gcs_tarball="" # Initialize gcs_tarball here
14151438

14161439
# Build nonfree driver on rocky8, or when driver version is prior to
14171440
# open driver min, or when GPU architecture is prior to Turing
@@ -1422,13 +1445,13 @@ function install_nvidia_userspace_runfile() {
14221445
local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
14231446
test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
14241447
local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}_nonfree.tar.gz"
1425-
local_tarball="${workdir}/${build_tarball}"
1448+
local_tarball="${workdir}/${build_tarball}" # Set within the condition
14261449
local build_dir
14271450
if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
14281451
then build_dir="${modulus_md5sum}"
14291452
else build_dir="unsigned" ; fi
14301453

1431-
local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
1454+
gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" # Set within the condition
14321455

14331456
if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
14341457
# when running with fewer than 32 cores, yield to in-progress build
@@ -1497,7 +1520,7 @@ function install_nvidia_userspace_runfile() {
14971520
if [[ "${cache_hit}" == "1" ]] ; then
14981521
${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv
14991522
depmod -a
1500-
else
1523+
elif [[ -n "${local_tarball}" ]]; then # Check if local_tarball was set
15011524
clear_dkms_key
15021525
tar czvf "${local_tarball}" \
15031526
/var/log/nvidia-installer.log \
@@ -1506,6 +1529,8 @@ function install_nvidia_userspace_runfile() {
15061529

15071530
if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
15081531
building_file=""
1532+
else
1533+
echo "DEBUG: local_tarball not set, skipping tarball creation." >&2
15091534
fi
15101535
fi
15111536

0 commit comments

Comments
 (0)