From 7573147f071b88d95f157e2c9a643450cf30bf8f Mon Sep 17 00:00:00 2001 From: Scott McMillan Date: Fri, 13 Mar 2026 15:17:15 -0500 Subject: [PATCH] Update NCCL building block and bump default version to 2.29.7-1 --- docs/building_blocks.md | 4 +- hpccm/building_blocks/nccl.py | 77 ++++++++++++++++------ test/test_nccl.py | 116 +++++++++++++++++----------------- 3 files changed, 117 insertions(+), 80 deletions(-) diff --git a/docs/building_blocks.md b/docs/building_blocks.md index 9513828..418e8ff 100644 --- a/docs/building_blocks.md +++ b/docs/building_blocks.md @@ -2958,7 +2958,7 @@ source. The default value is False. use the latest commit on the default branch for the repository. - __cuda__: Flag to specify the CUDA version of the package to download. -The default is `11.6`. This option is ignored if build is True. +The default is `13.2`. This option is ignored if build is True. - __environment__: Boolean flag to specify whether the environment (`CPATH`, `LD_LIBRARY_PATH`, `LIBRARY_PATH`, and `PATH`) should be @@ -2980,7 +2980,7 @@ repository. The default is empty, i.e., use the release package specified by `version`. - __version__: The version of NCCL to install. The default value is -`2.12.10-1`. +`2.29.7-1`. 
__Examples__ diff --git a/hpccm/building_blocks/nccl.py b/hpccm/building_blocks/nccl.py index 64bb210..6613cb5 100644 --- a/hpccm/building_blocks/nccl.py +++ b/hpccm/building_blocks/nccl.py @@ -31,8 +31,7 @@ from hpccm.building_blocks.base import bb_base from hpccm.building_blocks.generic_build import generic_build from hpccm.building_blocks.packages import packages -from hpccm.common import linux_distro -from hpccm.config import get_cpu_architecture +from hpccm.common import cpu_arch, linux_distro from hpccm.primitives.comment import comment from hpccm.primitives.copy import copy from hpccm.primitives.environment import environment @@ -56,7 +55,7 @@ class nccl(bb_base, hpccm.templates.downloader, hpccm.templates.envvars, use the latest commit on the default branch for the repository. cuda: Flag to specify the CUDA version of the package to download. - The default is `11.6`. This option is ignored if build is True. + The default is `13.2`. This option is ignored if build is True. environment: Boolean flag to specify whether the environment (`CPATH`, `LD_LIBRARY_PATH`, `LIBRARY_PATH`, and `PATH`) should be @@ -78,7 +77,7 @@ class nccl(bb_base, hpccm.templates.downloader, hpccm.templates.envvars, specified by `version`. version: The version of NCCL to install. The default value is - `2.12.10-1`. + `2.29.7-1`. 
# Examples @@ -97,25 +96,33 @@ def __init__(self, **kwargs): super(nccl, self).__init__(**kwargs) + self.__arch_label = '' # Filled in by __cpu_arch self.__baseurl = kwargs.pop('baseurl', 'https://github.com/NVIDIA/nccl/archive') self.__build = kwargs.pop('build', False) self.__build_environment = '' # Filled in by __configure self.__default_repository = 'https://github.com/NVIDIA/nccl.git' self.__distro_label = '' # Filled in by __distro - self.__cuda = kwargs.pop('cuda', '11.6') + self.__cuda = kwargs.pop('cuda', '13.2') self.__make_variables = kwargs.pop('make_variables', {}) self.__ospackages = kwargs.pop('ospackages', []) self.__prefix = kwargs.pop('prefix', '/usr/local/nccl') + self.__repo_key = '' # Filled in by __repo self.__src_directory = kwargs.pop('src_directory', None) - self.__version = kwargs.pop('version', '2.12.10-1') + self.__version = kwargs.pop('version', '2.29.7-1') self.__wd = kwargs.get('wd', hpccm.config.g_wd) # working directory if not self.__build: # Install prebuild package + # Set the CPU architecture specific parameters + self.__cpu_arch() + # Set the Linux distribution specific parameters self.__distro() + # Set the repo key + self.__repo() + self += comment('NCCL {}'.format(self.__version)) self += packages(ospackages=self.__ospackages) self += packages( @@ -123,13 +130,13 @@ def __init__(self, **kwargs): self.__cuda), 'libnccl-dev={0}+cuda{1}'.format(self.__version, self.__cuda)], - apt_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/3bf863cc.pub'.format(self.__distro_label, get_cpu_architecture())], - apt_repositories=['deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1} /'.format(self.__distro_label, get_cpu_architecture())], + apt_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/{2}'.format(self.__distro_label, self.__arch_label, self.__repo_key)], + apt_repositories=['deb [signed-by=/usr/share/keyrings/{2}]
https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1} /'.format(self.__distro_label, self.__arch_label, self.__repo_key.replace('.pub', '.gpg'))], yum=['libnccl-{0}+cuda{1}'.format(self.__version, self.__cuda), 'libnccl-devel-{0}+cuda{1}'.format(self.__version, self.__cuda)], - yum_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/3bf863cc.pub'.format(self.__distro_label, get_cpu_architecture())], - yum_repositories=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}'.format(self.__distro_label, get_cpu_architecture())]) + yum_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/{2}'.format(self.__distro_label, self.__arch_label, self.__repo_key)], + yum_repositories=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}'.format(self.__distro_label, self.__arch_label)]) else: # Build from source @@ -166,6 +173,17 @@ def __init__(self, **kwargs): self += packages(ospackages=self.__ospackages) self += self.__bb + def __cpu_arch(self): + """Based on the CPU architecture, set values accordingly. 
Only + aarch64 and x86_64 are supported.""" + + if hpccm.config.g_cpu_arch == cpu_arch.AARCH64: + self.__arch_label = 'sbsa' + elif hpccm.config.g_cpu_arch == cpu_arch.X86_64: + self.__arch_label = 'x86_64' + else: # pragma: no cover + raise RuntimeError('Unknown CPU architecture') + def __configure(self): """Setup build options based on user parameters""" @@ -192,16 +210,20 @@ def __distro(self): self.__ospackages = ['apt-transport-https', 'ca-certificates', 'gnupg', 'wget'] - if hpccm.config.g_linux_version >= Version('18.0'): - self.__distro_label = 'ubuntu1804' + if hpccm.config.g_linux_version >= Version('24.0'): + self.__distro_label = 'ubuntu2404' + elif hpccm.config.g_linux_version >= Version('22.0'): + self.__distro_label = 'ubuntu2204' else: - self.__distro_label = 'ubuntu1604' + self.__distro_label = 'ubuntu2004' elif hpccm.config.g_linux_distro == linux_distro.CENTOS: - if hpccm.config.g_linux_version >= Version('8.0'): - self.__distro_label = 'rhel8' + if hpccm.config.g_linux_version >= Version('10.0'): + self.__distro_label = 'rhel10' + elif hpccm.config.g_linux_version >= Version('9.0'): + self.__distro_label = 'rhel9' else: - self.__distro_label = 'rhel7' + self.__distro_label = 'rhel8' else: # pragma: no cover raise RuntimeError('Unknown Linux distribution') @@ -225,6 +247,21 @@ def __download(self): if not self.repository and not self.url: self.url = '{0}/v{1}.tar.gz'.format(self.__baseurl, self.__version) + def __repo(self): + """Based on the Linux distribution label and version, set the name + of the repository public key file.
+ """ + + if self.__distro_label.startswith('ubuntu'): + self.__repo_key = '3bf863cc.pub' + elif self.__distro_label.startswith('rhel'): + if hpccm.config.g_linux_version >= Version('10.0'): + self.__repo_key = 'CDF6BA43.pub' + else: + self.__repo_key = 'D42D0685.pub' + else: # pragma: no cover + raise RuntimeError('Unknown repository') + def runtime(self, _from='0'): """Generate the set of instructions to install the runtime specific components from a build in a previous stage. @@ -246,10 +283,10 @@ def runtime(self, _from='0'): self.rt += packages( apt=['libnccl2={0}+cuda{1}'.format(self.__version, self.__cuda)], - apt_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/3bf863cc.pub'.format(self.__distro_label, get_cpu_architecture())], - apt_repositories=['deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1} /'.format(self.__distro_label, get_cpu_architecture())], + apt_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/{2}'.format(self.__distro_label, self.__arch_label, self.__repo_key)], + apt_repositories=['deb [signed-by=/usr/share/keyrings/{2}] https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1} /'.format(self.__distro_label, self.__arch_label, self.__repo_key.replace('.pub', '.gpg'))], yum=['libnccl-{0}+cuda{1}'.format(self.__version, self.__cuda)], - yum_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/3bf863cc.pub'.format(self.__distro_label, get_cpu_architecture())], - yum_repositories=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}'.format(self.__distro_label, get_cpu_architecture())]) + yum_keys=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}/{2}'.format(self.__distro_label, self.__arch_label, self.__repo_key)], + yum_repositories=['https://developer.download.nvidia.com/compute/cuda/repos/{0}/{1}'.format(self.__distro_label, self.__arch_label)]) return str(self.rt) diff --git 
a/test/test_nccl.py b/test/test_nccl.py index 2a61b28..c11ca93 100644 --- a/test/test_nccl.py +++ b/test/test_nccl.py @@ -22,7 +22,7 @@ import logging # pylint: disable=unused-import import unittest -from helpers import centos, centos8, docker, ppc64le, ubuntu, ubuntu18, x86_64 +from helpers import aarch64, centos8, rockylinux9, rockylinux10, docker, ubuntu20, ubuntu22, ubuntu24, x86_64 from hpccm.building_blocks.nccl import nccl @@ -32,13 +32,13 @@ def setUp(self): logging.disable(logging.ERROR) @x86_64 - @ubuntu + @ubuntu24 @docker def test_defaults_ubuntu(self): """nccl defaults""" n = nccl() - self.assertEqual(str(n), -r'''# NCCL 2.12.10-1 + self.assertMultiLineEqual(str(n), +r'''# NCCL 2.29.7-1 RUN apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ apt-transport-https \ @@ -48,22 +48,22 @@ def test_defaults_ubuntu(self): rm -rf /var/lib/apt/lists/* RUN mkdir -p /usr/share/keyrings && \ rm -f /usr/share/keyrings/3bf863cc.gpg && \ - wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ - echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" >> /etc/apt/sources.list.d/hpccm.list && \ + wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64 /" >> /etc/apt/sources.list.d/hpccm.list && \ apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - libnccl-dev=2.12.10-1+cuda11.6 \ - libnccl2=2.12.10-1+cuda11.6 && \ + libnccl-dev=2.29.7-1+cuda13.2 \ + libnccl2=2.29.7-1+cuda13.2 && \ rm -rf /var/lib/apt/lists/*''') @x86_64 - @ubuntu18 + @ubuntu22 @docker - def 
test_defaults_ubuntu18(self): + def test_defaults_ubuntu22(self): """nccl defaults""" n = nccl() - self.assertEqual(str(n), -r'''# NCCL 2.12.10-1 + self.assertMultiLineEqual(str(n), +r'''# NCCL 2.29.7-1 RUN apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ apt-transport-https \ @@ -73,22 +73,22 @@ def test_defaults_ubuntu18(self): rm -rf /var/lib/apt/lists/* RUN mkdir -p /usr/share/keyrings && \ rm -f /usr/share/keyrings/3bf863cc.gpg && \ - wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ - echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" >> /etc/apt/sources.list.d/hpccm.list && \ + wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 /" >> /etc/apt/sources.list.d/hpccm.list && \ apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - libnccl-dev=2.12.10-1+cuda11.6 \ - libnccl2=2.12.10-1+cuda11.6 && \ + libnccl-dev=2.29.7-1+cuda13.2 \ + libnccl2=2.29.7-1+cuda13.2 && \ rm -rf /var/lib/apt/lists/*''') - @ppc64le - @ubuntu + @aarch64 + @ubuntu24 @docker - def test_ubuntu_ppc64le(self): - """nccl ppc64le""" - n = nccl(cuda=9.2, version='2.4.8-1') - self.assertEqual(str(n), -r'''# NCCL 2.4.8-1 + def test_ubuntu_aarch64(self): + """nccl aarch64""" + n = nccl(cuda=13.2, version='2.29.7-1') + self.assertMultiLineEqual(str(n), +r'''# NCCL 2.29.7-1 RUN apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ apt-transport-https \ @@ -98,51 +98,51 @@ def test_ubuntu_ppc64le(self): rm -rf /var/lib/apt/lists/* RUN mkdir -p 
/usr/share/keyrings && \ rm -f /usr/share/keyrings/3bf863cc.gpg && \ - wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/ppc64el/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ - echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/ppc64el /" >> /etc/apt/sources.list.d/hpccm.list && \ + wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa /" >> /etc/apt/sources.list.d/hpccm.list && \ apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - libnccl-dev=2.4.8-1+cuda9.2 \ - libnccl2=2.4.8-1+cuda9.2 && \ + libnccl-dev=2.29.7-1+cuda13.2 \ + libnccl2=2.29.7-1+cuda13.2 && \ rm -rf /var/lib/apt/lists/*''') @x86_64 - @ubuntu + @ubuntu24 @docker def test_build_ubuntu(self): """nccl build""" - n = nccl(build=True) - self.assertEqual(str(n), + n = nccl(build=True, version='2.29.7-1') + self.assertMultiLineEqual(str(n), r'''# NCCL RUN apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ make \ wget && \ rm -rf /var/lib/apt/lists/* -RUN mkdir -p /var/tmp && wget -q -nc -P /var/tmp https://github.com/NVIDIA/nccl/archive/v2.12.10-1.tar.gz && \ - mkdir -p /var/tmp && tar -x -f /var/tmp/v2.12.10-1.tar.gz -C /var/tmp -z && \ - cd /var/tmp/nccl-2.12.10-1 && \ +RUN mkdir -p /var/tmp && wget -q -nc -P /var/tmp https://github.com/NVIDIA/nccl/archive/v2.29.7-1.tar.gz && \ + mkdir -p /var/tmp && tar -x -f /var/tmp/v2.29.7-1.tar.gz -C /var/tmp -z && \ + cd /var/tmp/nccl-2.29.7-1 && \ PREFIX=/usr/local/nccl make -j$(nproc) install && \ - rm -rf /var/tmp/nccl-2.12.10-1 /var/tmp/v2.12.10-1.tar.gz + rm -rf /var/tmp/nccl-2.29.7-1 /var/tmp/v2.29.7-1.tar.gz ENV 
CPATH=/usr/local/nccl/include:$CPATH \ LD_LIBRARY_PATH=/usr/local/nccl/lib:$LD_LIBRARY_PATH \ LIBRARY_PATH=/usr/local/nccl/lib:$LIBRARY_PATH \ PATH=/usr/local/nccl/bin:$PATH''') @x86_64 - @centos + @rockylinux10 @docker - def test_defaults_centos(self): + def test_defaults_rockylinux10(self): """nccl defaults""" n = nccl() - self.assertEqual(str(n), -r'''# NCCL 2.12.10-1 -RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/3bf863cc.pub && \ - yum install -y yum-utils && \ - yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64 && \ + self.assertMultiLineEqual(str(n), +r'''# NCCL 2.29.7-1 +RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/CDF6BA43.pub && \ + yum install -y dnf-utils && \ + yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64 && \ yum install -y \ - libnccl-2.12.10-1+cuda11.6 \ - libnccl-devel-2.12.10-1+cuda11.6 && \ + libnccl-2.29.7-1+cuda13.2 \ + libnccl-devel-2.29.7-1+cuda13.2 && \ rm -rf /var/cache/yum/*''') @x86_64 @@ -150,10 +150,10 @@ def test_defaults_centos(self): @docker def test_defaults_centos8(self): """nccl defaults""" - n = nccl() - self.assertEqual(str(n), + n = nccl(version='2.12.10-1', cuda='11.6') + self.assertMultiLineEqual(str(n), r'''# NCCL 2.12.10-1 -RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/3bf863cc.pub && \ +RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub && \ yum install -y dnf-utils && \ yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64 && \ yum install -y \ @@ -162,13 +162,13 @@ def test_defaults_centos8(self): rm -rf /var/cache/yum/*''') @x86_64 - @ubuntu + @ubuntu24 @docker def test_build_repo_ubuntu(self): """nccl build from git""" n = nccl(build=True, make_variables={'CUDA_HOME': '/usr/local/cuda'}, 
repository=True) - self.assertEqual(str(n), + self.assertMultiLineEqual(str(n), r'''# NCCL RUN apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ @@ -185,12 +185,12 @@ def test_build_repo_ubuntu(self): LIBRARY_PATH=/usr/local/nccl/lib:$LIBRARY_PATH \ PATH=/usr/local/nccl/bin:$PATH''') - @centos + @rockylinux9 @docker - def test_build_centos(self): + def test_build_rocky9(self): """nccl build""" n = nccl(build=True, version='2.7.6-1') - self.assertEqual(str(n), + self.assertMultiLineEqual(str(n), r'''# NCCL RUN yum install -y \ make \ @@ -208,13 +208,13 @@ def test_build_centos(self): PATH=/usr/local/nccl/bin:$PATH''') @x86_64 - @ubuntu + @ubuntu24 @docker def test_runtime(self): """Runtime""" n = nccl() r = n.runtime() - self.assertEqual(r, + self.assertMultiLineEqual(r, r'''# NCCL RUN apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ @@ -225,21 +225,21 @@ def test_runtime(self): rm -rf /var/lib/apt/lists/* RUN mkdir -p /usr/share/keyrings && \ rm -f /usr/share/keyrings/3bf863cc.gpg && \ - wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ - echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" >> /etc/apt/sources.list.d/hpccm.list && \ + wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/3bf863cc.pub | gpg --dearmor -o /usr/share/keyrings/3bf863cc.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/3bf863cc.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64 /" >> /etc/apt/sources.list.d/hpccm.list && \ apt-get update -y && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - libnccl2=2.12.10-1+cuda11.6 && \ + libnccl2=2.29.7-1+cuda13.2 && \ rm -rf /var/lib/apt/lists/*''') @x86_64 - @ubuntu + 
@ubuntu24 @docker def test_build_runtime(self): """Runtime""" n = nccl(build=True) r = n.runtime() - self.assertEqual(r, + self.assertMultiLineEqual(r, r'''# NCCL COPY --from=0 /usr/local/nccl /usr/local/nccl ENV CPATH=/usr/local/nccl/include:$CPATH \