From 025e0cf84e168f234995838e560740a761577d20 Mon Sep 17 00:00:00 2001 From: Chirs Ruihl Date: Tue, 14 Oct 2025 23:52:03 -0400 Subject: [PATCH 1/4] nvml build support --- slurm/defaults/main.yml | 5 +++++ slurm/tasks/pre_install.yaml | 27 +++++++++++++++++++++++++++ slurm/tasks/rpmbuild_slurm.yaml | 15 ++++++++++++++- slurm/vars/RedHat-7.yaml | 5 +++++ slurm/vars/RedHat-8.yaml | 5 +++++ slurm/vars/RedHat-9.yaml | 5 +++++ 6 files changed, 61 insertions(+), 1 deletion(-) diff --git a/slurm/defaults/main.yml b/slurm/defaults/main.yml index 5a6e239..ca457dc 100644 --- a/slurm/defaults/main.yml +++ b/slurm/defaults/main.yml @@ -10,6 +10,10 @@ slurm_rpmbuild_user_home: "/home/{{ slurm_rpmbuild_user }}" slurm_rpm_final_path: "/home/{{ slurm_rpmbuild_user }}/rpmbuild/RPMS/{{ ansible_architecture }}" slurm_local_repo_def_path: /etc/yum.repos.d/slurm.repo +# RPM build: optional NVML support. When set and slurm_enable_nvml is true, +# the rpmbuild command will be passed --with-nvml= so Slurm builds with NVML. +slurm_rpmbuild_nvml_path: "" + # Local Slurm Repo Details #========================== # This assumes an HTTP based RPM Repo @@ -122,6 +126,7 @@ slurm_rpm_repo: "" slurm_build_jobs: 8 slurm_enable_cgroup_conf: true slurm_enable_restd: true +slurm_enable_nvml: false slurm_restd_port: 8911 slurm_restd_host: "0.0.0.0" slurm_restd_user: srestd diff --git a/slurm/tasks/pre_install.yaml b/slurm/tasks/pre_install.yaml index e13b9e2..526bbd3 100644 --- a/slurm/tasks/pre_install.yaml +++ b/slurm/tasks/pre_install.yaml @@ -83,6 +83,33 @@ - slurm_install_controller - slurm_install_slurmd +- name: Install nvml packages + ansible.builtin.dnf: + name: "{{item}}" + state: present + enablerepo: "{{ slurm_el_repos }}" + loop: "{{ slurm_cuda_packages }}" + when: slurm_enable_nvml and ansible_distribution_major_version | int >= 8 + tags: + - slurm + - slurm_install_controller + - slurm_install_slurmd + +- name: Add NVIDIA CUDA repo for RHEL 9 + ansible.builtin.yum_repository: + name: cuda-rhel9-x86_64 + description: NVIDIA CUDA repository for RHEL 9 x86_64 + baseurl: https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64 + enabled: yes + gpgcheck: yes + gpgkey: https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/D42D0685.pub + state: present + when: slurm_enable_nvml and ansible_distribution_major_version | int == 9 + tags: + - slurm + - slurm_install_controller + - slurm_install_slurmd + - name: Add slurm group ansible.builtin.group: name: slurm diff --git a/slurm/tasks/rpmbuild_slurm.yaml b/slurm/tasks/rpmbuild_slurm.yaml index 36fa381..6398f3d 100644 --- a/slurm/tasks/rpmbuild_slurm.yaml +++ b/slurm/tasks/rpmbuild_slurm.yaml @@ -114,8 +114,21 @@ ansible.builtin.debug: msg: "rpmbuild command: {{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} -ta slurm-{{ slurm_version }}.tar.bz2" +- name: rpmbuild nvml option default to "" + ansible.builtin.set_fact: + rpmbuild_nvml_option: "" + +- name: rpmbuild nvml option + ansible.builtin.set_fact: + rpmbuild_nvml_option: "--with-nvml={{ slurm_rpmbuild_nvml_path }}" + when: slurm_enable_nvml and slurm_rpmbuild_nvml_path | length > 0 + +- name: show build command with nvml + ansible.builtin.debug: + msg: "rpmbuild command: {{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} {{ rpmbuild_nvml_option }} -ta slurm-{{ slurm_version }}.tar.bz2" + - name: rpmbuild from source tarball - command: "{{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} -ta slurm-{{ slurm_version }}.tar.bz2" + command: "{{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} {{ rpmbuild_nvml_option }} -ta slurm-{{ slurm_version }}.tar.bz2" args: chdir: "{{slurm_tmpdir}}" become: true diff --git a/slurm/vars/RedHat-7.yaml b/slurm/vars/RedHat-7.yaml index 1062562..514e387 100644 --- a/slurm/vars/RedHat-7.yaml +++ b/slurm/vars/RedHat-7.yaml @@ -21,6 +21,11 @@ slurm_required_packages: - pmix - pmix-devel +slurm_cuda_packages: + - nvidia-driver-cuda + - nvidia-driver-cuda-libs + - cuda-toolkit-12-8 + slurm_dbd_required_packages: - mariadb-server - mariadb-devel diff --git a/slurm/vars/RedHat-8.yaml b/slurm/vars/RedHat-8.yaml index 0d85466..2b32729 100644 --- a/slurm/vars/RedHat-8.yaml +++ b/slurm/vars/RedHat-8.yaml @@ -22,6 +22,11 @@ slurm_required_packages: - lua-devel - pam-devel +slurm_cuda_packages: + - nvidia-driver-cuda + - nvidia-driver-cuda-libs + - cuda-toolkit-12-8 + slurm_dbd_required_packages: - mariadb-server - mariadb-devel diff --git a/slurm/vars/RedHat-9.yaml b/slurm/vars/RedHat-9.yaml index 01dd223..1cdb7a7 100644 --- a/slurm/vars/RedHat-9.yaml +++ b/slurm/vars/RedHat-9.yaml @@ -14,6 +14,11 @@ slurm_required_packages: - s-nail - lua +slurm_cuda_packages: + - nvidia-driver-cuda + - nvidia-driver-cuda-libs + - cuda-toolkit-12-8 + slurm_required_devel_packages: - zlib-devel - bzip2-devel From 1df32c699b10d2f335716f7aa6eaf3cfdb06baa7 Mon Sep 17 00:00:00 2001 From: Chirs Ruihl Date: Wed, 29 Oct 2025 08:25:40 -0400 Subject: [PATCH 2/4] #22 add nvml build option to slurm --- slurm/defaults/main.yml | 7 ++++++- slurm/tasks/pre_install.yaml | 3 ++- slurm/tasks/rpmbuild_slurm.yaml | 6 +++--- slurm/vars/RedHat-9.yaml | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/slurm/defaults/main.yml b/slurm/defaults/main.yml index ca457dc..3dad785 100644 --- a/slurm/defaults/main.yml +++ b/slurm/defaults/main.yml @@ -2,6 +2,11 @@ # Build RPM Packages - when true, this will biuld RPM packages for slurm # and optionally PMIX then exit the role and complete. +# CUDA Details +slurm_cuda_toolkit: "cuda-toolkit-13.0" +slurm_cuda_driver: "nvidia-driver-cuda-580.95.05-1.el9.x86_64" +slurm_cuda_toolkit_version: "13-0" + # RPM Build Details #====================== slurm_build_rpms: false @@ -12,7 +17,7 @@ slurm_local_repo_def_path: /etc/yum.repos.d/slurm.repo # RPM build: optional NVML support. When set and slurm_enable_nvml is true, # the rpmbuild command will be passed --with-nvml= so Slurm builds with NVML. -slurm_rpmbuild_nvml_path: "" +slurm_rpmbuild_nvml_path: "/usr/local/cuda-{{ slurm_cuda_toolkit_version|replace('-', '.') }}" # Local Slurm Repo Details #========================== diff --git a/slurm/tasks/pre_install.yaml b/slurm/tasks/pre_install.yaml index 526bbd3..3103f2b 100644 --- a/slurm/tasks/pre_install.yaml +++ b/slurm/tasks/pre_install.yaml @@ -89,7 +89,8 @@ state: present enablerepo: "{{ slurm_el_repos }}" loop: "{{ slurm_cuda_packages }}" - when: slurm_enable_nvml and ansible_distribution_major_version | int >= 8 + when: false +#slurm_enable_nvml and ansible_distribution_major_version | int >= 8 tags: - slurm - slurm_install_controller diff --git a/slurm/tasks/rpmbuild_slurm.yaml b/slurm/tasks/rpmbuild_slurm.yaml index 6398f3d..2ca5f32 100644 --- a/slurm/tasks/rpmbuild_slurm.yaml +++ b/slurm/tasks/rpmbuild_slurm.yaml @@ -120,8 +120,8 @@ - name: rpmbuild nvml option ansible.builtin.set_fact: - rpmbuild_nvml_option: "--with-nvml={{ slurm_rpmbuild_nvml_path }}" - when: slurm_enable_nvml and slurm_rpmbuild_nvml_path | length > 0 + rpmbuild_nvml_option: "-D '_with_nvml --with-nvml={{ slurm_rpmbuild_nvml_path }}'" + when: slurm_enable_nvml - name: show build command with nvml ansible.builtin.debug: @@ -132,4 +132,4 @@ args: chdir: "{{slurm_tmpdir}}" become: true - become_user: "{{ slurm_rpmbuild_user }}" \ No newline at end of file + become_user: "{{ slurm_rpmbuild_user }}" \ No newline at end of file diff --git a/slurm/vars/RedHat-9.yaml b/slurm/vars/RedHat-9.yaml index 1cdb7a7..82e145d 100644 --- a/slurm/vars/RedHat-9.yaml +++ b/slurm/vars/RedHat-9.yaml @@ -17,7 +17,7 @@ slurm_required_packages: slurm_cuda_packages: - nvidia-driver-cuda - nvidia-driver-cuda-libs - - cuda-toolkit-12-8 + - cuda-toolkit-{{ slurm_cuda_toolkit_version }} slurm_required_devel_packages: - zlib-devel From a574b95d1c600c788803b67319d8e8a5f594e87b Mon Sep 17 00:00:00 2001 From: Chirs Ruihl Date: Wed, 29 Oct 2025 11:08:44 -0400 Subject: [PATCH 3/4] #22 Update --- slurm/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm/defaults/main.yml b/slurm/defaults/main.yml index 3dad785..22980b6 100644 --- a/slurm/defaults/main.yml +++ b/slurm/defaults/main.yml @@ -2,9 +2,9 @@ # Build RPM Packages - when true, this will biuld RPM packages for slurm # and optionally PMIX then exit the role and complete. +# You can use cuda-toolkit or cuda-nvml-devel # CUDA Details slurm_cuda_toolkit: "cuda-toolkit-13.0" -slurm_cuda_driver: "nvidia-driver-cuda-580.95.05-1.el9.x86_64" slurm_cuda_toolkit_version: "13-0" # RPM Build Details From d48b8ded0e582783a0de4e465091a26ad430fabc Mon Sep 17 00:00:00 2001 From: Chirs Ruihl Date: Wed, 29 Oct 2025 11:20:33 -0400 Subject: [PATCH 4/4] #22 Update slurm_cuda_packate --- slurm/vars/RedHat-7.yaml | 4 +--- slurm/vars/RedHat-8.yaml | 4 +--- slurm/vars/RedHat-9.yaml | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/slurm/vars/RedHat-7.yaml b/slurm/vars/RedHat-7.yaml index 514e387..3166e8c 100644 --- a/slurm/vars/RedHat-7.yaml +++ b/slurm/vars/RedHat-7.yaml @@ -22,9 +22,7 @@ slurm_required_packages: - pmix-devel slurm_cuda_packages: - - nvidia-driver-cuda - - nvidia-driver-cuda-libs - - cuda-toolkit-12-8 + - "{{ slurm_cuda_toolkit }}" slurm_dbd_required_packages: - mariadb-server diff --git a/slurm/vars/RedHat-8.yaml b/slurm/vars/RedHat-8.yaml index 2b32729..52a2437 100644 --- a/slurm/vars/RedHat-8.yaml +++ b/slurm/vars/RedHat-8.yaml @@ -23,9 +23,7 @@ slurm_required_packages: - pam-devel slurm_cuda_packages: - - nvidia-driver-cuda - - nvidia-driver-cuda-libs - - cuda-toolkit-12-8 + - "{{ slurm_cuda_toolkit }}" slurm_dbd_required_packages: - mariadb-server diff --git a/slurm/vars/RedHat-9.yaml b/slurm/vars/RedHat-9.yaml index 82e145d..4d25a2e 100644 --- a/slurm/vars/RedHat-9.yaml +++ b/slurm/vars/RedHat-9.yaml @@ -15,9 +15,7 @@ slurm_required_packages: - lua slurm_cuda_packages: - - nvidia-driver-cuda - - nvidia-driver-cuda-libs - - cuda-toolkit-{{ slurm_cuda_toolkit_version }} + - "{{ slurm_cuda_toolkit }}" slurm_required_devel_packages: - zlib-devel