diff --git a/slurm/defaults/main.yml b/slurm/defaults/main.yml
index 5a6e239..22980b6 100644
--- a/slurm/defaults/main.yml
+++ b/slurm/defaults/main.yml
@@ -2,6 +2,11 @@
 # Build RPM Packages - when true, this will biuld RPM packages for slurm
 # and optionally PMIX then exit the role and complete.
 
+# CUDA Details
+# slurm_cuda_toolkit may name cuda-toolkit-<ver> or cuda-nvml-devel-<ver>.
+slurm_cuda_toolkit_version: "13-0"
+slurm_cuda_toolkit: "cuda-toolkit-{{ slurm_cuda_toolkit_version }}"
+
 # RPM Build Details
 #======================
 slurm_build_rpms: false
@@ -10,6 +15,10 @@ slurm_rpmbuild_user_home: "/home/{{ slurm_rpmbuild_user }}"
 slurm_rpm_final_path: "/home/{{ slurm_rpmbuild_user }}/rpmbuild/RPMS/{{ ansible_architecture }}"
 slurm_local_repo_def_path: /etc/yum.repos.d/slurm.repo
 
+# RPM build: optional NVML support. When slurm_enable_nvml is true, this path
+# is passed to rpmbuild as --with-nvml=<path> so Slurm builds with NVML.
+slurm_rpmbuild_nvml_path: "/usr/local/cuda-{{ slurm_cuda_toolkit_version | replace('-', '.') }}"
+
 # Local Slurm Repo Details
 #==========================
 # This assumes an HTTP based RPM Repo
@@ -122,6 +131,7 @@ slurm_rpm_repo: ""
 slurm_build_jobs: 8
 slurm_enable_cgroup_conf: true
 slurm_enable_restd: true
+slurm_enable_nvml: false
 slurm_restd_port: 8911
 slurm_restd_host: "0.0.0.0"
 slurm_restd_user: srestd
diff --git a/slurm/tasks/pre_install.yaml b/slurm/tasks/pre_install.yaml
index e13b9e2..3103f2b 100644
--- a/slurm/tasks/pre_install.yaml
+++ b/slurm/tasks/pre_install.yaml
@@ -83,6 +83,39 @@
     - slurm_install_controller
     - slurm_install_slurmd
 
+# The CUDA repo is added before the package install so the packages resolve.
+# Only EL9 is configured here; other releases need a CUDA repo set up elsewhere.
+- name: Add NVIDIA CUDA repo for RHEL 9
+  ansible.builtin.yum_repository:
+    name: cuda-rhel9-x86_64
+    description: NVIDIA CUDA repository for RHEL 9 x86_64
+    baseurl: https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64
+    enabled: true
+    gpgcheck: true
+    gpgkey: https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/D42D0685.pub
+    state: present
+  when:
+    - slurm_enable_nvml | bool
+    - ansible_distribution_major_version | int == 9
+  tags:
+    - slurm
+    - slurm_install_controller
+    - slurm_install_slurmd
+
+# Installs every package in slurm_cuda_packages in a single dnf transaction.
+- name: Install nvml packages
+  ansible.builtin.dnf:
+    name: "{{ slurm_cuda_packages }}"
+    state: present
+    enablerepo: "{{ slurm_el_repos }}"
+  when:
+    - slurm_enable_nvml | bool
+    - ansible_distribution_major_version | int >= 8
+  tags:
+    - slurm
+    - slurm_install_controller
+    - slurm_install_slurmd
+
 - name: Add slurm group
   ansible.builtin.group:
     name: slurm
diff --git a/slurm/tasks/rpmbuild_slurm.yaml b/slurm/tasks/rpmbuild_slurm.yaml
index 36fa381..2ca5f32 100644
--- a/slurm/tasks/rpmbuild_slurm.yaml
+++ b/slurm/tasks/rpmbuild_slurm.yaml
@@ -114,9 +114,22 @@
   ansible.builtin.debug:
     msg: "rpmbuild command: {{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} -ta slurm-{{ slurm_version }}.tar.bz2"
 
+- name: rpmbuild nvml option default to ""
+  ansible.builtin.set_fact:
+    rpmbuild_nvml_option: ""
+
+- name: rpmbuild nvml option
+  ansible.builtin.set_fact:
+    rpmbuild_nvml_option: "-D '_with_nvml --with-nvml={{ slurm_rpmbuild_nvml_path }}'"
+  when: slurm_enable_nvml | bool
+
+- name: show build command with nvml
+  ansible.builtin.debug:
+    msg: "rpmbuild command: {{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} {{ rpmbuild_nvml_option }} -ta slurm-{{ slurm_version }}.tar.bz2"
+
 - name: rpmbuild from source tarball
-  command: "{{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} -ta slurm-{{ slurm_version }}.tar.bz2"
+  ansible.builtin.command: "{{ rpmbuild_cmd }} {{ rpmbuild_pmix_option }} {{ rpmbuild_restd_option }} {{ rpmbuild_nvml_option }} -ta slurm-{{ slurm_version }}.tar.bz2"
   args:
     chdir: "{{slurm_tmpdir}}"
   become: true
-  become_user: "{{ slurm_rpmbuild_user }}"
\ No newline at end of file
+  become_user: "{{ slurm_rpmbuild_user }}"
diff --git a/slurm/vars/RedHat-7.yaml b/slurm/vars/RedHat-7.yaml
index 1062562..3166e8c 100644
--- a/slurm/vars/RedHat-7.yaml
+++ b/slurm/vars/RedHat-7.yaml
@@ -21,6 +21,9 @@ slurm_required_packages:
   - pmix
   - pmix-devel
 
+slurm_cuda_packages:
+  - "{{ slurm_cuda_toolkit }}"
+
 slurm_dbd_required_packages:
   - mariadb-server
   - mariadb-devel
diff --git a/slurm/vars/RedHat-8.yaml b/slurm/vars/RedHat-8.yaml
index 0d85466..52a2437 100644
--- a/slurm/vars/RedHat-8.yaml
+++ b/slurm/vars/RedHat-8.yaml
@@ -22,6 +22,9 @@ slurm_required_packages:
   - lua-devel
   - pam-devel
 
+slurm_cuda_packages:
+  - "{{ slurm_cuda_toolkit }}"
+
 slurm_dbd_required_packages:
   - mariadb-server
   - mariadb-devel
diff --git a/slurm/vars/RedHat-9.yaml b/slurm/vars/RedHat-9.yaml
index 01dd223..4d25a2e 100644
--- a/slurm/vars/RedHat-9.yaml
+++ b/slurm/vars/RedHat-9.yaml
@@ -14,6 +14,9 @@ slurm_required_packages:
   - s-nail
   - lua
 
+slurm_cuda_packages:
+  - "{{ slurm_cuda_toolkit }}"
+
 slurm_required_devel_packages:
   - zlib-devel
   - bzip2-devel