From ad2184e1a84a4bedbabbfe74f33f50b5f160df47 Mon Sep 17 00:00:00 2001
From: Sylvain Bauza
Date: Thu, 19 Sep 2024 15:54:24 +0200
Subject: [PATCH] Initial POC for installing nvidia driver on EDPM nodes

TODO:
- molecule tests
- docs
- more checks
---
 galaxy.yml                                    |  2 +-
 playbooks/nvidia-mdev.yml                     | 73 +++++++++++++++++
 roles/edpm_nvidia_mdev_prepare/README.md      | 38 +++++++++
 .../defaults/main.yml                         | 28 +++++++
 roles/edpm_nvidia_mdev_prepare/files/.gitkeep |  0
 .../files/nvidia-sriov-manage@.service        | 26 +++++++
 .../handlers/main.yml                         | 15 ++++
 roles/edpm_nvidia_mdev_prepare/meta/main.yml  | 30 +++++++
 .../molecule/default/converge.yml             | 29 +++++++
 .../molecule/default/molecule.yml             | 11 +++
 .../molecule/default/prepare.yml              | 21 +++++
 .../tasks/cleanup.yml                         | 19 +++++
 roles/edpm_nvidia_mdev_prepare/tasks/main.yml | 16 ++++
 .../edpm_nvidia_mdev_prepare/tasks/phase1.yml | 78 +++++++++++++++++++
 .../edpm_nvidia_mdev_prepare/tasks/phase2.yml | 31 ++++++++
 roles/edpm_nvidia_mdev_prepare/vars/main.yml  | 22 ++++++
 zuul.d/molecule.yaml                          | 11 +++
 zuul.d/projects.yaml                          |  1 +
 18 files changed, 450 insertions(+), 1 deletion(-)
 create mode 100644 playbooks/nvidia-mdev.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/README.md
 create mode 100644 roles/edpm_nvidia_mdev_prepare/defaults/main.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/files/.gitkeep
 create mode 100644 roles/edpm_nvidia_mdev_prepare/files/nvidia-sriov-manage@.service
 create mode 100644 roles/edpm_nvidia_mdev_prepare/handlers/main.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/meta/main.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/molecule/default/converge.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/molecule/default/molecule.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/molecule/default/prepare.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/tasks/cleanup.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/tasks/main.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/tasks/phase1.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/tasks/phase2.yml
 create mode 100644 roles/edpm_nvidia_mdev_prepare/vars/main.yml

diff --git a/galaxy.yml b/galaxy.yml
index 59e5b351ce..6f2c5f3e49 100644
--- a/galaxy.yml
+++ b/galaxy.yml
@@ -8,7 +8,7 @@ namespace: cifmw
 name: general
 
 # The version of the collection. Must be compatible with semantic versioning
-version: 1.0.0
+version: 1.0.0+15f75128
 
 # The path to the Markdown (.md) readme file. This path is relative to the root of the collection
 readme: README.md
diff --git a/playbooks/nvidia-mdev.yml b/playbooks/nvidia-mdev.yml
new file mode 100644
index 0000000000..e2610b22ce
--- /dev/null
+++ b/playbooks/nvidia-mdev.yml
@@ -0,0 +1,73 @@
+---
+# Copyright 2024 Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+
+# First play: list the OpenStackBaremetalSet resources with `oc` and add every
+# baremetal host they describe to the in-memory inventory.
+- name: Gather the list of EDPM computes
+  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
+  gather_facts: false
+  tasks:
+    - name: Fetch OSP BMO nodesets
+      environment:
+        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+        PATH: "{{ cifmw_path }}"
+      ansible.builtin.command:
+        cmd: >-
+          oc get OpenStackBaremetalSet -n "{{ namespace | default('openstack') }}" -o yaml
+      register: _osp_bmo_nodesets_oc_out
+      # Read-only query: never report this task as changed.
+      changed_when: false
+
+    # Each host joins the group named after the first dash-separated token of
+    # its name plus "s", which is how "compute-*" hosts end up in "computes".
+    - name: Add OSP BMO nodesets to Ansible
+      ansible.builtin.add_host:
+        name: "{{ item.name }}"
+        groups: "{{ item.group }}"
+        ansible_ssh_user: "{{ item.user }}"
+        ansible_host: "{{ item.ip }}"
+        ansible_ssh_private_key_file: "{{ ansible_user_dir }}/.ssh/id_cifw"
+        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
+      loop: >-
+        {% set hosts = [] -%}
+        {% set nodesets = (_osp_bmo_nodesets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
+        {% for spec in nodesets | map(attribute='spec') -%}
+        {% for host_key, host_val in spec.baremetalHosts.items() -%}
+        {% set _ = hosts.append({
+          'name': host_key,
+          'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
+          'user': spec.cloudUserName,
+          'group': host_key | split('-') | first + 's'
+        }) -%}
+        {% endfor -%}
+        {% endfor -%}
+        {{ hosts }}
+
+# Second play: run both phases of the role on the "computes" group.
+- name: Run the Nvidia role
+  hosts: computes
+  tasks:
+    - name: Run phase1
+      ansible.builtin.import_role:
+        name: edpm_nvidia_mdev_prepare
+        # As a reminder, at the end of phase1, the compute will reboot
+        tasks_from: phase1
+
+    - name: Run phase 2
+      ansible.builtin.import_role:
+        name: edpm_nvidia_mdev_prepare
+        tasks_from: phase2
diff --git a/roles/edpm_nvidia_mdev_prepare/README.md b/roles/edpm_nvidia_mdev_prepare/README.md
new file mode 100644
index 0000000000..8d0d2c2b4a
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/README.md
@@ -0,0 +1,38 @@
+# edpm_nvidia_mdev_prepare
+Prepare EDPM compute nodes for NVIDIA mediated devices (mdev).
+
+The role is split into two task files that must be called with `tasks_from`:
+
+* `phase1`: blacklist the `nouveau` driver, install the NVIDIA driver RPM,
+  regenerate the initramfs and GRUB configuration when something changed,
+  then reboot the node.
+* `phase2`: install the `nvidia-sriov-manage@.service` systemd template unit
+  and enable/start one instance per configured SR-IOV device.
+
+## Privilege escalation
+`become: true` is used to write under `/etc/modprobe.d`, `/etc/systemd/system`
+and `/var/lib/openstack`, to install the driver RPM, to regenerate the
+initramfs and GRUB configuration, and to enable the systemd units.
+
+## Parameters
+* `cifmw_edpm_nvidia_mdev_prepare_disable_nouveau`: (Boolean) Blacklist the `nouveau` kernel module. Defaults to `true`.
+* `cifmw_edpm_nvidia_mdev_prepare_driver_url`: (String) URL or path of the NVIDIA driver RPM. Mandatory: `phase1` fails when it is empty. Defaults to `''`.
+* `cifmw_edpm_nvidia_mdev_prepare_package_name`: (String) Name of the NVIDIA package; the driver is installed only when this package is absent. Defaults to `NVIDIA-vGPU-rhel`.
+* `cifmw_edpm_nvidia_mdev_prepare_sriov_devices`: (List) Devices passed to `sriov-manage -e`, one `nvidia-sriov-manage@<device>.service` instance each. Defaults to `['ALL']`.
+
+## Examples
+```yaml
+- name: Install the NVIDIA driver and enable the SR-IOV virtual functions
+  hosts: computes
+  vars:
+    cifmw_edpm_nvidia_mdev_prepare_driver_url: "/path/to/the/nvidia-driver.rpm"
+  tasks:
+    - name: Run phase1
+      ansible.builtin.import_role:
+        name: edpm_nvidia_mdev_prepare
+        tasks_from: phase1
+
+    - name: Run phase 2
+      ansible.builtin.import_role:
+        name: edpm_nvidia_mdev_prepare
+        tasks_from: phase2
+```
diff --git a/roles/edpm_nvidia_mdev_prepare/defaults/main.yml b/roles/edpm_nvidia_mdev_prepare/defaults/main.yml
new file mode 100644
index 0000000000..a3c5e01b6d
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/defaults/main.yml
@@ -0,0 +1,28 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+# Should the nouveau driver be blacklisted on the node?
+cifmw_edpm_nvidia_mdev_prepare_disable_nouveau: true
+
+# URL or path of the nvidia driver RPM (mandatory: phase1 fails when empty).
+cifmw_edpm_nvidia_mdev_prepare_driver_url: ''
+
+# Name of the nvidia package; the driver is installed only when it is absent.
+cifmw_edpm_nvidia_mdev_prepare_package_name: "NVIDIA-vGPU-rhel"
+
+# SR-IOV GPU devices on which the VFs are enabled (one unit instance each).
+cifmw_edpm_nvidia_mdev_prepare_sriov_devices:
+  - ALL
diff --git a/roles/edpm_nvidia_mdev_prepare/files/.gitkeep b/roles/edpm_nvidia_mdev_prepare/files/.gitkeep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/roles/edpm_nvidia_mdev_prepare/files/nvidia-sriov-manage@.service b/roles/edpm_nvidia_mdev_prepare/files/nvidia-sriov-manage@.service
new file mode 100644
index 0000000000..6e1ced0bc1
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/files/nvidia-sriov-manage@.service
@@ -0,0 +1,26 @@
+[Unit]
+After = nvidia-vgpu-mgr.service
+After = nvidia-vgpud.service
+Description = Enable Nvidia GPU virtual functions
+
+[Service]
+Type = oneshot
+User = root
+Group = root
+ExecStart = /usr/lib/nvidia/sriov-manage -e %i
+# Give sriov-manage a reasonable amount of time to complete
+TimeoutSec = 120
+# Place the unit in system.slice.
+# The accounting options give us the ability to see resource usage
+# through the `systemd-cgtop` command.
+Slice = system.slice
+# Set Accounting (IOAccounting supersedes the deprecated BlockIOAccounting)
+CPUAccounting = True
+IOAccounting = True
+MemoryAccounting = True
+TasksAccounting = True
+RemainAfterExit = True
+ExecStartPre = /usr/bin/sleep 30
+
+[Install]
+WantedBy = multi-user.target
diff --git a/roles/edpm_nvidia_mdev_prepare/handlers/main.yml b/roles/edpm_nvidia_mdev_prepare/handlers/main.yml
new file mode 100644
index 0000000000..ae5261d4ab
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/handlers/main.yml
@@ -0,0 +1,15 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
diff --git a/roles/edpm_nvidia_mdev_prepare/meta/main.yml b/roles/edpm_nvidia_mdev_prepare/meta/main.yml
new file mode 100644
index 0000000000..888d205779
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/meta/main.yml
@@ -0,0 +1,30 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+
+galaxy_info:
+  author: CI Framework
+  description: CI Framework Role -- edpm_nvidia_mdev_prepare
+  company: Red Hat
+  license: Apache-2.0
+  min_ansible_version: "2.14"
+  namespace: cifmw
+  galaxy_tags:
+    - cifmw
+
+# List your role dependencies here, one per line. Be sure to remove the '[]' below,
+# if you add dependencies to this list.
+dependencies: []
diff --git a/roles/edpm_nvidia_mdev_prepare/molecule/default/converge.yml b/roles/edpm_nvidia_mdev_prepare/molecule/default/converge.yml
new file mode 100644
index 0000000000..158e793e59
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/molecule/default/converge.yml
@@ -0,0 +1,29 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+
+- name: Converge
+  hosts: all
+  tasks:
+    - name: Run phase1
+      ansible.builtin.import_role:
+        name: edpm_nvidia_mdev_prepare
+        tasks_from: phase1
+
+    - name: Run phase 2
+      ansible.builtin.import_role:
+        name: edpm_nvidia_mdev_prepare
+        tasks_from: phase2
diff --git a/roles/edpm_nvidia_mdev_prepare/molecule/default/molecule.yml b/roles/edpm_nvidia_mdev_prepare/molecule/default/molecule.yml
new file mode 100644
index 0000000000..fda947cafe
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/molecule/default/molecule.yml
@@ -0,0 +1,11 @@
+---
+# Mainly used to override the defaults set in .config/molecule/
+# By default, it uses the "config_podman.yml" - in CI, it will use
+# "config_local.yml".
+log: true
+
+provisioner:
+  name: ansible
+  log: true
+  env:
+    ANSIBLE_STDOUT_CALLBACK: yaml
diff --git a/roles/edpm_nvidia_mdev_prepare/molecule/default/prepare.yml b/roles/edpm_nvidia_mdev_prepare/molecule/default/prepare.yml
new file mode 100644
index 0000000000..d3594acc41
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/molecule/default/prepare.yml
@@ -0,0 +1,21 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+
+- name: Prepare
+  hosts: all
+  roles:
+    - role: test_deps
diff --git a/roles/edpm_nvidia_mdev_prepare/tasks/cleanup.yml b/roles/edpm_nvidia_mdev_prepare/tasks/cleanup.yml
new file mode 100644
index 0000000000..bc6f868c97
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/tasks/cleanup.yml
@@ -0,0 +1,19 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+- name: Cleaning the World
+  ansible.builtin.debug:
+    msg: "So here edpm_nvidia_mdev_prepare should clean things up!"
diff --git a/roles/edpm_nvidia_mdev_prepare/tasks/main.yml b/roles/edpm_nvidia_mdev_prepare/tasks/main.yml
new file mode 100644
index 0000000000..6abfe81c1f
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/tasks/main.yml
@@ -0,0 +1,16 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
diff --git a/roles/edpm_nvidia_mdev_prepare/tasks/phase1.yml b/roles/edpm_nvidia_mdev_prepare/tasks/phase1.yml
new file mode 100644
index 0000000000..a65be9040e
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/tasks/phase1.yml
@@ -0,0 +1,78 @@
+---
+# Copyright 2024 Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+# Fail fast on a missing driver URL, before the host gets modified.
+- name: Make sure that we defined the driver URL
+  ansible.builtin.assert:
+    that:
+      - cifmw_edpm_nvidia_mdev_prepare_driver_url is defined
+      - cifmw_edpm_nvidia_mdev_prepare_driver_url | length > 0
+    msg: "You need to set cifmw_edpm_nvidia_mdev_prepare_driver_url"
+
+- name: Blacklist nouveau
+  become: true
+  ansible.builtin.copy:
+    dest: "/etc/modprobe.d/blacklist-nouveau.conf"
+    mode: "0644"
+    content: |-
+      blacklist nouveau
+      options nouveau modeset=0
+    force: false
+  when:
+    - cifmw_edpm_nvidia_mdev_prepare_disable_nouveau | bool
+  register: _blacklist_nouveau
+
+- name: Gather the package facts
+  ansible.builtin.package_facts:
+    manager: auto
+
+- name: Install nvidia driver RPM either from path or URL
+  become: true
+  ansible.builtin.dnf:
+    name: "{{ cifmw_edpm_nvidia_mdev_prepare_driver_url }}"
+    state: present
+    disable_gpg_check: true
+  when: cifmw_edpm_nvidia_mdev_prepare_package_name not in ansible_facts.packages
+  register: _nvidia_driver_install
+
+- name: Regenerate initramfs
+  become: true
+  ansible.builtin.command: "{{ item }}"
+  loop:
+    - 'dracut --force'
+    - 'grub2-mkconfig -o /boot/efi/EFI/redhat/grub.cfg'
+  when: _blacklist_nouveau.changed or _nvidia_driver_install.changed
+
+# NOTE(review): this block has no condition, so it reboots on every run.
+- name: Enforce a reboot to ensure that we have the driver loaded
+  block:
+    - name: Create directory required by edpm-reboot role
+      become: true
+      ansible.builtin.file:
+        path: /var/lib/openstack/reboot_required/
+        state: directory
+        mode: "0755"
+    - name: Create required file to enforce a reboot
+      become: true
+      ansible.builtin.file:
+        path: /var/lib/openstack/reboot_required/nvidia_mdev_reboot
+        state: touch
+        mode: "0600"
+    - name: Reboot the compute node
+      # The edpm_reboot role is not installed here, so use a regular reboot.
+      # It needs root, like the other privileged tasks of this file.
+      become: true
+      ansible.builtin.reboot:
diff --git a/roles/edpm_nvidia_mdev_prepare/tasks/phase2.yml b/roles/edpm_nvidia_mdev_prepare/tasks/phase2.yml
new file mode 100644
index 0000000000..2e3183af09
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/tasks/phase2.yml
@@ -0,0 +1,31 @@
+---
+# Copyright 2024 Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+- name: Create a systemd unit file that will enable SRIOV VFs
+  become: true
+  ansible.builtin.copy:
+    dest: "/etc/systemd/system/nvidia-sriov-manage@.service"
+    mode: "0644"
+    src: "nvidia-sriov-manage@.service"
+    force: false
+
+- name: Enable the systemd unit file
+  become: true
+  ansible.builtin.systemd:  # "systemd_service" needs ansible-core >= 2.15, meta declares 2.14
+    name: "nvidia-sriov-manage@{{ item }}.service"
+    enabled: true
+    state: started
+  loop: "{{ cifmw_edpm_nvidia_mdev_prepare_sriov_devices }}"
diff --git a/roles/edpm_nvidia_mdev_prepare/vars/main.yml b/roles/edpm_nvidia_mdev_prepare/vars/main.yml
new file mode 100644
index 0000000000..d0e76ff0c9
--- /dev/null
+++ b/roles/edpm_nvidia_mdev_prepare/vars/main.yml
@@ -0,0 +1,22 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+
+# While options found within the vars/ path can be overridden using extra
+# vars, items within this path are considered part of the role and not
+# intended to be modified.
+
+# All variables within this role should have a prefix of "cifmw_edpm_nvidia_mdev_prepare"
diff --git a/zuul.d/molecule.yaml b/zuul.d/molecule.yaml
index 4f59f08732..0835008b87 100644
--- a/zuul.d/molecule.yaml
+++ b/zuul.d/molecule.yaml
@@ -367,6 +367,17 @@
     parent: cifmw-molecule-base
     vars:
       TEST_RUN: edpm_kustomize
+- job:
+    files:
+      - ^common-requirements.txt
+      - ^test-requirements.txt
+      - ^roles/edpm_nvidia_mdev_prepare/(defaults|files|handlers|library|lookup_plugins|module_utils|molecule|tasks|templates|vars).*
+      - ^ci/playbooks/molecule.*
+      - ^.config/molecule/.*
+    name: cifmw-molecule-edpm_nvidia_mdev_prepare
+    parent: cifmw-molecule-base
+    vars:
+      TEST_RUN: edpm_nvidia_mdev_prepare
 - job:
     files:
       - ^common-requirements.txt
diff --git a/zuul.d/projects.yaml b/zuul.d/projects.yaml
index 12f4f202f9..c6e5a54e39 100644
--- a/zuul.d/projects.yaml
+++ b/zuul.d/projects.yaml
@@ -48,6 +48,7 @@
         - cifmw-molecule-edpm_deploy
         - cifmw-molecule-edpm_deploy_baremetal
         - cifmw-molecule-edpm_kustomize
+        - cifmw-molecule-edpm_nvidia_mdev_prepare
         - cifmw-molecule-edpm_prepare
         - cifmw-molecule-env_op_images
         - cifmw-molecule-hci_prepare