Skip to content

Commit

Permalink
Initial POC for installing nvidia driver on EDPM nodes
Browse files Browse the repository at this point in the history
TODO:
 - molecule tests
 - docs
 - more checks
  • Loading branch information
sbauza committed Oct 9, 2024
1 parent eb3e0d9 commit 509c821
Show file tree
Hide file tree
Showing 19 changed files with 534 additions and 1 deletion.
2 changes: 1 addition & 1 deletion galaxy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace: cifmw
name: general

# The version of the collection. Must be compatible with semantic versioning
version: 1.0.0
version: 1.0.0+15f75128

# The path to the Markdown (.md) readme file. This path is relative to the root of the collection
readme: README.md
Expand Down
73 changes: 73 additions & 0 deletions playbooks/nvidia-mdev-phase1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
---
# Copyright 2024 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Play 1: discover the EDPM baremetal hosts from the OpenShift cluster and add
# them to the in-memory inventory so that the following play can target them.
- name: Gather the list of EDPM computes
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: false
  tasks:
    # Read-only query: `oc get` does not modify the cluster, so the task is
    # explicitly marked as never "changed".
    - name: Fetch OSP BMO nodesets
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
        PATH: "{{ cifmw_path }}"
      ansible.builtin.command:
        cmd: >-
          oc get OpenStackBaremetalSet -n "{{ namespace|default('openstack') }}" -o yaml
      register: _osp_bmo_nodsets_oc_out
      changed_when: false

    # Build one {name, ip, user, group} entry per baremetal host found in the
    # nodesets. The group is the first dash-separated token of the host name
    # with an "s" appended (e.g. "edpm-compute-0" -> "edpms").
    - name: Craft the BM hosts list
      ansible.builtin.set_fact:
        _bmo_provisioned_hosts: >-
          {% set hosts = [] -%}
          {% set nodesets = (_osp_bmo_nodsets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
          {% for spec in nodesets | map(attribute='spec') -%}
          {% for host_key, host_val in spec.baremetalHosts.items() -%}
          {% set _ = hosts.append(
            {
              'name': host_key,
              'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
              'user': spec.cloudUserName,
              'group': host_key | split('-') | first + 's'
            }) -%}
          {% endfor -%}
          {% endfor -%}
          {{ hosts }}

    # NOTE(review): ansible_user_dir is a gathered fact while this play sets
    # gather_facts: false - presumably the fact is cached or provided by the
    # caller; confirm.
    - name: Add OSP BMO nodesets to Ansible
      ansible.builtin.add_host:
        name: "{{ item.name }}"
        groups: "{{ item.group }}"
        ansible_ssh_user: "{{ item.user }}"
        ansible_host: "{{ item.ip }}"
        ansible_ssh_private_key_file: "{{ ansible_user_dir }}/.ssh/id_cifw"
        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
      loop: "{{ _bmo_provisioned_hosts }}"

    # Check each freshly added host every 2 seconds, for at most 600 seconds,
    # until it accepts connections.
    - name: Wait for the instance to boot
      delegate_to: "{{ item.name }}"
      ansible.builtin.wait_for_connection:
        sleep: 2
        timeout: 600
      loop: "{{ _bmo_provisioned_hosts }}"


# Play 2: apply the phase1 task file of the role to the "edpms" group.
- name: Run the Nvidia phase 1 role
  hosts: edpms
  tasks:
    # As a reminder, at the end of phase1, the compute will reboot
    - name: Run phase1
      ansible.builtin.import_role:
        tasks_from: phase1
        name: edpm_nvidia_mdev_prepare
72 changes: 72 additions & 0 deletions playbooks/nvidia-mdev-phase2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
---
# Copyright 2024 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Play 1: discover the EDPM baremetal hosts from the OpenShift cluster and add
# them to the in-memory inventory so that the following play can target them.
- name: Gather the list of EDPM computes
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: false
  tasks:
    # Read-only query: `oc get` does not modify the cluster, so the task is
    # explicitly marked as never "changed".
    - name: Fetch OSP BMO nodesets
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
        PATH: "{{ cifmw_path }}"
      ansible.builtin.command:
        cmd: >-
          oc get OpenStackBaremetalSet -n "{{ namespace|default('openstack') }}" -o yaml
      register: _osp_bmo_nodsets_oc_out
      changed_when: false

    # Build one {name, ip, user, group} entry per baremetal host found in the
    # nodesets. The group is the first dash-separated token of the host name
    # with an "s" appended (e.g. "edpm-compute-0" -> "edpms").
    - name: Craft the BM hosts list
      ansible.builtin.set_fact:
        _bmo_provisioned_hosts: >-
          {% set hosts = [] -%}
          {% set nodesets = (_osp_bmo_nodsets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
          {% for spec in nodesets | map(attribute='spec') -%}
          {% for host_key, host_val in spec.baremetalHosts.items() -%}
          {% set _ = hosts.append(
            {
              'name': host_key,
              'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
              'user': spec.cloudUserName,
              'group': host_key | split('-') | first + 's'
            }) -%}
          {% endfor -%}
          {% endfor -%}
          {{ hosts }}

    # NOTE(review): ansible_user_dir is a gathered fact while this play sets
    # gather_facts: false - presumably the fact is cached or provided by the
    # caller; confirm.
    - name: Add OSP BMO nodesets to Ansible
      ansible.builtin.add_host:
        name: "{{ item.name }}"
        groups: "{{ item.group }}"
        ansible_ssh_user: "{{ item.user }}"
        ansible_host: "{{ item.ip }}"
        ansible_ssh_private_key_file: "{{ ansible_user_dir }}/.ssh/id_cifw"
        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
      loop: "{{ _bmo_provisioned_hosts }}"

    # Check each freshly added host every 2 seconds, for at most 600 seconds,
    # until it accepts connections.
    - name: Wait for the instance to boot
      delegate_to: "{{ item.name }}"
      ansible.builtin.wait_for_connection:
        sleep: 2
        timeout: 600
      loop: "{{ _bmo_provisioned_hosts }}"


# Play 2: apply the phase2 task file of the role to the "edpms" group.
- name: Run the Nvidia phase 2 role
  hosts: edpms
  tasks:
    - name: Run phase 2
      ansible.builtin.import_role:
        tasks_from: phase2
        name: edpm_nvidia_mdev_prepare
10 changes: 10 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# edpm_nvidia_mdev_prepare
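Prepares EDPM compute nodes for NVIDIA mediated devices (mdev). The role is applied in two phases, exposed as the `phase1` and `phase2` task files; the compute node reboots at the end of phase 1.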

## Privilege escalation
If applicable, please explain the privilege escalation done in this role.

## Parameters
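* `cifmw_edpm_nvidia_mdev_prepare_disable_nouveau`: (Boolean) Whether the nouveau driver needs to be disabled. Defaults to `true`.
* `cifmw_edpm_nvidia_mdev_prepare_driver_url`: (String) URL or path of the NVIDIA driver RPM. Defaults to `''`.
* `cifmw_edpm_nvidia_mdev_prepare_package_name`: (String) Name of the NVIDIA package. Defaults to `NVIDIA-vGPU-rhel`.
* `cifmw_edpm_nvidia_mdev_prepare_sriov_devices`: (List) SR-IOV GPU devices that should create VFs. Defaults to `['ALL']`.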

## Examples
28 changes: 28 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Whether the nouveau driver needs to be disabled on the OS.
cifmw_edpm_nvidia_mdev_prepare_disable_nouveau: true

# URL or path of the NVIDIA driver RPM (empty by default).
cifmw_edpm_nvidia_mdev_prepare_driver_url: ""

# Name of the NVIDIA package.
cifmw_edpm_nvidia_mdev_prepare_package_name: 'NVIDIA-vGPU-rhel'

# SR-IOV GPU devices that should create virtual functions (VFs).
cifmw_edpm_nvidia_mdev_prepare_sriov_devices:
  - ALL
Empty file.
26 changes: 26 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/files/[email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Template unit: the instance name (%i) is passed to sriov-manage as the
# argument of its -e option.
[Unit]
Description = Enable Nvidia GPU virtual functions
# Order this unit after the NVIDIA vGPU services
After = nvidia-vgpu-mgr.service
After = nvidia-vgpud.service

[Service]
Type = oneshot
User = root
Group = root
# Wait 30 seconds before running sriov-manage
# NOTE(review): presumably leaves time for the NVIDIA services to finish
# initialising - confirm
ExecStartPre = /usr/bin/sleep 30
ExecStart = /usr/lib/nvidia/sriov-manage -e %i
# The command exits once done; keep the unit reported as active afterwards
RemainAfterExit = True
# Start/stop timeout, in seconds
TimeoutSec = 120
# Run inside system.slice. The accounting options below make the resource
# usage visible through the `systemd-cgtop` command.
Slice = system.slice
# Set Accounting (IOAccounting replaces the deprecated BlockIOAccounting)
CPUAccounting = True
IOAccounting = True
MemoryAccounting = True
TasksAccounting = True

[Install]
WantedBy = multi-user.target
15 changes: 15 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
30 changes: 30 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Ansible Galaxy metadata describing the role.
galaxy_info:
  namespace: cifmw
  author: CI Framework
  company: Red Hat
  description: CI Framework Role -- edpm_nvidia_mdev_prepare
  license: Apache-2.0
  min_ansible_version: "2.14"
  galaxy_tags:
    - cifmw

# Role dependencies, one per line. Replace the empty list ('[]') below
# when adding dependencies.
dependencies: []
60 changes: 60 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/molecule/default/converge.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Molecule converge play: runs both phases of the role and verifies the files
# each phase is expected to leave behind.
- name: Converge
  hosts: all
  vars:
    # NOTE(review): presumably "tmux" is a stand-in for the real driver RPM,
    # so that the install step can be exercised without NVIDIA bits - confirm.
    cifmw_edpm_nvidia_mdev_prepare_driver_url: 'tmux'
  tasks:
    - name: Run phase1
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase1

    - name: Check expected files in phase 1
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        - "/etc/modprobe.d/blacklist-nouveau.conf"
      register: phase1_files

    - name: Check if expected phase 1 files were created
      ansible.builtin.assert:
        that: item.stat.exists
      loop: "{{ phase1_files.results }}"

    # `tmux ls` exits non-zero when no tmux server is running, even if tmux
    # is installed; `tmux -V` only needs the binary to be present.
    - name: Check that tmux was installed
      ansible.builtin.command: tmux -V
      register: _tmux_version
      changed_when: false
      failed_when: _tmux_version.rc != 0

    - name: Run phase 2
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase2

    # NOTE(review): the unit file name was garbled in the source this was
    # recovered from; "nvidia-sriov-manage@.service" is assumed - confirm
    # against the file shipped in the role's files/ directory.
    - name: Check expected files in phase 2
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        - "/etc/systemd/system/nvidia-sriov-manage@.service"
      register: phase2_files

    - name: Check if expected phase 2 files were created
      ansible.builtin.assert:
        that: item.stat.exists
      loop: "{{ phase2_files.results }}"
11 changes: 11 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/molecule/default/molecule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Scenario-level overrides for the defaults set in .config/molecule/.
# "config_podman.yml" is used by default; in CI, "config_local.yml" is used
# instead.
provisioner:
  name: ansible
  log: true
  env:
    ANSIBLE_STDOUT_CALLBACK: yaml

log: true
21 changes: 21 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/molecule/default/prepare.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Molecule prepare play: applies the test_deps role to every test host.
- name: Prepare
  hosts: all
  roles:
    - test_deps
19 changes: 19 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/tasks/cleanup.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Placeholder: only prints a message, no cleanup is performed yet.
- name: Cleaning the World
  ansible.builtin.debug:
    msg: 'So here edpm_nvidia_mdev_prepare should clean things up!'
Loading

0 comments on commit 509c821

Please sign in to comment.