Skip to content

Commit

Permalink
Add H4d vm integration tests and irdma ansible test
Browse files Browse the repository at this point in the history
  • Loading branch information
abbas1902 committed Feb 4, 2025
1 parent ba02c6b commit 57cf2fa
Show file tree
Hide file tree
Showing 3 changed files with 238 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# Find one network interface that sits behind an RDMA device in sysfs and
# confirm `ethtool -i` reports the idpf driver for it. The interface name is
# the task's only stdout and is registered for later tasks.
- name: Get RDMA interface
  ansible.builtin.shell: |
    # List the netdev entries themselves (-d) and keep only the first one, so
    # several interfaces or several RDMA devices cannot produce a multi-line
    # result.
    first_path=$(ls -d /sys/class/infiniband/*/device/net/* 2>/dev/null | head -n 1)
    if [[ -z "${first_path}" ]]; then
      echo "No RDMA interfaces found." >&2
      exit 1
    fi
    rdma_iface=$(basename "${first_path}")
    if ! ethtool -i "${rdma_iface}" | grep -q "driver.*idpf"; then
      echo "RDMA interface ${rdma_iface} does not load the IDPF driver." >&2
      exit 1
    fi
    echo "${rdma_iface}"
  args:
    executable: /bin/bash
  register: get_rdma_interface_output
  changed_when: false

# Expose the interface name printed by the discovery task as `rdma_iface`.
- name: Set rdma_iface fact
  ansible.builtin.set_fact:
    rdma_iface: '{{ get_rdma_interface_output.stdout }}'

# Log the selected interface so it is visible in the play output.
- name: Debug RDMA interface
  ansible.builtin.debug:
    msg: 'RDMA Interface: {{ rdma_iface }}'

# Probe the `ifconfig` output for the RDMA interface name. The probe itself
# never fails the play; its return code is evaluated by the next task.
- name: Check RDMA interface existence
  ansible.builtin.shell: ifconfig | grep "{{ rdma_iface }}"
  args:
    executable: /bin/bash
  register: rdma_interface_check
  # A non-zero rc is an expected outcome here, so do not report the probe as
  # a failed (ignored) task; the decision is made below.
  failed_when: false
  changed_when: false

# Abort when the probe above did not find the interface name.
- name: Fail if RDMA interface not found
  ansible.builtin.fail:
    msg: "No RDMA interface found."
  when: rdma_interface_check.rc != 0

# Search the kernel log for the 'hardware initialization FAILED' message.
# rc 0 means the message is present, rc 1 means it is absent.
- name: Check for CQP failure
  ansible.builtin.shell: |
    # Capture the log first: if dmesg cannot be read, exit with rc 2 instead
    # of letting an empty input look like "message not found".
    kernel_log=$(dmesg) || exit 2
    grep 'hardware initialization FAILED' <<< "${kernel_log}"
  args:
    executable: /bin/bash
  register: cqp_failure_check
  # Only rc 0 (match) and rc 1 (no match) are valid outcomes of the check.
  failed_when: cqp_failure_check.rc not in [0, 1]
  changed_when: false

# The message was found in the kernel log: fail the play.
- name: Fail if CQP failure found
  ansible.builtin.fail:
    msg: "CQP hardware initialization failed."
  when: cqp_failure_check.rc == 0

# Search the kernel log for the 'create ib_mad QP1' message.
# rc 0 means the message is present, rc 1 means it is absent.
- name: Check for MAD QP failure
  ansible.builtin.shell: |
    # Capture the log first: if dmesg cannot be read, exit with rc 2 instead
    # of letting an empty input look like "message not found".
    kernel_log=$(dmesg) || exit 2
    grep 'create ib_mad QP1' <<< "${kernel_log}"
  args:
    executable: /bin/bash
  register: mad_qp_failure_check
  # Only rc 0 (match) and rc 1 (no match) are valid outcomes of the check.
  failed_when: mad_qp_failure_check.rc not in [0, 1]
  changed_when: false

# The message was found in the kernel log: fail the play.
- name: Fail if MAD QP failure found
  ansible.builtin.fail:
    msg: "MAD QP registration failed."
  when: mad_qp_failure_check.rc == 0

# Search the kernel log for the 'qp async event' message.
# rc 0 means the message is present, rc 1 means it is absent.
- name: Check for QP async events
  ansible.builtin.shell: |
    # Capture the log first: if dmesg cannot be read, exit with rc 2 instead
    # of letting an empty input look like "message not found".
    kernel_log=$(dmesg) || exit 2
    grep 'qp async event' <<< "${kernel_log}"
  args:
    executable: /bin/bash
  register: qp_async_events_check
  # Only rc 0 (match) and rc 1 (no match) are valid outcomes of the check.
  failed_when: qp_async_events_check.rc not in [0, 1]
  changed_when: false

# The message was found in the kernel log: fail the play.
- name: Fail if QP async events found
  ansible.builtin.fail:
    msg: "Async event error detected."
  when: qp_async_events_check.rc == 0

# Search the kernel log for the 'MAD agent registration failed' message.
# rc 0 means the message is present, rc 1 means it is absent.
- name: Check for MAD agent error
  ansible.builtin.shell: |
    # Capture the log first: if dmesg cannot be read, exit with rc 2 instead
    # of letting an empty input look like "message not found".
    kernel_log=$(dmesg) || exit 2
    grep 'MAD agent registration failed' <<< "${kernel_log}"
  args:
    executable: /bin/bash
  register: mad_agent_error_check
  # Only rc 0 (match) and rc 1 (no match) are valid outcomes of the check.
  failed_when: mad_agent_error_check.rc not in [0, 1]
  changed_when: false

# The message was found in the kernel log: fail the play.
- name: Fail if MAD agent error found
  ansible.builtin.fail:
    msg: "MAD agent error detected."
  when: mad_agent_error_check.rc == 0

# Loopback rping test on the RDMA interface's own IPv4 address. The `always`
# section kills any rping process left behind, whether the test passed or not.
- name: Run rping loopback test
  block:
    # Start from a clean slate; a missing process is not an error.
    - name: Kill any existing rping processes
      ansible.builtin.shell: killall rping 2>/dev/null || true
      args:
        executable: /bin/bash
      changed_when: false

    # -m 1 stops at the first "inet" line so an interface with several IPv4
    # addresses still yields a single address. grep exits non-zero (failing
    # the task) when the interface has no IPv4 address at all.
    - name: Get primary IP
      ansible.builtin.shell: ip addr show "{{ rdma_iface }}" | grep -m 1 -Po "inet \K[\d.]+"
      args:
        executable: /bin/bash
      register: get_primary_ip_output
      changed_when: false

    - name: Set primary_ip fact
      ansible.builtin.set_fact:
        primary_ip: "{{ get_primary_ip_output.stdout }}"

    - name: Debug primary IP
      ansible.builtin.debug:
        msg: "Listening on {{ primary_ip }}"

    # Launch one rping in the background (-s), wait 10 seconds, then run a
    # second rping (-c) against the same address. The task's rc is the rc of
    # that last command.
    - name: Run rping server/client
      ansible.builtin.shell: |
        rping -d -s -P -a "{{ primary_ip }}" > /dev/null &
        sleep 10
        rping -d -c -C 1 -a "{{ primary_ip }}"
      args:
        executable: /bin/bash
      register: rping_client_output
      # The rc is judged by the next task, which reports it in its message;
      # do not mark this task itself as failed.
      failed_when: false
      changed_when: false

    - name: Fail if rping loopback failed
      ansible.builtin.fail:
        msg: "rping loopback failed with return value of {{ rping_client_output.rc }}."
      when: rping_client_output.rc != 0

  always:
    # The background rping started above is still running; remove it.
    - name: Kill rping processes after test
      ansible.builtin.shell: killall rping 2>/dev/null || true
      args:
        executable: /bin/bash
      changed_when: false

# Final marker: reached only when every earlier check in this file passed.
# The message names the host so it no longer ends in a dangling "on".
- name: Print success message
  ansible.builtin.debug:
    msg: "Basic local checks passed on {{ inventory_hostname }}"
48 changes: 48 additions & 0 deletions tools/cloud-build/daily-tests/builds/h4d-vm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Tags attached to this build.
tags:
  - vm
  - m.filestore
  - m.startup-script
  - m.vpc
  - m.vm-instance
  - m.wait-for-startup

# Overall build timeout: 4hr.
timeout: 14400s
# Single build step: build the toolkit, patch the example blueprint, then run
# the base integration-test playbook with the h4d-vm test variables.
steps:
  - id: h4d-vm
    name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
    entrypoint: /bin/bash
    env:
      - 'ANSIBLE_HOST_KEY_CHECKING=false'
      - 'ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg'
    args:
      - -c
      - |
        set -x -e
        cd /workspace && make
        BUILD_ID_FULL=$BUILD_ID
        BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}
        REGION=europe-west4
        ZONE=europe-west4-b
        BLUEPRINT="/workspace/examples/h4d-vm.yaml"
        # Switch deletion protection off and drop its reason line in place.
        sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' "$${BLUEPRINT}"
        sed -i -e '/reason:/d' "$${BLUEPRINT}"
        ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \
          --user=sa_106486320838376751393 \
          --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
          --extra-vars="region=$${REGION} zone=$${ZONE}" \
          --extra-vars="@tools/cloud-build/daily-tests/tests/h4d-vm.yml"
34 changes: 34 additions & 0 deletions tools/cloud-build/daily-tests/tests/h4d-vm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# NOTE: region and zone are also passed by the build file via --extra-vars;
# keep the values below in sync with it.
# Identification of this test and of the deployment it creates.
test_name: h4d-jbvms
deployment_name: "h4d-jbvms-{{ build }}"

# Blueprint to deploy, resolved relative to the workspace root.
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/h4d-vm.yaml"

# Deployment location.
region: europe-west4
zone: europe-west4-b

# Names derived from the deployment name.
network: "{{ deployment_name }}-net"
remote_node: "{{ deployment_name }}-0"

# Validation playbooks listed for this test, plus the variables passed along.
post_deploy_tests:
  - test-validation/test-irdma.yml
  - test-validation/test-mounts.yml
custom_vars:
  mounts:
    - /home

# Region and zone forwarded as deployment variables.
cli_deployment_vars:
  region: "{{ region }}"
  zone: "{{ zone }}"

0 comments on commit 57cf2fa

Please sign in to comment.