Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add H4d vm integration tests #3621

Merged
merged 1 commit into from
Feb 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# Discover the network interface behind the first RDMA device and verify
# that it is bound to the IDPF driver. The interface name is printed on
# stdout (captured in get_rdma_interface_output); every failure path exits
# non-zero with an explanation on stderr.
- name: Get RDMA interface
  ansible.builtin.shell: |
    # Expand the sysfs glob into an array rather than parsing `ls` output:
    # with more than one RDMA device `ls` prints "<dir>:" headers and one
    # entry per line, which would leak into the interface name.
    shopt -s nullglob
    net_paths=(/sys/class/infiniband/*/device/net/*)
    if [[ -z "${net_paths[0]:-}" ]]; then
      echo "No RDMA interfaces found." >&2
      exit 1
    fi
    # Use the first interface only (glob results are sorted).
    rdma_iface=$(basename "${net_paths[0]}")
    if ! ethtool -i "${rdma_iface}" | grep -q "driver.*idpf"; then
      echo "RDMA interface ${rdma_iface} does not load the IDPF driver." >&2
      exit 1
    fi
    echo "${rdma_iface}"
  args:
    executable: /bin/bash
  register: get_rdma_interface_output
  # Read-only probe: never report "changed".
  changed_when: false

# Publish the interface name captured by the "Get RDMA interface" task as
# the rdma_iface fact so that later tasks can reference it directly.
- name: Set rdma_iface fact
  ansible.builtin.set_fact:
    rdma_iface: '{{ get_rdma_interface_output.stdout }}'

# Log the selected interface to make test output easier to diagnose.
- name: Debug RDMA interface
  ansible.builtin.debug:
    msg: 'RDMA Interface: {{ rdma_iface }}'

# Verify that the RDMA interface exists and is administratively up.
# `ip link show dev <name> up` matches the exact interface name (a plain
# `grep` over ifconfig output would also match e.g. eth10 when looking for
# eth1) and does not require the legacy net-tools package. `grep -q .`
# turns "no output" (missing or down interface) into a non-zero rc.
- name: Check RDMA interface existence
  ansible.builtin.shell:
    cmd: ip -o link show dev "{{ rdma_iface }}" up | grep -q .
    executable: /bin/bash
  register: rdma_interface_check
  # The verdict is delivered by the next task, so never fail here.
  failed_when: false
  changed_when: false

- name: Fail if RDMA interface not found
  ansible.builtin.fail:
    msg: "No RDMA interface found."
  when: rdma_interface_check.rc != 0

# Search the kernel log for a failed hardware initialization message.
# grep exits 0 on a match, 1 when nothing matches and >1 on a real error.
# Only the last case fails this task; a match is reported by the next task.
- name: Check for CQP failure
  ansible.builtin.shell: dmesg | grep 'hardware initialization FAILED'
  args:
    executable: /bin/bash
  register: cqp_failure_check
  # Unlike a blanket ignore_errors, this keeps the healthy "no match" path
  # green while still surfacing genuine grep failures.
  failed_when: cqp_failure_check.rc > 1
  changed_when: false

# rc == 0 means grep found the failure message in dmesg.
- name: Fail if CQP failure found
  ansible.builtin.fail:
    msg: "CQP hardware initialization failed."
  when: cqp_failure_check.rc == 0

# Search the kernel log for messages mentioning 'create ib_mad QP1'.
# grep exits 0 on a match, 1 when nothing matches and >1 on a real error.
# Only the last case fails this task; a match is reported by the next task.
- name: Check for MAD QP failure
  ansible.builtin.shell: dmesg | grep 'create ib_mad QP1'
  args:
    executable: /bin/bash
  register: mad_qp_failure_check
  # Unlike a blanket ignore_errors, this keeps the healthy "no match" path
  # green while still surfacing genuine grep failures.
  failed_when: mad_qp_failure_check.rc > 1
  changed_when: false

# rc == 0 means grep found the message in dmesg.
- name: Fail if MAD QP failure found
  ansible.builtin.fail:
    msg: "MAD QP registration failed."
  when: mad_qp_failure_check.rc == 0

# Search the kernel log for 'qp async event' messages.
# grep exits 0 on a match, 1 when nothing matches and >1 on a real error.
# Only the last case fails this task; a match is reported by the next task.
- name: Check for QP async events
  ansible.builtin.shell: dmesg | grep 'qp async event'
  args:
    executable: /bin/bash
  register: qp_async_events_check
  # Unlike a blanket ignore_errors, this keeps the healthy "no match" path
  # green while still surfacing genuine grep failures.
  failed_when: qp_async_events_check.rc > 1
  changed_when: false

# rc == 0 means grep found at least one such event in dmesg.
- name: Fail if QP async events found
  ansible.builtin.fail:
    msg: "Async event error detected."
  when: qp_async_events_check.rc == 0

# Search the kernel log for 'MAD agent registration failed' messages.
# grep exits 0 on a match, 1 when nothing matches and >1 on a real error.
# Only the last case fails this task; a match is reported by the next task.
- name: Check for MAD agent error
  ansible.builtin.shell: dmesg | grep 'MAD agent registration failed'
  args:
    executable: /bin/bash
  register: mad_agent_error_check
  # Unlike a blanket ignore_errors, this keeps the healthy "no match" path
  # green while still surfacing genuine grep failures.
  failed_when: mad_agent_error_check.rc > 1
  changed_when: false

# rc == 0 means grep found the error message in dmesg.
- name: Fail if MAD agent error found
  ansible.builtin.fail:
    msg: "MAD agent error detected."
  when: mad_agent_error_check.rc == 0

# Loopback RDMA connectivity test: start an rping server on the RDMA
# interface's IPv4 address, run a single client exchange against it and
# fail if the client exits non-zero. The `always` section guarantees that
# no rping process is left behind, whatever the outcome.
- name: Run rping loopback test
  block:
    # Start from a clean slate; `|| true` keeps the task green when no
    # rping process is running.
    - name: Kill any existing rping processes
      ansible.builtin.shell: killall rping 2>/dev/null || true
      args:
        executable: /bin/bash
      changed_when: false

    # `-m 1` stops at the first matching line so that an interface with
    # several IPv4 addresses still yields exactly one address. The task
    # fails (grep rc 1) when the interface has no IPv4 address at all.
    - name: Get primary IP
      ansible.builtin.shell: ip addr show "{{ rdma_iface }}" | grep -m 1 -Po "inet \K[\d.]+"
      args:
        executable: /bin/bash
      register: get_primary_ip_output
      changed_when: false

    - name: Set primary_ip fact
      ansible.builtin.set_fact:
        primary_ip: "{{ get_primary_ip_output.stdout }}"

    - name: Debug primary IP
      ansible.builtin.debug:
        msg: "Listening on {{ primary_ip }}"

    # The server runs in the background; the sleep gives it time to start
    # listening before the client connects. The task's rc is the client's
    # exit status because the client is the last command in the script.
    - name: Run rping server/client
      ansible.builtin.shell: |
        rping -d -s -P -a "{{ primary_ip }}" > /dev/null &
        sleep 10
        rping -d -c -C 1 -a "{{ primary_ip }}"
      args:
        executable: /bin/bash
      register: rping_client_output
      # The verdict is delivered by the next task, which reports the rc.
      failed_when: false
      changed_when: false

    - name: Fail if rping loopback failed
      ansible.builtin.fail:
        msg: "rping loopback failed with return value of {{ rping_client_output.rc }}."
      when: rping_client_output.rc != 0

  always:
    # Runs on success and on failure so the background server is stopped.
    - name: Kill rping processes after test
      ansible.builtin.shell: killall rping 2>/dev/null || true
      args:
        executable: /bin/bash
      changed_when: false

# Final confirmation, reached only if none of the preceding checks failed.
# The original message ended in a dangling "on"; name the host the checks
# ran against so the sentence is complete and the log line is attributable.
- name: Print success message
  ansible.builtin.debug:
    msg: "Basic local checks passed on {{ inventory_hostname }}"
48 changes: 48 additions & 0 deletions tools/cloud-build/daily-tests/builds/h4d-vm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Tags attached to this Cloud Build configuration.
tags:
  - vm
  - m.filestore
  - m.startup-script
  - m.vpc
  - m.vm-instance
  - m.wait-for-startup

timeout: 14400s  # 4hr

steps:
  # Single step: build the toolkit, patch the example blueprint and run the
  # base integration-test playbook with the h4d-vm test variables.
  - id: h4d-vm
    name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
    entrypoint: /bin/bash
    env:
      - ANSIBLE_HOST_KEY_CHECKING=false
      - ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg
    # In the script below "$$" is Cloud Build's escape for a literal "$", so
    # those references are expanded by bash rather than by Cloud Build.
    # The two sed commands edit the blueprint in place: the first rewrites
    # "enabled: true" to "enabled: false" on the line following each
    # "deletion_protection:" line, the second deletes every line that
    # contains "reason:".
    args:
      - "-c"
      - |
        set -x -e
        cd /workspace && make
        BUILD_ID_FULL=$BUILD_ID
        BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}
        REGION=europe-west4
        ZONE=europe-west4-b
        BLUEPRINT="/workspace/examples/h4d-vm.yaml"
        sed -i -e '/deletion_protection:/{n;s/enabled: true/enabled: false/}' $${BLUEPRINT}
        sed -i -e '/reason:/d' $${BLUEPRINT}
        ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \
          --user=sa_106486320838376751393 \
          --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
          --extra-vars="region=$${REGION} zone=$${ZONE}" \
          --extra-vars="@tools/cloud-build/daily-tests/tests/h4d-vm.yml"
34 changes: 34 additions & 0 deletions tools/cloud-build/daily-tests/tests/h4d-vm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# region, zone must be defined in build file with --extra-vars flag!
# They are deliberately NOT set in this file: it is passed to
# ansible-playbook as the last --extra-vars argument, so values defined
# here would take precedence over the ones supplied by the build file and
# make edits to the build file silently ineffective.
test_name: h4d-jbvms
deployment_name: "h4d-jbvms-{{ build }}"
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/h4d-vm.yaml"
network: "{{ deployment_name }}-net"
remote_node: "{{ deployment_name }}-0"
# Playbooks run against the deployed VM after deployment.
post_deploy_tests:
  - test-validation/test-irdma.yml
  - test-validation/test-mounts.yml
# Mount points expected on the VM.
# NOTE(review): presumably consumed by test-mounts.yml - confirm.
custom_vars:
  mounts:
    - /home
# Deployment variables forwarded from the region/zone extra-vars.
cli_deployment_vars:
  region: "{{ region }}"
  zone: "{{ zone }}"