Skip to content

Remove updating docs from PR #85

Remove updating docs from PR

Remove updating docs from PR #85

Workflow file for this run

# Display name of this workflow.
name: Run Unit Tests

# Run on pushes to `main` and on pull requests that target `main`.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
jobs:
  # Builds one wheel per CPython version (3.10, 3.11, 3.12) inside a CUDA 11.8
  # devel image and uploads all of them as the `resiliency-wheels` artifact.
  build_wheels:
    runs-on: ubuntu-24.04
    container:
      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
    steps:
      - name: Update GCC
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt-get update
          apt-get install -y build-essential gcc-10 g++-10

      # The interpreters come from the deadsnakes PPA; pip is bootstrapped
      # separately for each interpreter with get-pip.py.
      - name: Install Python versions and pips
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt-get update
          apt-get install -y software-properties-common curl
          # -y: never wait for interactive confirmation in CI.
          add-apt-repository -y ppa:deadsnakes/ppa
          apt-get install -y python3.10 python3.10-dev
          apt-get install -y python3.11 python3.11-dev
          apt-get install -y python3.12 python3.12-dev
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

      - name: Checkout code
        uses: actions/checkout@v4

      # Each build step installs the build tooling for its own interpreter and
      # then runs `poetry build -f wheel`; the upload step below collects the
      # results from dist/.
      - name: Build wheel with Python 3.10
        run: |
          python3.10 -m pip install -U setuptools poetry build six
          python3.10 -m poetry build -f wheel

      - name: Build wheel with Python 3.11
        run: |
          python3.11 -m pip install -U setuptools poetry build six
          python3.11 -m poetry build -f wheel

      - name: Build wheel with Python 3.12
        run: |
          python3.12 -m pip install -U setuptools poetry build six
          python3.12 -m poetry build -f wheel

      - name: Upload the wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: resiliency-wheels
          path: 'dist/*.whl'

  # Installs the wheel matching the container's Python and runs a subset of
  # the unit tests for every (container image, test_type) combination.
  unit_tests_cpu_subset:
    runs-on: ubuntu-24.04
    needs: build_wheels
    strategy:
      matrix:
        container:
          - 'pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime'
          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
          - 'pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime'
        test_type:
          - 'fault_tolerance'
          - 'straggler'
          - 'ptl_resiliency'
    container:
      image: ${{ matrix.container }}
    env:
      # Fix for "MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1
      # library."
      MKL_SERVICE_FORCE_INTEL: '1'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download wheels
        uses: actions/download-artifact@v4
        with:
          name: resiliency-wheels
          path: ./dist/

      # The wheel is selected by the interpreter's `py_version_nodot` tag
      # (as reported by sysconfig), so each image installs the wheel built
      # for its own Python version.
      - name: Set up environment
        run: |
          pip install pytest lightning
          PY_VER_NODOT=$(python -c "import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl

      # Dispatches on matrix.test_type; an unrecognised value fails the job.
      # NOTE(review): the straggler -k pattern previously listed
      # `test_relative_gpu_scores_` twice; confirm no other test name was
      # intended for the second entry.
      - name: Run unit tests
        shell: bash
        run: |
          if [[ "${{ matrix.test_type }}" == "straggler" ]]; then
            STRAGGLER_DET_CPU_TESTS_PATTERN="test_all_gather_object_calls_num \
              or test_fail_if_not_initialized \
              or test_individual_gpu_scores_one_rank \
              or test_relative_gpu_scores_ \
              or test_name_mapper_"
            pytest -s -vvv tests/straggler/unit/ -k "${STRAGGLER_DET_CPU_TESTS_PATTERN}"
          elif [[ "${{ matrix.test_type }}" == "fault_tolerance" ]]; then
            pytest -s -vvv ./tests/fault_tolerance/unit/
          elif [[ "${{ matrix.test_type }}" == "ptl_resiliency" ]]; then
            pytest -s -vvv ./tests/ptl_resiliency/unit/test_ft_state_machine.py
          else
            echo "Unknown test type: ${{ matrix.test_type }}"
            exit 1
          fi