Skip to content

.ci/aws: Move p3dn's to ap-northeast-1 #28

.ci/aws: Move p3dn's to ap-northeast-1

.ci/aws: Move p3dn's to ap-northeast-1 #28

Workflow file for this run

# Workflow-level settings shared by every job in this file.
name: PR CI

# Triggered by pushes and by pull request events.
on:
  - push
  - pull_request

env:
  # Space-separated apt packages installed by the Ubuntu-based jobs.
  APT_PACKAGES: build-essential git libhwloc-dev make

# Cancel an in-progress run when a newer one starts in the same group.
# The group key is github.head_ref when it is set and the run id otherwise.
# NOTE(review): the key has no workflow-name prefix; confirm no other workflow
# in the repository uses the same group expression.
concurrency:
  group: ${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
# Job (child of `jobs:`): configure, build and `make install` the plugin on
# Amazon Linux for each EFA installer version, against the CUDA SDK and the
# libfabric / Open MPI prefixes under /opt/amazon.
amazonlinux:
  strategy:
    matrix:
      sdk:
        - cuda
      amazonlinux:
        - al2023
        - al2
      # Substituted into the EFA installer tarball name in the first step.
      efainstaller:
        - latest
        - '1.32.0'
        - '1.31.0'
        - '1.30.0'
      # Per-distribution values, added to every combination with a matching
      # `amazonlinux` value.
      include:
        - amazonlinux: al2023
          efainstallerdir: ALINUX2023
          nvidiadistro: fedora37
          configmanager: dnf config-manager
          cudapackages: cuda-cudart-devel-12-3 cuda-driver-devel-12-3
        - amazonlinux: al2
          efainstallerdir: ALINUX2
          nvidiadistro: rhel7
          configmanager: yum-config-manager
          cudapackages: cuda-cudart-devel-12-3 cuda-driver-devel-12-3
  runs-on: codebuild-ghactions-${{ matrix.amazonlinux }}-${{ github.run_id }}-${{ github.run_attempt }}
  name: ${{matrix.amazonlinux}}/${{ matrix.sdk }}/efa@${{ matrix.efainstaller }}/makeinstall
  steps:
    # note, do not bump to v4: https://github.com/actions/checkout/issues/1590
    - uses: actions/checkout@v3
    - name: Fetch and Install EFA Installer Dependencies
      run: |
        # --fail: stop on an HTTP error instead of saving the error page as
        # the tarball and failing later in tar.
        curl --fail -O https://efa-installer.amazonaws.com/aws-efa-installer-${{ matrix.efainstaller }}.tar.gz
        tar -xf aws-efa-installer-*.tar.gz
        cd aws-efa-installer/RPMS/${{ matrix.efainstallerdir }}/x86_64
        # Install every RPM below this directory in a single transaction.
        find . -name '*.rpm' -print0 | xargs -0 sudo yum -y localinstall
    - name: Install hwloc, utilities.
      run: |
        sudo yum -y install hwloc-devel yum-utils
    - name: Add EPEL
      if: matrix.amazonlinux == 'al2'
      run: |
        sudo yum -y install \
          https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
    - name: Install CUDA
      run: |
        # Fetch the repository definition over HTTPS rather than plain HTTP.
        sudo ${{ matrix.configmanager }} --add-repo \
          https://developer.download.nvidia.com/compute/cuda/repos/${{ matrix.nvidiadistro }}/x86_64/cuda-${{ matrix.nvidiadistro }}.repo \
          --save
        sudo yum -y clean expire-cache
        sudo yum -y install ${{ matrix.cudapackages }}
    - name: Call `autoreconf -ivf`
      run: ./autogen.sh
    - name: Call `./configure`
      run: |
        ./configure --prefix=/opt/aws-ofi-nccl --with-mpi=/opt/amazon/openmpi \
          --with-libfabric=/opt/amazon/efa \
          --with-cuda=/usr/local/cuda \
          --enable-tests=no \
          --enable-platform-aws
    - name: Call `make`
      # Bound parallelism to the CPU count; a bare `make -j` spawns an
      # unlimited number of jobs.
      run: make -j "$(nproc)"
    - name: Call `make install`
      run: sudo make install
# Job (child of `jobs:`): build the plugin on Ubuntu 22.04 against libfabric
# from git and run `make distcheck`, across compiler, tracing and SDK
# combinations.
distcheck:
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      cc-variant:
        - latest
        - legacy
      cc:
        - gcc
        - clang
      tracing:
        - lttng
        - none
      sdk:
        - cuda
        - neuron
      include:
        # `cc-version` selects the package installed for the `latest`
        # variant; `realcc` is the matching versioned binary that the build
        # steps export as CC. Without `realcc` they fall back to the
        # unversioned `matrix.cc` and the installed compiler goes unused.
        - cc-variant: latest
          cc: gcc
          cc-version: 13
          realcc: gcc-13
        # clang/latest deliberately has no entry while it is excluded below:
        # includes are processed after excludes, so an entry matching no
        # remaining combination is added back as a standalone job with no
        # `sdk` or `tracing` value. When the exclude is lifted, restore it as
        #   cc-variant: latest / cc: clang / cc-version: 19 / realcc: clang-19
      exclude:
        # fails with
        # > checking if __builtin_expect is available... no
        # > configure: error: __builtin_expect not available
        # clang-19 definitely supports this, so it's something in our m4.
        - cc: clang
          cc-variant: latest
  name: u2204/${{ matrix.sdk }}/libfabric@git/${{matrix.cc}}(${{matrix.cc-variant}})/distcheck/
  steps:
    - uses: actions/checkout@v4
    - name: Configure Neuron SDK Repository
      if: matrix.sdk == 'neuron'
      run: |
        # Configure Linux for Neuron repository updates
        sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null << EOF
        deb https://apt.repos.neuron.amazonaws.com jammy main
        EOF
        wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
        sudo apt update -y
    - name: Add compiler repositories
      if: matrix.cc-variant == 'latest'
      run: |
        if [ "${{ matrix.cc }}" == "clang" ]; then
          wget https://apt.llvm.org/llvm.sh
          chmod +x llvm.sh
          # Delete the last line, allowing us to use the cache below for
          # actually installing the package; this just adds the
          # repository.
          sed -i '$ d' llvm.sh
          sudo ./llvm.sh ${{ matrix.cc-version }}
        fi
        if [ "${{ matrix.cc }}" == "gcc" ]; then
          sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
        fi
    - name: Install Latest Compiler
      uses: awalsh128/cache-apt-pkgs-action@latest
      if: matrix.cc-variant == 'latest'
      with:
        packages: ${{ matrix.cc }}-${{matrix.cc-version}}
        version: compiler-${{ matrix.cc }}-${{matrix.cc-version}}
    - name: Install Base Dependencies
      uses: awalsh128/cache-apt-pkgs-action@latest
      with:
        packages: ${{ env.APT_PACKAGES }}
        version: base-packages
    - name: Install CUDA SDK
      if: matrix.sdk == 'cuda'
      uses: awalsh128/cache-apt-pkgs-action@latest
      with:
        packages: nvidia-cuda-toolkit
        version: cuda-packages
    - name: Install Neuron SDK
      if: matrix.sdk == 'neuron'
      uses: awalsh128/cache-apt-pkgs-action@latest
      with:
        packages: aws-neuronx-runtime-lib
        version: neuron-packages
    - name: Install lttng
      uses: awalsh128/cache-apt-pkgs-action@latest
      if: matrix.tracing == 'lttng'
      with:
        packages: liblttng-ust-dev
        version: lttng
    - name: Install Libfabric
      run: |
        # We're just doing distchecks, so it is fine if we
        # just grab the latest master and built a lean build.
        git clone --depth 1 https://github.com/ofiwg/libfabric.git
        pushd libfabric
        ./autogen.sh
        export CC="${{ matrix.realcc || matrix.cc }}"
        ./configure --prefix="$PWD/install" \
          --disable-sockets \
          --disable-udp \
          --disable-mrail \
          --disable-opx
        make -j "$(nproc)"
        make install
        popd
    - name: Build Plugin
      run: |
        set -x
        export CC="${{ matrix.realcc || matrix.cc }}"
        # actions/checkout@v4 would drop the plugin source in $PWD,
        # so go ahead and build it.
        ./autogen.sh
        if [ "${{ matrix.sdk }}" == "cuda" ]
        then
          ./configure --with-libfabric="$PWD/libfabric/install" \
            --with-cuda=/usr/local/cuda/ \
            --enable-platform-aws
        else
          ./configure --with-libfabric="$PWD/libfabric/install" \
            --enable-neuron \
            --enable-platform-aws
        fi
        make -j "$(nproc)"
    - name: Run Dist Check
      run: make distcheck
    - name: Upload build logs
      if: failure()
      uses: actions/upload-artifact@v4
      with:
        # The name covers every matrix axis (including tracing) so that
        # failing jobs never upload under the same artifact name.
        name: ${{ matrix.cc }}-${{ matrix.cc-variant }}-${{ matrix.tracing }}-${{ matrix.sdk }}-config.log
        path: config.log
        if-no-files-found: ignore
# Job (child of `jobs:`): run CodeChecker static analysis on the plugin, once
# per SDK, after the distcheck job has completed.
codechecker:
  name: CodeChecker - ${{ matrix.sdk }}
  runs-on: ubuntu-22.04
  needs:
    - distcheck
  strategy:
    matrix:
      sdk:
        - cuda
        - neuron
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: '3.9'

    # Neuron packages come from a separate apt repository, so register it
    # before the package install steps below.
    - name: Configure Neuron SDK Repository
      if: matrix.sdk == 'neuron'
      run: |
        # Configure Linux for Neuron repository updates
        sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null << EOF
        deb https://apt.repos.neuron.amazonaws.com jammy main
        EOF
        wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
        sudo apt update -y

    - name: Install Base Dependencies
      uses: awalsh128/cache-apt-pkgs-action@latest
      with:
        version: base-packages
        packages: ${{ env.APT_PACKAGES }}

    # Exactly one of the two SDK install steps runs, selected by matrix.sdk.
    - name: Install CUDA SDK
      if: matrix.sdk == 'cuda'
      uses: awalsh128/cache-apt-pkgs-action@latest
      with:
        version: cuda-packages
        packages: nvidia-cuda-toolkit

    - name: Install Neuron SDK
      if: matrix.sdk == 'neuron'
      uses: awalsh128/cache-apt-pkgs-action@latest
      with:
        version: neuron-packages
        packages: aws-neuronx-runtime-lib

    - name: Install cppcheck
      uses: awalsh128/cache-apt-pkgs-action@latest
      with:
        version: codechecker-cppcheck
        packages: cppcheck

    # Shallow-clone libfabric and install it into ./libfabric/install, which
    # the configure step below passes as --with-libfabric.
    - name: Install Libfabric
      run: |
        # We're just doing distchecks, so it is fine if we
        # just grab the latest master and built a lean build.
        git clone --depth 1 https://github.com/ofiwg/libfabric.git
        pushd libfabric
        ./autogen.sh
        ./configure --prefix="$PWD/install" \
          --disable-sockets \
          --disable-udp \
          --disable-mrail \
          --disable-opx
        make -j "$(nproc)"
        make install
        popd

    # Only configure here; the build itself is driven by CodeChecker through
    # its `build-command` input.
    - name: Run Configure
      run: |
        ./autogen.sh
        if [ "${{ matrix.sdk }}" == "neuron" ]; then
          ./configure \
            --with-libfabric="$PWD/libfabric/install" \
            --enable-neuron \
            --enable-platform-aws
        else
          ./configure \
            --with-libfabric="$PWD/libfabric/install" \
            --with-cuda=/usr/local/cuda/ \
            --enable-platform-aws
        fi

    - name: Run CodeChecker
      id: codechecker
      uses: whisperity/codechecker-analysis-action@v1
      with:
        config: .github/codechecker.yaml
        ctu: true
        build-command: make

    - name: Save CodeChecker HTML output.
      uses: actions/upload-artifact@v4
      with:
        name: CodeChecker Bug Reports for ${{ matrix.sdk }}
        path: ${{ steps.codechecker.outputs.result-html-dir }}/*.html

    # Emits an error annotation when the analysis reports warnings in the
    # diff.
    # NOTE(review): the script ends with `exit 0`, so the step (and job) still
    # succeeds despite its name; confirm that non-blocking is intended.
    - name: CodeChecker Pass Or Fail?
      if: steps.codechecker.outputs.warnings-in-diff == 'true'
      shell: bash
      run: |
        echo "::error title=Static Analyzers Failed::Analysed commit(s) caused static analysis warnings"
        exit 0