Skip to content

Commit

Permalink
e2e: replace old small job with new medium job
Browse files Browse the repository at this point in the history
This commit adds a new E2E job meant to test the integration
of training library changes with the CLI's "full" train
pipeline, to prevent regressions.

It also updates the relevant Mergify configuration.

Signed-off-by: Nathan Weinberg <[email protected]>
(cherry picked from commit c8439e0)

# Conflicts:
#	.github/workflows/e2e-nvidia-a10g-x1.yml
  • Loading branch information
nathan-weinberg authored and mergify[bot] committed Oct 19, 2024
1 parent 40b8b78 commit eb15557
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 45 deletions.
8 changes: 4 additions & 4 deletions .github/mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,18 @@ pull_request_rules:
# e2e workflow
- or:
- and:
# note this should match the triggering criteria in 'e2e-nvidia-t4-x1.yml'
- check-success=e2e-workflow-complete
# note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
- check-success=e2e-medium-workflow-complete
- or:
- files~=\.py$
- files=pyproject.toml
- files~=^requirements.*\.txt$
- files=.github/workflows/e2e-nvidia-t4-x1.yml
- files=.github/workflows/e2e-nvidia-a10g-x1.yml
- and:
- -files~=\.py$
- -files=pyproject.toml
- -files~=^requirements.*\.txt$
- -files=.github/workflows/e2e-nvidia-t4-x1.yml
- -files=.github/workflows/e2e-nvidia-a10g-x1.yml

# code lint workflow
- or:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA Tesla T4 x1)
name: E2E (NVIDIA A10G x1)

on:
# run against every merge commit to 'main' and release branches
push:
branches:
- main
- release-*
# only run on PRs that touch paths matching certain glob patterns
pull_request_target:
types:
- opened
- synchronize
- reopened
branches:
- main
- release-*
Expand All @@ -20,15 +18,24 @@ on:
- '**.py'
- 'pyproject.toml'
- 'requirements**.txt'
- '.github/workflows/e2e-nvidia-t4-x1.yml' # Follow-on workflow
- '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
LC_ALL: en_US.UTF-8

defaults:
run:
shell: bash

permissions:
contents: read

jobs:
start-runner:
name: Start external EC2 runner
start-medium-ec2-runner:
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
Expand All @@ -40,6 +47,7 @@ jobs:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
Expand All @@ -59,23 +67,21 @@ jobs:
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]
e2e:
name: E2E Test
needs: start-runner
runs-on: ${{ needs.start-runner.outputs.label }}
e2e-medium-test:
needs:
- start-medium-ec2-runner
runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}

# It is important that this job has no write permissions and has
# no access to any secrets. This part (e2e-medium-test) is where we are running
# untrusted code from PRs.
permissions: {}

steps:
# for debugging
- name: Print environment state
- name: Install Packages
run: |
echo "Current Working Directory: $PWD"
echo "Files in Local Directory:"
ls -l
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
- name: Checkout instructlab/instructlab
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
Expand All @@ -93,30 +99,19 @@ jobs:
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

# for debugging
- name: Print environment state
run: |
echo "Current Working Directory: $PWD"
echo "Files in Local Directory:"
ls -l
- name: Fetch and checkout PR
id: fetch_pr
if: github.event_name == 'pull_request_target'
if: ${{ github.event_name == 'pull_request_target' }}
working-directory: ./training
run: |
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
git checkout pr-${{ github.event.pull_request.number }}
- name: Install system packages
run: |
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
- name: Install instructlab
- name: Install ilab
working-directory: ./instructlab
run: |
export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
export PATH="$PATH:$CUDA_HOME/bin"
python3.11 -m venv --upgrade-deps venv
. venv/bin/activate
nvidia-smi
Expand All @@ -127,7 +122,7 @@ jobs:
# https://github.com/instructlab/instructlab/issues/1821
# install with Torch and build dependencies installed
python3.11 -m pip install packaging wheel setuptools-scm
python3.11 -m pip install .[cuda]
python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
- name: Update instructlab-training library
working-directory: ./training
Expand All @@ -136,17 +131,25 @@ jobs:
pip install .
pip install .[cuda]
- name: Check disk
run: |
df -h
# Runs the medium-size end-to-end test from the instructlab checkout.
# The previous revision of this step contained unresolved merge-conflict
# markers from the cherry-pick, which bash cannot execute; the conflict is
# resolved to the incoming side, matching this workflow's medium job.
- name: Run e2e test
  working-directory: ./instructlab
  run: |
    # activate the virtualenv created in the earlier install step
    . venv/bin/activate
    ./scripts/e2e-ci.sh -m

stop-runner:
stop-medium-ec2-runner:
name: Stop external EC2 runner
needs:
- start-runner
- e2e
- start-medium-ec2-runner
- e2e-medium-test
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
Expand All @@ -161,13 +164,13 @@ jobs:
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

e2e-workflow-complete:
e2e-medium-workflow-complete:
# we don't want to block PRs on failed EC2 cleanup
# so not requiring "stop-medium-ec2-runner" as well
needs: ["start-runner", "e2e"]
needs: ["start-medium-ec2-runner", "e2e-medium-test"]
runs-on: ubuntu-latest
steps:
- name: E2E Workflow Complete
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# InstructLab Training Library

![Lint](https://github.com/instructlab/training/actions/workflows/lint.yml/badge.svg?branch=main)
![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
![Build](https://github.com/instructlab/training/actions/workflows/pypi.yaml/badge.svg?branch=main)
![Release](https://img.shields.io/github/v/release/instructlab/training)
![License](https://img.shields.io/github/license/instructlab/training)
Expand Down

0 comments on commit eb15557

Please sign in to comment.