Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Part 5.5 - Add determine models code to e2e #236

Merged
merged 1 commit into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions .github/workflows/e2e-preset-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,41 @@ jobs:
determine-models:
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.images.outputs.matrix }}
outputs:
matrix: ${{ steps.affected_models.outputs.matrix }}
is_matrix_empty: ${{ steps.check_matrix_empty.outputs.is_empty }}
full_matrix: ${{ steps.images.outputs.full_matrix }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0

- name: Determine Images for Testing
id: images
# This script should output a JSON array of model names
- name: Determine Affected Models
id: affected_models
run: |
echo "Setting image tag based on presets/models/supported_models.yaml"
MATRIX=$(yq e -o=json '.models' presets/models/supported_models.yaml | jq -c)

PR_BRANCH=${{ github.head_ref }} \
python3 .github/workflows/kind-cluster/determine_models.py

- name: Print Determined Models
run: |
echo "Output from determine_models: ${{ steps.affected_models.outputs.matrix }}"

- name: Check if Matrix is Empty
id: check_matrix_empty
run: |
if [ "${{ steps.affected_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.affected_models.outputs.matrix }}" ]; then
echo "is_empty=true" >> $GITHUB_OUTPUT
else
echo "is_empty=false" >> $GITHUB_OUTPUT
fi

- name: Add Config info for Testing
if: steps.check_matrix_empty.outputs.is_empty == 'false'
id: images
run: |
# Read the additional configurations from e2e-preset-configs.json
CONFIGS=$(cat .github/e2e-preset-configs.json | jq -c '.matrix.image')

Expand All @@ -45,19 +65,21 @@ jobs:
# COMBINED_MATRIX.append(combined)
# break

COMBINED_MATRIX=$(echo $MATRIX | jq --argjson configs "$CONFIGS" -c '
COMBINED_MATRIX=$(echo ${{ steps.affected_models.outputs.matrix }} | jq --argjson configs "$CONFIGS" -c '
map(. as $model | $configs[] | select(.name == $model.name) | $model + .)
')

echo "matrix=$COMBINED_MATRIX" >> $GITHUB_OUTPUT
echo "full_matrix=$COMBINED_MATRIX" >> $GITHUB_OUTPUT

- name: Print Combined Matrix
if: steps.check_matrix_empty.outputs.is_empty == 'false'
run: |
echo "Combined Matrix:"
echo '${{ steps.images.outputs.matrix }}'
echo '${{ steps.images.outputs.full_matrix }}'

e2e-preset-tests:
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
needs: determine-models
if: needs.determine-models.outputs.is_matrix_empty == 'false' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success')
runs-on: ubuntu-latest
environment: e2e-test
strategy:
Expand All @@ -67,7 +89,7 @@ jobs:
# {"name":"falcon-40b","type":"text-generation","version":"#",
# "runtime":"tfs","tag":"0.0.1","node-count":1,
# "node-vm-size":"Standard_NC96ads_A100_v4", "node-osdisk-size":400}
model: ${{fromJson(needs.determine-models.outputs.matrix)}}
model: ${{fromJson(needs.determine-models.outputs.full_matrix)}}
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down
30 changes: 27 additions & 3 deletions .github/workflows/kind-cluster/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,31 @@ def populate_job_template(model_name, model_type, model_runtime, model_tag, job_
print(f"An error occurred while populating job template: {e}")
return None


def check_job_status(job_name):
def log_job_info(job_name):
    """Print debugging details for every pod that belongs to a build job.

    Pods are looked up with the label selector
    ``job-name=docker-build-job-{job_name}``. For each pod found, the output
    of ``kubectl describe pod`` and the last 100 lines of ``kubectl logs``
    are printed. If the lookup yields nothing, a "No pods found" message is
    printed instead.

    Args:
        job_name: Suffix appended to ``docker-build-job-`` to form the
            Kubernetes job name used in the selector.
    """
    # NOTE(review): run_command is defined elsewhere in this file; it
    # presumably returns the command's stdout as a string (or a falsy value
    # on failure) -- confirm against its definition.
    found = run_command(
        f"kubectl get pods --selector=job-name=docker-build-job-{job_name} -o jsonpath='{{.items[*].metadata.name}}'"
    )

    # Guard clause: nothing to describe when the selector matched no pods.
    if not found:
        print(f"No pods found for job {job_name}.")
        return

    # The jsonpath expression emits pod names separated by whitespace.
    for pod_name in found.split():
        print(f"Logging info for pod: {pod_name}")

        # Pod description carries status, events, etc.
        description = run_command(f"kubectl describe pod {pod_name}")
        print(f"Pod Description: \n{description}")

        # Only the last 100 log lines are fetched; adjust --tail as necessary.
        logs = run_command(f"kubectl logs {pod_name} --tail=100")
        print(f"Pod Logs: \n{logs}")

def check_job_status(job_name, iteration):
    """Check the status of a Kubernetes job.

    Args:
        job_name: Suffix appended to ``docker-build-job-`` to form the name
            of the Kubernetes job that is queried.
        iteration: Zero-based count of polling passes made by the caller;
            used only to throttle the debug logging below.
    """
    # Log job information on every 10th polling pass (including the first,
    # iteration 0). The caller sleeps 30 seconds between passes, so this is
    # roughly every 5 minutes. The previous condition `iteration % 10` was
    # truthy on every pass EXCEPT multiples of 10, i.e. it logged on 9 of 10
    # passes and never on the 10th.
    if iteration % 10 == 0:
        log_job_info(job_name)
    # Query for the specific fields 'succeeded' and 'failed' in the job's status
    command_succeeded = f"kubectl get job docker-build-job-{job_name} -o jsonpath='{{.status.succeeded}}'"
    command_failed = f"kubectl get job docker-build-job-{job_name} -o jsonpath='{{.status.failed}}'"
Expand All @@ -185,12 +207,13 @@ def check_job_status(job_name):

def wait_for_jobs_to_complete(job_names, timeout=21600):
"""Wait for all jobs to complete with a timeout."""
iteration = 0
start_time = time.time()
while time.time() - start_time < timeout:
all_completed = True
for job_name in job_names:
print("Check Job Status: ", job_name)
status = check_job_status(job_name)
status = check_job_status(job_name, iteration)
if status != "succeeded":
all_completed = False
if status == "failed":
Expand All @@ -201,6 +224,7 @@ def wait_for_jobs_to_complete(job_names, timeout=21600):
print("All jobs completed successfully.")
return True
time.sleep(30) # Wait for 30 seconds before checking again
iteration += 1
print("Timeout waiting for jobs to complete.")
return False

Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/preset-image-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
determine-models:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.determine_models.outputs.matrix }}
matrix: ${{ steps.affected_models.outputs.matrix }}
is_matrix_empty: ${{ steps.check_matrix_empty.outputs.is_empty }}
steps:
- name: Checkout
Expand All @@ -42,19 +42,19 @@ jobs:

# This script should output a JSON array of model names
- name: Determine Affected Models
id: determine_models
id: affected_models
run: |
PR_BRANCH=${{ github.head_ref }} \
python3 .github/workflows/kind-cluster/determine_models.py

- name: Print Determined Models
run: |
echo "Output from determine_models: ${{ steps.determine_models.outputs.matrix }}"
echo "Output from affected_models: ${{ steps.affected_models.outputs.matrix }}"

- name: Check if Matrix is Empty
id: check_matrix_empty
run: |
if [ "${{ steps.determine_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.determine_models.outputs.matrix }}" ]; then
if [ "${{ steps.affected_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.affected_models.outputs.matrix }}" ]; then
echo "is_empty=true" >> $GITHUB_OUTPUT
else
echo "is_empty=false" >> $GITHUB_OUTPUT
Expand Down
Loading