From fc9dd17682387b9bcc0361c127a1ab1d325614af Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Wed, 7 Feb 2024 12:03:08 -0800 Subject: [PATCH] fix: add determine models code to e2e, add debug logs for k8s jobs --- .github/workflows/e2e-preset-test.yml | 46 +++++++++++++++++------- .github/workflows/kind-cluster/main.py | 30 ++++++++++++++-- .github/workflows/preset-image-build.yml | 8 ++--- 3 files changed, 65 insertions(+), 19 deletions(-) diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index aa8253d06..784c292bd 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -18,8 +18,10 @@ jobs: determine-models: if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.images.outputs.matrix }} + outputs: + matrix: ${{ steps.affected_models.outputs.matrix }} + is_matrix_empty: ${{ steps.check_matrix_empty.outputs.is_empty }} + full_matrix: ${{ steps.images.outputs.full_matrix }} steps: - name: Checkout uses: actions/checkout@v4 @@ -27,12 +29,30 @@ jobs: submodules: true fetch-depth: 0 - - name: Determine Images for Testing - id: images + # This script should output a JSON array of model names + - name: Determine Affected Models + id: affected_models run: | - echo "Setting image tag based on presets/models/supported_models.yaml" - MATRIX=$(yq e -o=json '.models' presets/models/supported_models.yaml | jq -c) - + PR_BRANCH=${{ github.head_ref }} \ + python3 .github/workflows/kind-cluster/determine_models.py + + - name: Print Determined Models + run: | + echo "Output from determine_models: ${{ steps.affected_models.outputs.matrix }}" + + - name: Check if Matrix is Empty + id: check_matrix_empty + run: | + if [ "${{ steps.affected_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.affected_models.outputs.matrix }}" ]; then + echo "is_empty=true" >> $GITHUB_OUTPUT + else + echo 
"is_empty=false" >> $GITHUB_OUTPUT + fi + + - name: Add Config info for Testing + if: steps.check_matrix_empty.outputs.is_empty == 'false' + id: images + run: | # Read the additional configurations from e2e-preset-configs.json CONFIGS=$(cat .github/e2e-preset-configs.json | jq -c '.matrix.image') @@ -45,19 +65,21 @@ jobs: # COMBINED_MATRIX.append(combined) # break - COMBINED_MATRIX=$(echo $MATRIX | jq --argjson configs "$CONFIGS" -c ' + COMBINED_MATRIX=$(echo '${{ steps.affected_models.outputs.matrix }}' | jq --argjson configs "$CONFIGS" -c ' map(. as $model | $configs[] | select(.name == $model.name) | $model + .) ') - echo "matrix=$COMBINED_MATRIX" >> $GITHUB_OUTPUT + echo "full_matrix=$COMBINED_MATRIX" >> $GITHUB_OUTPUT - name: Print Combined Matrix + if: steps.check_matrix_empty.outputs.is_empty == 'false' run: | echo "Combined Matrix:" - echo '${{ steps.images.outputs.matrix }}' + echo '${{ steps.images.outputs.full_matrix }}' + e2e-preset-tests: - if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' needs: determine-models + if: needs.determine-models.outputs.is_matrix_empty == 'false' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') runs-on: ubuntu-latest environment: e2e-test strategy: @@ -67,7 +89,7 @@ jobs: # {"name":"falcon-40b","type":"text-generation","version":"#", # "runtime":"tfs","tag":"0.0.1","node-count":1, # "node-vm-size":"Standard_NC96ads_A100_v4", "node-osdisk-size":400} - model: ${{fromJson(needs.determine-models.outputs.matrix)}} + model: ${{fromJson(needs.determine-models.outputs.full_matrix)}} steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/kind-cluster/main.py b/.github/workflows/kind-cluster/main.py index d3b0895e3..1db1e1992 100644 --- a/.github/workflows/kind-cluster/main.py +++ b/.github/workflows/kind-cluster/main.py @@ -166,9 +166,31 @@ def populate_job_template(model_name, model_type, model_runtime, 
model_tag, job_ print(f"An error occurred while populating job template: {e}") return None - -def check_job_status(job_name): +def log_job_info(job_name): + """Log information about our Job's pod for debugging.""" + # Find the pod(s) associated with the job + command_find_pods = f"kubectl get pods --selector=job-name=docker-build-job-{job_name} -o jsonpath='{{.items[*].metadata.name}}'" + pod_names = run_command(command_find_pods) + if pod_names: + for pod_name in pod_names.split(): + print(f"Logging info for pod: {pod_name}") + # Log pod description for status, events, etc. + command_describe_pod = f"kubectl describe pod {pod_name}" + pod_description = run_command(command_describe_pod) + print(f"Pod Description: \n{pod_description}") + + # Log the last 100 lines of the pod's logs, adjust as necessary + command_logs = f"kubectl logs {pod_name} --tail=100" + pod_logs = run_command(command_logs) + print(f"Pod Logs: \n{pod_logs}") + else: + print(f"No pods found for job {job_name}.") + +def check_job_status(job_name, iteration): """Check the status of a Kubernetes job.""" + # Log job information on every 10th poll (~5 minutes at the 30-second poll interval) + if iteration % 10 == 0: + log_job_info(job_name) # Query for the specific fields 'succeeded' and 'failed' in the job's status command_succeeded = f"kubectl get job docker-build-job-{job_name} -o jsonpath='{{.status.succeeded}}'" command_failed = f"kubectl get job docker-build-job-{job_name} -o jsonpath='{{.status.failed}}'" @@ -185,12 +207,13 @@ def check_job_status(job_name): def wait_for_jobs_to_complete(job_names, timeout=21600): """Wait for all jobs to complete with a timeout.""" + iteration = 0 start_time = time.time() while time.time() - start_time < timeout: all_completed = True for job_name in job_names: print("Check Job Status: ", job_name) - status = check_job_status(job_name) + status = check_job_status(job_name, iteration) if status != "succeeded": all_completed = False if status == "failed": @@ -201,6 +224,7 @@ def wait_for_jobs_to_complete(job_names, 
timeout=21600): print("All jobs completed successfully.") return True time.sleep(30) # Wait for 30 seconds before checking again + iteration += 1 print("Timeout waiting for jobs to complete.") return False diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml index ee08b7b6f..0c0b4a5c3 100644 --- a/.github/workflows/preset-image-build.yml +++ b/.github/workflows/preset-image-build.yml @@ -31,7 +31,7 @@ jobs: determine-models: runs-on: ubuntu-latest outputs: - matrix: ${{ steps.determine_models.outputs.matrix }} + matrix: ${{ steps.affected_models.outputs.matrix }} is_matrix_empty: ${{ steps.check_matrix_empty.outputs.is_empty }} steps: - name: Checkout @@ -42,19 +42,19 @@ jobs: # This script should output a JSON array of model names - name: Determine Affected Models - id: determine_models + id: affected_models run: | PR_BRANCH=${{ github.head_ref }} \ python3 .github/workflows/kind-cluster/determine_models.py - name: Print Determined Models run: | - echo "Output from determine_models: ${{ steps.determine_models.outputs.matrix }}" + echo "Output from affected_models: ${{ steps.affected_models.outputs.matrix }}" - name: Check if Matrix is Empty id: check_matrix_empty run: | - if [ "${{ steps.determine_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.determine_models.outputs.matrix }}" ]; then + if [ "${{ steps.affected_models.outputs.matrix }}" == "[]" ] || [ -z "${{ steps.affected_models.outputs.matrix }}" ]; then echo "is_empty=true" >> $GITHUB_OUTPUT else echo "is_empty=false" >> $GITHUB_OUTPUT