Skip to content

[internal] Auto-retry post-commit #1522

[internal] Auto-retry post-commit

[internal] Auto-retry post-commit #1522

# Workflow identity and triggers.
# Runs either manually against an explicitly chosen workflow run (for testing)
# or automatically whenever one of the watched workflows finishes a run.
name: '[internal] Auto-retry post-commit'
on:
  # Manual entry point: point the workflow at a specific run/attempt pair.
  workflow_dispatch:
    inputs:
      test_workflow_run_id:
        description: 'Unique GitHub workflow run ID to use for test'
        default: 12788722730
        type: number
      test_workflow_run_attempt:
        description: 'Run attempt of the workflow run'
        default: 1
        type: number
  # Automatic entry point: fires when a run of a listed workflow completes.
  workflow_run:
    workflows:
      - 'All post-commit tests'
      - '(Single-card) Tests for new models'
    types: [completed]
jobs:
  # Looks at one finished workflow run attempt and, if it failed on an
  # automatic trigger and is still under the attempt cap, asks GitHub to
  # re-run only its failed jobs. Any failure of this job is reported to Slack.
  auto-retry:
    runs-on: ubuntu-latest
    steps:
      # The repository must be checked out for the local
      # ./.github/actions/slack-report action used by the last step.
      - uses: actions/checkout@v4

      # Resolves which run id / attempt number to analyze, depending on
      # whether this workflow was started manually or by a workflow_run event.
      # Outputs: run-id, attempt-number.
      - name: Get workflow run_id attempt number to analyze
        id: get-run-id-and-attempt
        shell: bash
        # Expressions are handed over as environment variables rather than
        # being spliced into the script text, so their values can never be
        # interpreted as shell syntax.
        env:
          EVENT_NAME: ${{ github.event_name }}
          DISPATCH_RUN_ID: ${{ inputs.test_workflow_run_id }}
          DISPATCH_ATTEMPT: ${{ inputs.test_workflow_run_attempt }}
          TRIGGER_RUN_ID: ${{ github.event.workflow_run.id }}
          TRIGGER_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
        run: |
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            run_id="$DISPATCH_RUN_ID"
            attempt_number="$DISPATCH_ATTEMPT"
          elif [[ "$EVENT_NAME" == "workflow_run" ]]; then
            run_id="$TRIGGER_RUN_ID"
            attempt_number="$TRIGGER_ATTEMPT"
          else
            echo "Unknown event name" && exit 1
          fi

          # Validate for both trigger paths: an empty value would otherwise
          # produce a malformed API path in the following steps.
          if [[ -z "$run_id" ]]; then
            echo "run_id is empty"
            exit 1
          fi
          if [[ -z "$attempt_number" ]]; then
            echo "attempt_number is empty"
            exit 1
          fi

          echo "$run_id"
          echo "$attempt_number"
          echo "run-id=$run_id" >> "$GITHUB_OUTPUT"
          echo "attempt-number=$attempt_number" >> "$GITHUB_OUTPUT"
          echo "::notice title=target-workflow-link::The workflow being analyzed is available at ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/$run_id/attempts/$attempt_number"

      # Decides whether the analyzed attempt qualifies for a retry.
      # Output: should-continue ('true' or 'false').
      - name: Determine if we should continue
        id: determine-should-continue
        shell: bash
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ steps.get-run-id-and-attempt.outputs.run-id }}
          ATTEMPT_NUMBER: ${{ steps.get-run-id-and-attempt.outputs.attempt-number }}
        run: |
          MAX_ATTEMPTS=3

          # Fetch the triggering event and the conclusion of the attempt.
          # This is a plain assignment on purpose: if `gh api` fails, the
          # step aborts (bash -e) instead of continuing with empty values
          # and silently deciding not to retry.
          run_info="$(
            gh api \
              "/repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/attempts/${ATTEMPT_NUMBER}" \
              --jq '.event + " " + .conclusion'
          )"
          read -r original_trigger conclusion <<< "$run_info"

          if [[ "$ATTEMPT_NUMBER" -ge "$MAX_ATTEMPTS" ]]; then
            echo "::notice title=no-continue-max-tries::This workflow has exceeded max tries. Not re-trying"
            should_continue=false
          elif [[ "$original_trigger" == "workflow_dispatch" ]]; then
            echo "::notice title=no-continue-is-on-branch::This workflow was a workflow dispatched on a branch, not main. Not re-trying"
            should_continue=false
          elif [[ "$conclusion" != "failure" ]]; then
            echo "::notice title=no-continue-did-not-fail::This workflow did not fail. Not re-trying"
            should_continue=false
          else
            should_continue=true
          fi
          echo "should-continue=$should_continue" >> "$GITHUB_OUTPUT"

      # Requests a re-run of only the failed jobs of the analyzed run.
      # Uses a dedicated token (secrets.RETRY_WORKFLOW_TOKEN) instead of the
      # default github.token used by the read-only query above.
      - name: Re-run failed jobs
        if: ${{ steps.determine-should-continue.outputs.should-continue == 'true' }}
        shell: bash
        env:
          GH_TOKEN: ${{ secrets.RETRY_WORKFLOW_TOKEN }}
          RUN_ID: ${{ steps.get-run-id-and-attempt.outputs.run-id }}
        run: |
          echo "Re-running jobs here"
          # Debug aid: gh api "/repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/timing"
          gh api --method POST \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "/repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/rerun-failed-jobs"

      # Notify Slack when any earlier step in this job failed.
      - uses: ./.github/actions/slack-report
        if: ${{ failure() }}
        with:
          slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
          owner: U014XCQ9CF8  # tt-rkim