Skip to content

Commit

Permalink
Merge branch 'main' into dpykhtar/nemo2_transition
Browse files Browse the repository at this point in the history
Signed-off-by: Dmytro Pykhtar <[email protected]>
  • Loading branch information
dimapihtar authored Feb 10, 2025
2 parents 0578d38 + d23e1d6 commit 91bc1ad
Show file tree
Hide file tree
Showing 105 changed files with 7,822 additions and 1,152 deletions.
9 changes: 9 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# flake8 configuration: only the checks listed under `select` are enforced.
[flake8]
max-line-length = 119
# Each description sits on its own comment line rather than after the code:
# flake8 >= 6.0 no longer strips inline `#` comments from option values and
# rejects the leftover text as an invalid error code. Continuation lines must
# stay indented so they are read as part of the `select` value.
select =
    # f-string without any placeholders
    F541,
    # local variable 'x' is assigned to but never used
    F841,
    # 'x' imported but unused
    F401,
    # ambiguous variable name 'l'
    E741,
    # undefined name 'x'
    F821,
    # too many leading '#' for block comment
    E266,
1 change: 1 addition & 0 deletions .flake8.other
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[flake8]
max-line-length = 119
select =
F541, # f-string without any placeholders
F841, # local variable 'x' is assigned to but never used
Expand Down
1 change: 1 addition & 0 deletions .flake8.speech
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[flake8]
max-line-length = 119
select =
F541, # f-string without any placeholders
F841, # local variable 'x' is assigned to but never used
Expand Down
275 changes: 138 additions & 137 deletions .github/workflows/_test_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,150 +37,151 @@ on:
value: ${{ jobs.main.outputs.conclusion }}
log:
description: Last 2000 characters of the test step's log
value: ${{ jobs.main.outputs.log }}
value: ${{ jobs.main.outputs.log }}
potential_infra_failure:
description: Boolean flag when infra-related keyword spotted in logs.
value: ${{ jobs.main.outputs.potential_infra_failure }}
coverage_report:
description: Key of coverage_report artifact
value: ${{ jobs.main.outputs.coverage_report }}
jobs:

main:
runs-on: ${{ inputs.RUNNER }}
runs-on: ${{ inputs.RUNNER }}
outputs:
conclusion: ${{ steps.main.conclusion }}
log: ${{ steps.main.outputs.log }}
potential_infra_failure: ${{ steps.main.outputs.potential_infra_failure }}
coverage_report: ${{ steps.main.outputs.coverage_report }}
conclusion: ${{ steps.check.conclusion }}
log: ${{ steps.check.outputs.log }}
potential_infra_failure: ${{ steps.check.outputs.potential_infra_failure }}
env:
DIR: ${{ github.run_id }}
steps:
- name: Docker system cleanup
run: |
docker system prune -af --filter "until=24h" --force || true
- name: Docker pull image
run: |
docker pull nemoci.azurecr.io/nemo_container:${{ github.run_id }}
- name: Start container
env:
DIR: ${{ github.run_id }}
run: |
mkdir -p $DIR
ARG=("")
if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
ARG=("--runtime=nvidia --gpus all")
fi
cmd=$(cat <<RUN_TEST_EOF
#!/bin/bash
docker container rm -f nemo_container_${{ github.run_id }} || true
docker run \
--rm \
-d \
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
--shm-size=64g \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/HF_HOME \
--volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
RUN_TEST_EOF
)
- name: Docker system cleanup
run: |
docker system prune -af --filter "until=24h" --force || true
- name: Docker pull image
run: |
docker pull nemoci.azurecr.io/nemo_container:${{ github.run_id }}
- name: Start container
env:
DIR: ${{ github.run_id }}
run: |
mkdir -p $DIR
ARG=("")
if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
ARG=("--runtime=nvidia --gpus all")
fi
cmd=$(cat <<RUN_TEST_EOF
#!/bin/bash
docker container rm -f nemo_container_${{ github.run_id }} || true
docker run \
--rm \
-d \
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
--shm-size=64g \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/HF_HOME \
--volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
RUN_TEST_EOF
)
echo "$cmd" | tee "$DIR/retry_job.sh"
bash $DIR/retry_job.sh
- name: Create run-script
env:
DIR: ${{ github.run_id }}
SCRIPT: ${{ inputs.SCRIPT }}
id: create
run: |
SCRIPT=$(echo "$SCRIPT" | grep -v '^#')
SCRIPT=$(perl -pe 'chomp if eof' <<< "$SCRIPT")
mkdir -p $DIR
rm $DIR/.coverage || true
rm $DIR/err.log || true
cmd=$(cat <<RUN_TEST_EOF
#!/bin/bash
echo "$cmd" | tee "$DIR/retry_job.sh"
bash $DIR/retry_job.sh
- name: Create run-script
env:
DIR: ${{ github.run_id }}
SCRIPT: ${{ inputs.SCRIPT }}
id: create
run: |
SCRIPT=$(echo "$SCRIPT" | grep -v '^#')
SCRIPT=$(perl -pe 'chomp if eof' <<< "$SCRIPT")
mkdir -p $DIR
rm $DIR/.coverage || true
rm $DIR/err.log || true
cmd=$(cat <<RUN_TEST_EOF
#!/bin/bash
(
set -e
docker exec nemo_container_${{ github.run_id }} bash -c '$SCRIPT && echo "Finished successfully." || echo "Did not finish."'
) 2>&1 | tee $DIR/err.log
RUN_TEST_EOF
)
echo "timeout_in_seconds=$(( ${{ inputs.TIMEOUT }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
echo "$cmd" | tee "$DIR/job.sh"
- name: Run main script
uses: nick-fields/retry@v3
with:
timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
max_attempts: 3
shell: bash
retry_on: timeout
command: /bin/bash ${{ github.run_id }}/job.sh
on_retry_command: /bin/bash ${{ github.run_id }}/retry_job.sh

- name: Check result
id: check
env:
SAVE_COVERAGE_REPORT: ${{ inputs.SAVE_COVERAGE_REPORT }}
run: |
cat $DIR/err.log
log=$(tail -c 2000 $DIR/err.log | base64 -w 0)
echo "log=$log" >> "$GITHUB_OUTPUT"
potential_infra_failure=$(cat $DIR/err.log | grep -Eqiw "device" && echo true || echo false)
echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
coverage_report=coverage-${{ github.run_id }}-$(uuidgen)
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
if [[ "$SAVE_COVERAGE_REPORT" == "true" ]]; then
docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage $DIR/.coverage
docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml $DIR/coverage.xml
fi
IS_SUCCESS=$(tail -n 1 $DIR/err.log | grep -q "Finished successfully." && echo "true" || echo "false")
if [[ "$IS_SUCCESS" == "false" ]]; then
echo Test did not finish successfully.
exit 1
fi
exit $EXIT_CODE
- name: Upload artifacts
uses: actions/upload-artifact@v4
if: inputs.SAVE_COVERAGE_REPORT == true
with:
name: ${{ steps.check.outputs.coverage_report }}
path: |
${{ github.run_id }}/coverage.xml
${{ github.run_id }}/.coverage
include-hidden-files: true

- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: failure() && inputs.IS_OPTIONAL == false && !contains(github.event.pull_request.labels.*.name, 'no-fail-fast')
- name: after_script
if: always() && inputs.AFTER_SCRIPT != ':'
run: |
docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
- name: Container shutdown
if: always()
run: |
docker container rm -f nemo_container_${{ github.run_id }} || true
(
set -e
docker exec nemo_container_${{ github.run_id }} bash -c '$SCRIPT && echo "Finished successfully." || echo "Did not finish."'
) 2>&1 | tee $DIR/err.log
RUN_TEST_EOF
)
echo "timeout_in_seconds=$(( ${{ inputs.TIMEOUT }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
echo "$cmd" | tee "$DIR/job.sh"
- name: Run main script
uses: nick-fields/retry@v3
with:
timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
max_attempts: 3
shell: bash
retry_on: timeout
command: /bin/bash ${{ github.run_id }}/job.sh
on_retry_command: /bin/bash ${{ github.run_id }}/retry_job.sh

- name: Check result
id: check
env:
SAVE_COVERAGE_REPORT: ${{ inputs.SAVE_COVERAGE_REPORT }}
IS_OPTIONAL: ${{ inputs.IS_OPTIONAL }}
run: |
cat $DIR/err.log
log=$(tail -c 2000 $DIR/err.log | base64 -w 0)
echo "log=$log" >> "$GITHUB_OUTPUT"
potential_infra_failure=$(cat $DIR/err.log | grep -Eqiw "device" && echo true || echo false)
echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
coverage_report=coverage-${{ github.run_id }}-$(uuidgen)
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
if [[ "$SAVE_COVERAGE_REPORT" == "true" ]]; then
docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage $DIR/.coverage
docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml $DIR/coverage.xml
fi
IS_SUCCESS=$(tail -n 1 $DIR/err.log | grep -q "Finished successfully." && echo "true" || echo "false")
if [[ "$IS_SUCCESS" == "false" && "$IS_OPTIONAL" == "true" ]]; then
echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
IS_SUCCESS=true
fi
if [[ "$IS_SUCCESS" == "false" ]]; then
echo Test did not finish successfully.
exit 1
fi
exit $EXIT_CODE
- name: Upload artifacts
uses: actions/upload-artifact@v4
if: inputs.SAVE_COVERAGE_REPORT == true
with:
name: ${{ steps.check.outputs.coverage_report }}
path: |
${{ github.run_id }}/coverage.xml
${{ github.run_id }}/.coverage
include-hidden-files: true

- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: failure() && inputs.IS_OPTIONAL == false && !contains(github.event.pull_request.labels.*.name, 'no-fail-fast')
- name: after_script
if: always() && inputs.AFTER_SCRIPT != ':'
run: |
docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
- name: Container shutdown
if: always()
run: |
docker container rm -f nemo_container_${{ github.run_id }} || true
Loading

0 comments on commit 91bc1ad

Please sign in to comment.