Skip to content

Commit

Permalink
Retry tasks on exit code 128 to work around intermittent issues with DNS failures
Browse files Browse the repository at this point in the history

This should help make mozilla#549 less painful. I suggest we back it out once we get to the bottom of that.
  • Loading branch information
bhearsum committed May 22, 2024
1 parent 419ff93 commit 23551c0
Show file tree
Hide file tree
Showing 40 changed files with 83 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .taskcluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,10 @@ tasks:

image: mozillareleases/taskgraph:decision-v7.4.0@sha256:eb13ec9b67c93c8394856813bbe69bb876888a5015938d8d811ef3609c00fe21
maxRunTime: 1800
onExitStatus:
retry:
# 128 happens when cloning this repository fails
- 128

command:
- run-task
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/alignments-backtranslated/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ tasks:
env:
SRC: "{src_locale}"
TRG: "{trg_locale}"
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/alignments-original/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ tasks:
env:
SRC: "{src_locale}"
TRG: "{trg_locale}"
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/alignments-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ tasks:
env:
SRC: "{src_locale}"
TRG: "{trg_locale}"
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/analyze-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ tasks:
path: /builds/worker/artifacts
type: directory
env: {}
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/analyze-mono/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ task-defaults:
path: /builds/worker/artifacts
type: directory
env: {}
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/bicleaner-model/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ tasks:
path: /builds/worker/artifacts
type: directory
max-run-time: 86400
# 128 happens when cloning this repository fails
retry-exit-status: [128]


# Don't run unless explicitly scheduled
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/bicleaner/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ tasks:
CUDNN_DIR: fetches/cuda-toolkit
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/cefilter/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ tasks:
ARTIFACT_EXT: zst
SRC: "{src_locale}"
TRG: "{trg_locale}"
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/clean-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ tasks:
# there might be intermittent issues with OpusCleaner, specifically FastText model downloading can fail
retry-exit-status:
- 1
- 17
- 128

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/clean-mono/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ task-defaults:
env:
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/collect-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ tasks:
type: directory
env:
COMPRESSION_CMD: zstdmt
# 128 happens when cloning this repository fails
retry-exit-status: [128]

run:
using: run-task
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/collect-mono-src/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ task-defaults:
type: directory
env:
COMPRESSION_CMD: zstdmt
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/collect-mono-trg/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ task-defaults:
type: directory
env:
COMPRESSION_CMD: zstdmt
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/dataset/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ task-defaults:
- name: public/build
path: /builds/worker/artifacts
type: directory
# 128 happens when cloning this repository fails
retry-exit-status: [128]

run-on-tasks-for: []
run:
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/evaluate-quantized/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ tasks:

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# The task needs to be able to read that secret to publish on Weight & Biases
scopes:
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/evaluate-teacher-ensemble/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ tasks:

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# The task needs to be able to read that secret to publish on Weight & Biases
scopes:
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/evaluate/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ task-defaults:

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# The task needs to be able to read that secret to publish on Weight & Biases
scopes:
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/export/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ tasks:
env:
SRC: "{src_locale}"
TRG: "{trg_locale}"
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/extract-best/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ tasks:
type: directory
env:
COMPRESSION_CMD: zstdmt
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
3 changes: 2 additions & 1 deletion taskcluster/kinds/finetune-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ tasks:
worker:
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
retry-exit-status: [17]
# 128 happens when cloning this repository fails
retry-exit-status: [17, 128]
env:
ARTIFACT_EXT: zst
COMPRESSION_CMD: zstdmt
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/merge-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ task-defaults:
TRG: "{trg_locale}"
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/merge-devset/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ task-defaults:
TRG: "{trg_locale}"
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/merge-mono/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ task-defaults:
env:
LOCALE: "{locale}"
COMPRESSION_CMD: zstdmt
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/merge-translated/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ task-defaults:
TRG: "{trg_locale}"
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/quantize/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ tasks:
env:
SRC: "{src_locale}"
TRG: "{trg_locale}"
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/score/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ tasks:
- name: public/build
path: artifacts
type: directory
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/shortlist/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ tasks:
ARTIFACT_EXT: zst
SRC: "{src_locale}"
TRG: "{trg_locale}"
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/split-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ tasks:
env:
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/split-mono-src/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ task-defaults:
LOCALE: "{locale}"
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/split-mono-trg/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ task-defaults:
LOCALE: "{locale}"
COMPRESSION_CMD: zstdmt
ARTIFACT_EXT: zst
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
3 changes: 3 additions & 0 deletions taskcluster/kinds/tests/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ task-defaults:
run:
using: run-task
cwd: '{checkout}'
worker:
# 128 happens when cloning this repository fails
retry-exit-status: [128]

tasks:
# See issue: https://github.com/mozilla/firefox-translations-training/issues/363
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/toolchain/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ task-defaults:
docker-image: {"in-tree": "toolchain-build"}
max-run-time: 3600
env: {}
# 128 happens when cloning this repository fails
retry-exit-status: [128]
run:
using: toolchain-script

Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/train-backwards/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ tasks:

# Taskcluster proxy is required to read secrets
taskcluster-proxy: true
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# The task needs to be able to read that secret to publish on Weight & Biases
scopes:
Expand Down
3 changes: 2 additions & 1 deletion taskcluster/kinds/train-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ tasks:
worker:
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
retry-exit-status: [17]
# 128 happens when cloning this repository fails
retry-exit-status: [17, 128]
env:
ARTIFACT_EXT: zst
COMPRESSION_CMD: zstdmt
Expand Down
3 changes: 2 additions & 1 deletion taskcluster/kinds/train-teacher/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ tasks:
worker:
max-run-time: 2592000
# train_taskcluster.py exits with 17 if a request to Taskcluster fails
retry-exit-status: [17]
# 128 happens when cloning this repository fails
retry-exit-status: [17, 128]
env:
ARTIFACT_EXT: zst
COMPRESSION_CMD: zstdmt
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/train-vocab/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ tasks:
type: directory
env:
COMPRESSION_CMD: zstdmt
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/translate-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ tasks:
env:
CUDA_DIR: fetches/cuda-toolkit
CUDNN_DIR: fetches/cuda-toolkit
# 128 happens when cloning this repository fails
retry-exit-status: [128]

run:
using: run-task
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/translate-mono-src/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ task-defaults:
env:
CUDA_DIR: fetches/cuda-toolkit
CUDNN_DIR: fetches/cuda-toolkit
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down
2 changes: 2 additions & 0 deletions taskcluster/kinds/translate-mono-trg/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ task-defaults:
env:
CUDA_DIR: fetches/cuda-toolkit
CUDNN_DIR: fetches/cuda-toolkit
# 128 happens when cloning this repository fails
retry-exit-status: [128]

# Don't run unless explicitly scheduled
run-on-tasks-for: []
Expand Down

0 comments on commit 23551c0

Please sign in to comment.