From b3c86bb355a6cd46466a19b86a757f12280e2635 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 2 Dec 2024 18:09:55 -0800 Subject: [PATCH 1/3] Add retry for AMD jobs --- scripts/test-template-aws.j2 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/test-template-aws.j2 b/scripts/test-template-aws.j2 index 718e31f..6068fe9 100644 --- a/scripts/test-template-aws.j2 +++ b/scripts/test-template-aws.j2 @@ -185,6 +185,8 @@ steps: limit: 2 - exit_status: -10 # Agent was lost limit: 2 + - exit_status: 1 # Machine occasionally fails + limit: 2 agents: queue: amd-cpu @@ -204,6 +206,15 @@ steps: {% else %} soft_fail: false {% endif %} + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + - exit_status: 1 # Machine occasionally fails + limit: 2 + {% endif %} {% endfor %} From 3a4b04d47da536020650873bc81c9fa24dabc624 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 3 Dec 2024 13:03:17 -0800 Subject: [PATCH 2/3] Update test-template-fastcheck.j2 --- scripts/test-template-fastcheck.j2 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/test-template-fastcheck.j2 b/scripts/test-template-fastcheck.j2 index a72af1d..c163e1c 100644 --- a/scripts/test-template-fastcheck.j2 +++ b/scripts/test-template-fastcheck.j2 @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" key: image-build agents: - queue: cpu_queue + queue: cpu_queue_premerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." 
@@ -38,9 +38,9 @@ steps: depends_on: image-build agents: {% if step.label == "Documentation Build" %} - queue: small_cpu_queue + queue: small_cpu_queue_premerge {% elif step.no_gpu %} - queue: cpu_queue + queue: cpu_queue_premerge {% elif step.num_gpus == 2 or step.num_gpus == 4 %} queue: gpu_4_queue {% else %} @@ -91,9 +91,9 @@ steps: depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} agents: {% if step.label == "Documentation Build" %} - queue: small_cpu_queue + queue: small_cpu_queue_premerge {% elif step.no_gpu %} - queue: cpu_queue + queue: cpu_queue_premerge {% elif step.num_gpus == 2 or step.num_gpus == 4 %} queue: gpu_4_queue {% else %} @@ -157,7 +157,7 @@ steps: - label: "{{ step.label }}" priority: 10000 agents: - queue: a100-queue + queue: a100_queue soft_fail: {{ step.soft_fail or false }} {% if step.parallelism %} parallelism: {{ step.parallelism }} @@ -212,7 +212,7 @@ steps: - label: "TPU Test" depends_on: ~ agents: - queue: tpu + queue: tpu_queue commands: - if [[ -f ".buildkite/run-tpu-test.sh" ]]; then bash .buildkite/run-tpu-test.sh; fi - yes | docker system prune -a From 807266e6a13991a4b397f9562c46d1e93ac01139 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 3 Dec 2024 13:20:56 -0800 Subject: [PATCH 3/3] Use postmerge repo for main branch --- scripts/test-template-aws.j2 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/test-template-aws.j2 b/scripts/test-template-aws.j2 index 6068fe9..ef98bc7 100644 --- a/scripts/test-template-aws.j2 +++ b/scripts/test-template-aws.j2 @@ -1,4 +1,7 @@ {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} +{% if branch == "main" %} +{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %} +{% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set default_working_dir = "/vllm-workspace/tests" %} {% set hf_home = "/root/.cache/huggingface" %}