From d2a0e426e79b789a51112026f8c2f1327aad4cfc Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi <domenic@neuralmagic.com>
Date: Mon, 13 May 2024 15:13:18 +0000
Subject: [PATCH 1/5] Include TEST-MULTI job in weekly runs

---
 .github/workflows/build-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index 0fee152825309..df2acd54b17af 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -139,7 +139,7 @@ jobs:
 
     TEST-MULTI:
         needs: [BUILD]
-        if: success() && contains(fromJSON('["NIGHTLY", "RELEASE"]'), inputs.wf_category)
+        if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category)
         uses: ./.github/workflows/test.yml
         with:
             test_label: ${{ inputs.test_label_multi }}

From 25bfdaf4169f38f126b4d45ab6d9c49725b5ea3f Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi <domenic@neuralmagic.com>
Date: Mon, 13 May 2024 15:14:09 +0000
Subject: [PATCH 2/5] Unify naming for accuracy (lm-eval) jobs

---
 .github/workflows/build-test.yml                              | 4 ++--
 .../{nm-lm-eval-accuracy.yml => nm-test-accuracy-full.yml}    | 4 ++--
 .../{nm-lm-eval-smoke.yml => nm-test-accuracy-smoke.yml}      | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename .github/workflows/{nm-lm-eval-accuracy.yml => nm-test-accuracy-full.yml} (98%)
 rename .github/workflows/{nm-lm-eval-smoke.yml => nm-test-accuracy-smoke.yml} (98%)

diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index df2acd54b17af..2100261c0e48a 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -180,7 +180,7 @@ jobs:
     TEST-ACCURACY-SMOKE:
       needs: [BUILD]
       if: inputs.wf_category == 'NIGHTLY'
-      uses: ./.github/workflows/nm-lm-eval-smoke.yml
+      uses: ./.github/workflows/nm-test-accuracy-smoke.yml
       with:
         label: ${{ inputs.test_label_solo }}
         timeout: ${{ inputs.benchmark_timeout }}
@@ -194,7 +194,7 @@ jobs:
     TEST-ACCURACY-FULL:
       needs: [BUILD]
       if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
-      uses: ./.github/workflows/nm-lm-eval-accuracy.yml
+      uses: ./.github/workflows/nm-test-accuracy-full.yml
       with:
         label: ${{ inputs.test_label_multi }}
         timeout: ${{ inputs.benchmark_timeout }}
diff --git a/.github/workflows/nm-lm-eval-accuracy.yml b/.github/workflows/nm-test-accuracy-full.yml
similarity index 98%
rename from .github/workflows/nm-lm-eval-accuracy.yml
rename to .github/workflows/nm-test-accuracy-full.yml
index f1612ad81d0c2..07f736cbdfbfa 100644
--- a/.github/workflows/nm-lm-eval-accuracy.yml
+++ b/.github/workflows/nm-test-accuracy-full.yml
@@ -1,4 +1,4 @@
-name: nm-lm-eval-accuracy
+name: nm-test-accuracy-full
 on:
   # makes workflow reusable
   workflow_call:
@@ -68,7 +68,7 @@ env:
   VENV_BASE: "LM_EVAL"
 
 jobs:
-  LM-EVAL-FULL:
+  TEST-ACCURACY-FULL:
 
     runs-on: ${{ inputs.label }}
     timeout-minutes: ${{ fromJSON(inputs.timeout) }}
diff --git a/.github/workflows/nm-lm-eval-smoke.yml b/.github/workflows/nm-test-accuracy-smoke.yml
similarity index 98%
rename from .github/workflows/nm-lm-eval-smoke.yml
rename to .github/workflows/nm-test-accuracy-smoke.yml
index ddedc1110f796..994a33a1beba9 100644
--- a/.github/workflows/nm-lm-eval-smoke.yml
+++ b/.github/workflows/nm-test-accuracy-smoke.yml
@@ -1,4 +1,4 @@
-name: nm-lm-eval-smoke
+name: nm-test-accuracy-smoke
 on:
   # makes workflow reusable
   workflow_call:
@@ -68,7 +68,7 @@ env:
   VENV_BASE: "LM_EVAL"
 
 jobs:
-  LM-EVAL-SMOKE:
+  TEST-ACCURACY-SMOKE:
 
     runs-on: ${{ inputs.label }}
     timeout-minutes: ${{ fromJSON(inputs.timeout) }}

From 35a7b12caa4654f6290089142cf450a959ec1ac9 Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi <domenic@neuralmagic.com>
Date: Mon, 13 May 2024 15:34:20 +0000
Subject: [PATCH 3/5] Increase server startup wait time

---
 tests/utils/server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utils/server.py b/tests/utils/server.py
index e055040a144d5..4f1a2accaaa1b 100644
--- a/tests/utils/server.py
+++ b/tests/utils/server.py
@@ -12,7 +12,7 @@
 
 from tests.utils.logging import log_banner
 
-MAX_SERVER_START_WAIT = 600  # time (seconds) to wait for server to start
+MAX_SERVER_START_WAIT = 15 * 60  # time (seconds) to wait for server to start
 
 
 @ray.remote(num_gpus=torch.cuda.device_count())

From 21f189f7a9e7fa6bad3b539f290bfa2e7767bfec Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi <domenic@neuralmagic.com>
Date: Mon, 13 May 2024 16:00:56 +0000
Subject: [PATCH 4/5] Disable Mixtral-8x7B accuracy test (too large for runner)

---
 tests/accuracy/lm-eval-tasks.yaml | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/tests/accuracy/lm-eval-tasks.yaml b/tests/accuracy/lm-eval-tasks.yaml
index 97b420f325495..61b9e14723442 100644
--- a/tests/accuracy/lm-eval-tasks.yaml
+++ b/tests/accuracy/lm-eval-tasks.yaml
@@ -63,12 +63,13 @@
 #       value: 0.5041698256254739
 
 # Mixtral: FP16
-- model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.6550416982562547
-    - name: "exact_match,flexible-extract"
-      value: 0.6603487490523123
-  enable_tensor_parallel: true
+# Disabled: g5.12xlarge runner (4x 24GB A10G GPUs) has insufficient VRAM
+# - model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+#   tasks:
+#   - name: "gsm8k"
+#     metrics:
+#     - name: "exact_match,strict-match"
+#       value: 0.6550416982562547
+#     - name: "exact_match,flexible-extract"
+#       value: 0.6603487490523123
+#   enable_tensor_parallel: true

From 3b85dc96441b88c6770a21db13cda2b97c3df397 Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi <domenic@neuralmagic.com>
Date: Mon, 13 May 2024 16:04:49 +0000
Subject: [PATCH 5/5] Increase benchmarking server startup time limit

---
 neuralmagic/benchmarks/run_benchmark_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py
index 42b81dc56be7c..de6be7a3e0368 100644
--- a/neuralmagic/benchmarks/run_benchmark_serving.py
+++ b/neuralmagic/benchmarks/run_benchmark_serving.py
@@ -37,7 +37,7 @@ def get_tensor_parallel_size(config: NamedTuple) -> int:
     return tensor_parallel_size
 
 
-def is_server_running(host: str, port: int, timeout=300) -> bool:
+def is_server_running(host: str, port: int, timeout=600) -> bool:
 
     def try_connection() -> bool:
         try: