From 3a2545670126854a4a685edd889fe68f2fe250c3 Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi <dbarbuzzi@gmail.com>
Date: Tue, 14 May 2024 10:41:26 -0400
Subject: [PATCH] Misc CI/CD updates (#240)

---
 .github/workflows/build-test.yml              |  6 +++---
 ...accuracy.yml => nm-test-accuracy-full.yml} |  4 ++--
 ...l-smoke.yml => nm-test-accuracy-smoke.yml} |  4 ++--
 .../benchmarks/run_benchmark_serving.py       |  2 +-
 tests/accuracy/lm-eval-tasks.yaml             | 19 ++++++++++---------
 tests/utils/server.py                         |  2 +-
 6 files changed, 19 insertions(+), 18 deletions(-)
 rename .github/workflows/{nm-lm-eval-accuracy.yml => nm-test-accuracy-full.yml} (98%)
 rename .github/workflows/{nm-lm-eval-smoke.yml => nm-test-accuracy-smoke.yml} (98%)

diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index 0fee152825309..2100261c0e48a 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -139,7 +139,7 @@ jobs:
 
     TEST-MULTI:
         needs: [BUILD]
-        if: success() && contains(fromJSON('["NIGHTLY", "RELEASE"]'), inputs.wf_category)
+        if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category)
         uses: ./.github/workflows/test.yml
         with:
             test_label: ${{ inputs.test_label_multi }}
@@ -180,7 +180,7 @@ jobs:
     TEST-ACCURACY-SMOKE:
       needs: [BUILD]
       if: inputs.wf_category == 'NIGHTLY'
-      uses: ./.github/workflows/nm-lm-eval-smoke.yml
+      uses: ./.github/workflows/nm-test-accuracy-smoke.yml
       with:
         label: ${{ inputs.test_label_solo }}
         timeout: ${{ inputs.benchmark_timeout }}
@@ -194,7 +194,7 @@ jobs:
     TEST-ACCURACY-FULL:
       needs: [BUILD]
       if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
-      uses: ./.github/workflows/nm-lm-eval-accuracy.yml
+      uses: ./.github/workflows/nm-test-accuracy-full.yml
       with:
         label: ${{ inputs.test_label_multi }}
         timeout: ${{ inputs.benchmark_timeout }}
diff --git a/.github/workflows/nm-lm-eval-accuracy.yml b/.github/workflows/nm-test-accuracy-full.yml
similarity index 98%
rename from .github/workflows/nm-lm-eval-accuracy.yml
rename to .github/workflows/nm-test-accuracy-full.yml
index f1612ad81d0c2..07f736cbdfbfa 100644
--- a/.github/workflows/nm-lm-eval-accuracy.yml
+++ b/.github/workflows/nm-test-accuracy-full.yml
@@ -1,4 +1,4 @@
-name: nm-lm-eval-accuracy
+name: nm-test-accuracy-full
 on:
   # makes workflow reusable
   workflow_call:
@@ -68,7 +68,7 @@ env:
   VENV_BASE: "LM_EVAL"
 
 jobs:
-  LM-EVAL-FULL:
+  TEST-ACCURACY-FULL:
 
     runs-on: ${{ inputs.label }}
     timeout-minutes: ${{ fromJSON(inputs.timeout) }}
diff --git a/.github/workflows/nm-lm-eval-smoke.yml b/.github/workflows/nm-test-accuracy-smoke.yml
similarity index 98%
rename from .github/workflows/nm-lm-eval-smoke.yml
rename to .github/workflows/nm-test-accuracy-smoke.yml
index ddedc1110f796..994a33a1beba9 100644
--- a/.github/workflows/nm-lm-eval-smoke.yml
+++ b/.github/workflows/nm-test-accuracy-smoke.yml
@@ -1,4 +1,4 @@
-name: nm-lm-eval-smoke
+name: nm-test-accuracy-smoke
 on:
   # makes workflow reusable
   workflow_call:
@@ -68,7 +68,7 @@ env:
   VENV_BASE: "LM_EVAL"
 
 jobs:
-  LM-EVAL-SMOKE:
+  TEST-ACCURACY-SMOKE:
 
     runs-on: ${{ inputs.label }}
     timeout-minutes: ${{ fromJSON(inputs.timeout) }}
diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py
index 42b81dc56be7c..de6be7a3e0368 100644
--- a/neuralmagic/benchmarks/run_benchmark_serving.py
+++ b/neuralmagic/benchmarks/run_benchmark_serving.py
@@ -37,7 +37,7 @@ def get_tensor_parallel_size(config: NamedTuple) -> int:
     return tensor_parallel_size
 
 
-def is_server_running(host: str, port: int, timeout=300) -> bool:
+def is_server_running(host: str, port: int, timeout=600) -> bool:
 
     def try_connection() -> bool:
         try:
diff --git a/tests/accuracy/lm-eval-tasks.yaml b/tests/accuracy/lm-eval-tasks.yaml
index 97b420f325495..61b9e14723442 100644
--- a/tests/accuracy/lm-eval-tasks.yaml
+++ b/tests/accuracy/lm-eval-tasks.yaml
@@ -63,12 +63,13 @@
 #       value: 0.5041698256254739
 
 # Mixtral: FP16
-- model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.6550416982562547
-    - name: "exact_match,flexible-extract"
-      value: 0.6603487490523123
-  enable_tensor_parallel: true
+# Disabled: the g5.12xlarge runner (4x 24GB A10G GPUs) has insufficient VRAM for this model
+# - model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+#   tasks:
+#   - name: "gsm8k"
+#     metrics:
+#     - name: "exact_match,strict-match"
+#       value: 0.6550416982562547
+#     - name: "exact_match,flexible-extract"
+#       value: 0.6603487490523123
+#   enable_tensor_parallel: true
diff --git a/tests/utils/server.py b/tests/utils/server.py
index e055040a144d5..4f1a2accaaa1b 100644
--- a/tests/utils/server.py
+++ b/tests/utils/server.py
@@ -12,7 +12,7 @@
 
 from tests.utils.logging import log_banner
 
-MAX_SERVER_START_WAIT = 600  # time (seconds) to wait for server to start
+MAX_SERVER_START_WAIT = 15 * 60  # time (seconds) to wait for server to start
 
 
 @ray.remote(num_gpus=torch.cuda.device_count())