Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Misc CI/CD updates #240

Merged
merged 5 commits
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ jobs:

TEST-MULTI:
needs: [BUILD]
if: success() && contains(fromJSON('["NIGHTLY", "RELEASE"]'), inputs.wf_category)
if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category)
uses: ./.github/workflows/test.yml
with:
test_label: ${{ inputs.test_label_multi }}
Expand Down Expand Up @@ -180,7 +180,7 @@ jobs:
TEST-ACCURACY-SMOKE:
needs: [BUILD]
if: inputs.wf_category == 'NIGHTLY'
uses: ./.github/workflows/nm-lm-eval-smoke.yml
uses: ./.github/workflows/nm-test-accuracy-smoke.yml
with:
label: ${{ inputs.test_label_solo }}
timeout: ${{ inputs.benchmark_timeout }}
Expand All @@ -194,7 +194,7 @@ jobs:
TEST-ACCURACY-FULL:
needs: [BUILD]
if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
uses: ./.github/workflows/nm-lm-eval-accuracy.yml
uses: ./.github/workflows/nm-test-accuracy-full.yml
with:
label: ${{ inputs.test_label_multi }}
timeout: ${{ inputs.benchmark_timeout }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: nm-lm-eval-accuracy
name: nm-test-accuracy-full
on:
# makes workflow reusable
workflow_call:
Expand Down Expand Up @@ -68,7 +68,7 @@ env:
VENV_BASE: "LM_EVAL"

jobs:
LM-EVAL-FULL:
TEST-ACCURACY-FULL:

runs-on: ${{ inputs.label }}
timeout-minutes: ${{ fromJSON(inputs.timeout) }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: nm-lm-eval-smoke
name: nm-test-accuracy-smoke
on:
# makes workflow reusable
workflow_call:
Expand Down Expand Up @@ -68,7 +68,7 @@ env:
VENV_BASE: "LM_EVAL"

jobs:
LM-EVAL-SMOKE:
TEST-ACCURACY-SMOKE:

runs-on: ${{ inputs.label }}
timeout-minutes: ${{ fromJSON(inputs.timeout) }}
Expand Down
2 changes: 1 addition & 1 deletion neuralmagic/benchmarks/run_benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def get_tensor_parallel_size(config: NamedTuple) -> int:
return tensor_parallel_size


def is_server_running(host: str, port: int, timeout=300) -> bool:
def is_server_running(host: str, port: int, timeout=600) -> bool:

def try_connection() -> bool:
try:
Expand Down
19 changes: 10 additions & 9 deletions tests/accuracy/lm-eval-tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,13 @@
# value: 0.5041698256254739

# Mixtral: FP16
- model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.6550416982562547
- name: "exact_match,flexible-extract"
value: 0.6603487490523123
enable_tensor_parallel: true
# g5.12xlarge runner (4x 24GB A10 GPUs) has insufficient VRAM
# - model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
# tasks:
# - name: "gsm8k"
# metrics:
# - name: "exact_match,strict-match"
# value: 0.6550416982562547
# - name: "exact_match,flexible-extract"
# value: 0.6603487490523123
# enable_tensor_parallel: true
2 changes: 1 addition & 1 deletion tests/utils/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from tests.utils.logging import log_banner

MAX_SERVER_START_WAIT = 600 # time (seconds) to wait for server to start
MAX_SERVER_START_WAIT = 15 * 60 # time (seconds) to wait for server to start


@ray.remote(num_gpus=torch.cuda.device_count())
Expand Down
Loading