Add lm-eval correctness test (#210)
dbarbuzzi authored May 10, 2024
1 parent 3a31485 commit affd4f4
Showing 21 changed files with 531 additions and 259 deletions.
6 changes: 5 additions & 1 deletion .github/actions/nm-lm-eval-accuracy/action.yml
@@ -12,6 +12,10 @@ runs:
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
@@ -20,7 +24,7 @@ runs:
pip3 install pytest openai==1.3.9
SUCCESS=0
-pytest .github/scripts/test_lm_eval_sweep.py -s -v || SUCCESS=$?
+pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
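
The pytest target added above, tests/accuracy/test_lm_eval_correctness.py, is not included in this excerpt. As a rough sketch only, a correctness test of this kind typically compares lm-eval metrics against tracked baselines within a tolerance; the run_lm_eval() helper and the EXPECTED scores below are invented placeholders, not the repository's actual code:

# Hypothetical sketch -- the real tests/accuracy/test_lm_eval_correctness.py is not shown here.
# run_lm_eval() and EXPECTED are illustrative stand-ins, not the project's actual helpers.
import pytest

EXPECTED = {
    "gsm8k": {"exact_match": 0.65},
    "hellaswag": {"acc_norm": 0.80},
}
RELATIVE_TOLERANCE = 0.05  # allow 5% drift around each baseline score


def run_lm_eval(task: str) -> dict:
    """Placeholder: run lm-eval-harness against the served model and return its metrics."""
    raise NotImplementedError("wire this up to lm-eval-harness and the OpenAI-compatible server")


@pytest.mark.parametrize("task", sorted(EXPECTED))
def test_lm_eval_correctness(task):
    metrics = run_lm_eval(task)
    for metric_name, baseline in EXPECTED[task].items():
        assert metrics[metric_name] == pytest.approx(baseline, rel=RELATIVE_TOLERANCE)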
4 changes: 4 additions & 0 deletions .github/actions/nm-lm-eval-smoke/action.yml
@@ -12,6 +12,10 @@ runs:
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
5 changes: 5 additions & 0 deletions .github/data/nm_benchmark_weekly_configs_list.txt
@@ -0,0 +1,5 @@
neuralmagic/benchmarks/configs/benchmark_serving.json
neuralmagic/benchmarks/configs/benchmark_throughput.json
neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
neuralmagic/benchmarks/configs/benchmark_throughput_prefill.json
neuralmagic/benchmarks/configs/benchmark_remote_push.json
4 changes: 2 additions & 2 deletions .github/scripts/lm_eval_compare_hf_vs_vllm.py
@@ -38,7 +38,7 @@ def print_results(data_to_print: List = None,
def check_passing_score(results_dict: Dict = None,
alpha: float = None) -> bool:
for task in results_dict:
-p_value = task["p_value"]
+p_value = results_dict[task]["p_value"]
if p_value <= alpha:
return False
return True
@@ -120,6 +120,6 @@ def parse_args():
all_res[task1[0]] = {"z": z, "p_value": p_value}
print_results([results_hf["results"], results_vllm["results"]], all_res,
args.alpha)
-if not check_passing_score:
+if not check_passing_score(all_res, args.alpha):
print("Accuracy test failed!")
exit(1)
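
Both hunks above fix genuine bugs: iterating over a dict yields its keys, so the old task["p_value"] indexed a string rather than the per-task result, and a bare function name is always truthy, so the old guard "if not check_passing_score:" could never reach the failure branch. A minimal standalone illustration of the corrected logic, using made-up z and p values:

from typing import Dict


def check_passing_score(results_dict: Dict, alpha: float) -> bool:
    # Iterating a dict yields keys; index back into the dict for each task's stats.
    for task in results_dict:
        if results_dict[task]["p_value"] <= alpha:
            return False
    return True


# Made-up numbers for illustration only.
all_res = {
    "arc_easy": {"z": 0.4, "p_value": 0.69},
    "gsm8k": {"z": 2.1, "p_value": 0.03},
}

# The old code tested the function object itself (always truthy), so the exit path
# never ran. Calling the function performs the actual comparison against alpha:
assert not check_passing_score(all_res, alpha=0.05)                      # gsm8k fails the test
assert check_passing_score({"arc_easy": {"p_value": 0.69}}, alpha=0.05)  # all tasks pass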
223 changes: 0 additions & 223 deletions .github/scripts/test_lm_eval_sweep.py

This file was deleted.

43 changes: 28 additions & 15 deletions .github/workflows/build-test.yml
@@ -4,7 +4,7 @@ on:
workflow_call:
inputs:
wf_category:
description: "categories: REMOTE, NIGHTLY, RELEASE"
description: "categories: REMOTE, NIGHTLY, WEEKLY, RELEASE"
type: string
default: "REMOTE"
python:
@@ -177,17 +177,30 @@ jobs:
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
secrets: inherit

-# TODO: decide if this should build or use the whl
-# single gpu
-# TODO: this should only run if doing a NIGHTLY or RELEASE
-# Accuracy-Smoke-AWS-AVX2-32G-A10G-24G:
-#   if: ${{ inputs.wf_category == 'NIGHTLY' || inputs.wf_category == 'RELEASE' }}
-#   uses: ./.github/workflows/nm-lm-eval-smoke.yml
-#   with:
-#     label: ${{ inputs.test_label_solo }}
-#     timeout: ${{ inputs.benchmark_timeout }}
-#     gitref: ${{ github.ref }}
-#     Gi_per_thread: ${{ inputs.Gi_per_thread }}
-#     nvcc_threads: ${{ inputs.nvcc_threads }}
-#     python: ${{ inputs.python }}
-#     secrets: inherit
+TEST-ACCURACY-SMOKE:
+needs: [BUILD]
+if: inputs.wf_category == 'NIGHTLY'
+uses: ./.github/workflows/nm-lm-eval-smoke.yml
+with:
+label: ${{ inputs.test_label_solo }}
+timeout: ${{ inputs.benchmark_timeout }}
+gitref: ${{ inputs.gitref }}
+Gi_per_thread: ${{ inputs.Gi_per_thread }}
+nvcc_threads: ${{ inputs.nvcc_threads }}
+python: ${{ inputs.python }}
+whl: ${{ needs.BUILD.outputs.whl }}
+secrets: inherit
+
+TEST-ACCURACY-FULL:
+needs: [BUILD]
+if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
+uses: ./.github/workflows/nm-lm-eval-accuracy.yml
+with:
+label: ${{ inputs.test_label_multi }}
+timeout: ${{ inputs.benchmark_timeout }}
+gitref: ${{ inputs.gitref }}
+Gi_per_thread: ${{ inputs.Gi_per_thread }}
+nvcc_threads: ${{ inputs.nvcc_threads }}
+python: ${{ inputs.python }}
+whl: ${{ needs.BUILD.outputs.whl }}
+secrets: inherit
4 changes: 2 additions & 2 deletions .github/workflows/nightly.yml
@@ -3,7 +3,7 @@ run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }}
on:
schedule:
# * is a special character in YAML so you have to quote this string
-- cron: '0 1 * * *'
+- cron: '0 1 * * 1-6' # nightly run (Mon-Sat)

workflow_dispatch:
inputs:
@@ -27,7 +27,7 @@ jobs:
test_label_solo: aws-avx2-32G-a10g-24G
test_label_multi: aws-avx2-192G-4-a10g-96G
test_timeout: 480
-test_skip_list:
+test_skip_list: neuralmagic/tests/skip-for-nightly.txt

benchmark_label: aws-avx2-32G-a10g-24G
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
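
The tightened schedule 0 1 * * 1-6 fires at 01:00 UTC Monday through Saturday, leaving Sunday free, presumably for the new WEEKLY category introduced in build-test.yml. A quick way to sanity-check a cron expression, assuming the third-party croniter package is available:

from datetime import datetime

from croniter import croniter  # third-party: pip install croniter

it = croniter("0 1 * * 1-6", datetime(2024, 5, 10))  # start from Friday, May 10, 2024
for _ in range(7):
    print(it.get_next(datetime).strftime("%a %Y-%m-%d %H:%M"))
# Prints Fri, Sat, then Mon through Fri of the next week -- no Sunday run is scheduled.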
