Benchmarking: Prepare for GHA benchmark UI (#122)
SUMMARY:
- Miscellaneous updates to the benchmarking infrastructure to support the GitHub benchmarking UI
  - Clean up configs: remove default arguments.
- Add a `description` field to the benchmarking scripts so we can communicate intent to the UI.
- Move benchmark_result.py to the logging folder.
- Add a gha_benchmark_logging script that consumes BenchmarkResult JSON and outputs JSON that the GitHub Benchmark UI can understand (see the sketch below).
- Add a minimal_test.json config that can be used for infra testing.
- Make the nightly config list explicit and add the remote-push job to nightly for a fair comparison.
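
For context, a minimal sketch of the kind of conversion gha_benchmark_logging performs, assuming the GHA benchmark UI expects a flat list of name/unit/value records (the github-action-benchmark "custom" format); the BenchmarkResult keys and file name below are illustrative assumptions, not the script's exact schema:

```python
# Illustrative-only sketch; the real script is gha_benchmark_logging in this PR,
# and the BenchmarkResult keys ("description", "metrics", "unit", "value") are
# assumptions, not the exact schema.
import json
from pathlib import Path


def to_gha_benchmark_records(benchmark_result_path: Path) -> list:
    result = json.loads(benchmark_result_path.read_text())
    records = []
    for metric_name, metric in result.get("metrics", {}).items():
        records.append({
            # The new `description` field lets the UI show the benchmark's intent.
            "name": f"{result['description']}: {metric_name}",
            "unit": metric.get("unit", ""),
            "value": metric["value"],
        })
    return records


if __name__ == "__main__":
    print(json.dumps(to_gha_benchmark_records(Path("benchmark_result.json")),
                     indent=2))
```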


TEST PLAN:
Manual testing.
nm-benchmark manual trigger:
https://github.com/neuralmagic/nm-vllm/actions/runs/8284500798
nightly manual trigger:
https://github.com/neuralmagic/nm-vllm/actions/runs/8285535882

---------

Co-authored-by: Varun Sundar Rabindranath <[email protected]>
varun-sundar-rabindranath and Varun Sundar Rabindranath authored Mar 14, 2024
1 parent ac9c9c8 commit feb86cd
Showing 16 changed files with 318 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .github/actions/nm-benchmark/action.yml
@@ -24,7 +24,7 @@ runs:
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
pip3 install -r neuralmagic/benchmarks/requirements-benchmark.txt
SUCCESS=0
.github/workflows/scripts/nm-run-benchmarks.sh ${{ inputs.benchmark_config_list_file }} ${{ inputs.output_directory }} || SUCCESS=$?
.github/scripts/nm-run-benchmarks.sh ${{ inputs.benchmark_config_list_file }} ${{ inputs.output_directory }} || SUCCESS=$?
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
1 change: 1 addition & 0 deletions .github/data/nm_benchmark_configs_minimal_test_list.txt
@@ -0,0 +1 @@
neuralmagic/benchmarks/configs/minimal_test.json
@@ -1,2 +1,3 @@
neuralmagic/benchmarks/configs/benchmark_serving.json
neuralmagic/benchmarks/configs/benchmark_throughput.json
neuralmagic/benchmarks/configs/benchmark_remote_push.json
4 changes: 2 additions & 2 deletions .github/workflows/nightly.yml
@@ -36,7 +36,7 @@ jobs:
uses: ./.github/workflows/nm-benchmark.yml
with:
label: aws-avx2-192G-4-a10g-96G
benchmark_config_list_file: ./.github/data/nm_benchmark_configs_list.txt
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
timeout: 240
gitref: '${{ github.ref }}'
Gi_per_thread: 4
@@ -48,7 +48,7 @@
uses: ./.github/workflows/nm-benchmark.yml
with:
label: aws-avx2-32G-a10g-24G
benchmark_config_list_file: ./.github/data/nm_benchmark_configs_list.txt
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
timeout: 240
gitref: '${{ github.ref }}'
Gi_per_thread: 12
10 changes: 5 additions & 5 deletions neuralmagic/benchmarks/common.py
@@ -26,7 +26,7 @@ def max_model_length_from_model_id(model: str,
return _get_and_verify_max_len(config, max_model_len=None)


def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]:
def script_args_to_cla(config: NamedTuple) -> Iterable[dict]:
#config is a NamedTuple constructed from some JSON in neuralmagic/benchmarks/configs

kv = vars(config.script_args)
@@ -41,17 +41,17 @@ def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]:
if len(v) == 0:
key_args.append(k)

key_args_cla = list(map(lambda k: f"--{k}", key_args))
key_args_cla = {f"{k}": "" for k in key_args}

# Remove empty lists from arg_lists and remove key args from keys
arg_lists = list(filter(lambda arg_list: len(arg_list) != 0, arg_lists))
keys = list(filter(lambda k: k not in key_args, keys))
assert len(keys) == len(arg_lists)

for args in itertools.product(*arg_lists):
cla = key_args_cla
for name, value in zip(keys, args):
cla.extend([f"--{name}", f"{value}"])
args_dict = dict(zip(keys, args))
cla = key_args_cla.copy()
cla.update(args_dict)
yield cla


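To summarize the common.py change above: script_args_to_cla no longer yields flat `--flag value` lists; it now yields one dict per combination of script_args values, with flag-only arguments (empty lists in the config) mapped to empty strings so callers can rebuild the command line. A minimal standalone sketch of that behavior, with a made-up config shape:

```python
# Illustrative-only sketch of the new dict-yielding behavior; the real
# implementation lives in neuralmagic/benchmarks/common.py.
import itertools
from types import SimpleNamespace


def script_args_to_cla_sketch(config):
    """Yield one dict of CLI args per combination of script_args values."""
    kv = vars(config.script_args)
    # Flag-only arguments (empty lists in the config) map to empty strings.
    key_args = {k: "" for k, v in kv.items() if len(v) == 0}
    keys = [k for k, v in kv.items() if len(v) > 0]
    for combo in itertools.product(*(kv[k] for k in keys)):
        cla = key_args.copy()
        cla.update(dict(zip(keys, combo)))
        yield cla


# Hypothetical config shaped like the JSON files in neuralmagic/benchmarks/configs.
config = SimpleNamespace(script_args=SimpleNamespace(**{
    "input-len": [1, 16],
    "output-len": [1],
    "use-all-available-gpus_": [],
}))
for args in script_args_to_cla_sketch(config):
    print(args)
# {'use-all-available-gpus_': '', 'input-len': 1, 'output-len': 1}
# {'use-all-available-gpus_': '', 'input-len': 16, 'output-len': 1}
```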
2 changes: 1 addition & 1 deletion neuralmagic/benchmarks/configs/benchmark_serving.json
@@ -1,7 +1,7 @@
{
"configs": [
{
"description": "Benchmark vllm serving",
"description": "VLLM Serving",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
32 changes: 4 additions & 28 deletions neuralmagic/benchmarks/configs/benchmark_throughput.json
@@ -1,7 +1,7 @@
{
"configs": [
{
"description": "Benchmark vllm engine throughput - with dataset",
"description": "VLLM Engine throughput (with dataset)",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -36,7 +36,7 @@
}
},
{
"description": "Benchmark vllm engine prefill throughput - synthetic",
"description": "VLLM Engine prefill throughput (synthetic)",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -46,9 +46,6 @@
"max_model_lens" : [4096],
"script_name": "benchmark_throughput",
"script_args": {
"backend": [
"vllm"
],
"input-len": [
1,
16,
@@ -62,23 +59,14 @@
"output-len": [
1
],
"n": [
1
],
"num-prompts": [
1
],
"seed": [
0
],
"dtype": [
"auto"
],
"use-all-available-gpus_" : []
}
},
{
"description": "Benchmark vllm engine decode throughput - synthetic",
"description": "VLLM Engine decode throughput (synthetic)",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -88,18 +76,12 @@
"max_model_lens" : [4096],
"script_name": "benchmark_throughput",
"script_args": {
"backend": [
"vllm"
],
"input-len": [
2
],
"output-len": [
128
],
"n": [
1
],
"num-prompts": [
1,
4,
@@ -108,14 +90,8 @@
32,
64
],
"seed": [
0
],
"dtype": [
"auto"
],
"use-all-available-gpus_" : []
}
}
]
}
}
43 changes: 43 additions & 0 deletions neuralmagic/benchmarks/configs/minimal_test.json
@@ -0,0 +1,43 @@
{
"configs": [
{
"description": "Benchmark vllm serving",
"models": [
"mistralai/Mistral-7B-Instruct-v0.2"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": [],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_" : ["5,inf"],
"dataset": [
"sharegpt"
]
}
},
{
"description": "Benchmark vllm engine throughput - with dataset",
"models": [
"mistralai/Mistral-7B-Instruct-v0.2"
],
"max_model_lens" : [4096],
"script_name": "benchmark_throughput",
"script_args": {
"output-len": [
128
],
"num-prompts": [
100
],
"dataset" : [
"sharegpt"
],
"max-model-len" : [4096],
"use-all-available-gpus_" : []
}
}
]
}
17 changes: 16 additions & 1 deletion neuralmagic/benchmarks/run_benchmark_serving.py
@@ -2,6 +2,7 @@
import subprocess
import requests
import time
import json
import itertools

from typing import NamedTuple, Optional
@@ -122,12 +123,26 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None:
" ".join([f"--{k} {v}" for k, v in server_args.items()])

for script_args in script_args_to_cla(config):

description = (f"{config.description}\n" +
f"model - {model}\n" +
f"max-model-len - {max_model_len}\n" +
f"sparsity - {sparsity}\n" +
f"{config.script_name} " +
f"{json.dumps(script_args, indent=2)}")

bench_cmd = (["python3", "-m"
f"{script_path}"] + script_args +
f"{script_path}"] +
["--description", f"{description}"] +
["--model", f"{model}"] +
["--tokenizer", f"{model}"] +
["--port", f"{BENCH_SERVER_PORT}"] +
["--host", f"{BENCH_SERVER_HOST}"])
# Add script args
for k, v in script_args.items():
bench_cmd.append(f"--{k}")
if v != "":
bench_cmd.append(f"{v}")

if output_directory:
bench_cmd += (["--save-directory", f"{output_directory}"] +
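As a usage note on the loop added above, each script_args dict is flattened back into CLI tokens, emitting a bare flag when the value is the empty string; a tiny standalone sketch with made-up values and a hypothetical module name:

```python
# Standalone sketch of the dict -> CLI-flag flattening shown in the diff above
# (values and the module name are made up for illustration).
script_args = {"use-all-available-gpus_": "", "input-len": 16}

bench_cmd = ["python3", "-m", "some_benchmark_module"]
for k, v in script_args.items():
    bench_cmd.append(f"--{k}")
    if v != "":
        bench_cmd.append(f"{v}")

print(bench_cmd)
# ['python3', '-m', 'some_benchmark_module', '--use-all-available-gpus_', '--input-len', '16']
```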
16 changes: 15 additions & 1 deletion neuralmagic/benchmarks/run_benchmark_throughput.py
@@ -1,4 +1,5 @@
import argparse
import json
from pathlib import Path
from typing import NamedTuple, Optional

@@ -29,10 +30,23 @@ def run_benchmark_throughput_script(config: NamedTuple,

for max_model_len in max_model_lens:
for script_args in script_args_to_cla(config):

description = (f"{config.description}\n"
f"model - {model}\n" +
f"max_model_len - {max_model_len}\n" +
f"{config.script_name} " +
f"{json.dumps(script_args, indent=2)}")

bench_cmd = (["python3", "-m", f"{script_path}"] +
script_args + ["--model", f"{model}"] +
["--description", f"{description}"] +
["--model", f"{model}"] +
["--tokenizer", f"{model}"] +
["--max-model-len", f"{max_model_len}"])
# Add script args
for k, v in script_args.items():
bench_cmd.append(f"--{k}")
if v != "":
bench_cmd.append(f"{v}")

if output_directory:
bench_cmd = bench_cmd + [
18 changes: 13 additions & 5 deletions neuralmagic/benchmarks/scripts/benchmark_serving.py
@@ -33,11 +33,11 @@
from vllm.transformers_utils.tokenizer import get_tokenizer
from .common import generate_synthetic_requests, print_serving_request_io
from .datasets_registry import get_dataset, DatasetArgs
from .benchmark_result import (BenchmarkResult,
BenchmarkServingResultMetadataKeys as
ResultMetadataKeys,
BenchmarkServingResultMetricTemplates as
ResultMetricTemplates)
from .logging.benchmark_result import (BenchmarkResult,
BenchmarkServingResultMetadataKeys as
ResultMetadataKeys,
BenchmarkServingResultMetricTemplates as
ResultMetricTemplates)

from neuralmagic.benchmarks.scripts.backend_request_func import (
ASYNC_REQUEST_FUNCS,
@@ -337,6 +337,7 @@ def script_args_as_json_dict(script_args: argparse.Namespace):

current_dt = datetime.now()
result = BenchmarkResult(
description=args.description,
date=current_dt,
script_name=Path(__file__).name,
script_args=script_args_as_json_dict(args),
@@ -382,6 +383,13 @@ def from_str(arg: str):

parser = argparse.ArgumentParser(
description='''Benchmark the online serving throughput.''')
parser.add_argument(
"--description",
type=str,
default="benchmark-serving",
help=
"Benchmark description. This is primarily useful when we log the benchmark results and process them for plotting charts"
)
parser.add_argument(
"--backend",
type=str,
14 changes: 11 additions & 3 deletions neuralmagic/benchmarks/scripts/benchmark_throughput.py
@@ -14,9 +14,9 @@
from transformers import AutoTokenizer
from .common import generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs
from .datasets_registry import get_dataset, DatasetArgs
from .benchmark_result import (BenchmarkResult,
BenchmarkThroughputResultMetricTemplates as
ResultMetricTemplates)
from .logging.benchmark_result import (BenchmarkResult,
BenchmarkThroughputResultMetricTemplates
as ResultMetricTemplates)


def get_tensor_parallel_size(args: argparse.Namespace) -> int:
@@ -145,6 +145,7 @@ def main(args: argparse.Namespace):
current_dt = datetime.now()

result = BenchmarkResult(
description=args.description,
date=current_dt,
script_name=Path(__file__).name,
script_args=vars(args),
@@ -168,6 +169,13 @@

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--description",
type=str,
default="benchmark-throughput",
help=
"Benchmark description. This is primarily useful when we log the benchmark results and process them for plotting charts"
)
parser.add_argument("--backend",
type=str,
choices=["vllm"],
2 changes: 1 addition & 1 deletion neuralmagic/benchmarks/scripts/common.py
@@ -41,7 +41,7 @@ def get_benchmarking_context() -> dict:
"torch_version": f"{torch.__version__}",
"torch_cuda_version": f"{torch.version.cuda}",
"cuda_devices": f"{cuda_devices}",
"cuda_device_names": f"{cuda_device_names}"
"cuda_device_names": cuda_device_names
}


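The common.py hunk drops the f-string around cuda_device_names, presumably so the benchmarking context keeps the actual Python value rather than its string repr; a minimal sketch of the difference once the context is serialized (device names are made up):

```python
# Illustrative sketch: keeping the raw list serializes as a JSON array
# instead of an embedded repr string (example values are made up).
import json

cuda_device_names = ["NVIDIA A10G", "NVIDIA A10G"]

print(json.dumps({"cuda_device_names": f"{cuda_device_names}"}))
# {"cuda_device_names": "['NVIDIA A10G', 'NVIDIA A10G']"}

print(json.dumps({"cuda_device_names": cuda_device_names}))
# {"cuda_device_names": ["NVIDIA A10G", "NVIDIA A10G"]}
```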